NCBI Home IEB Home C++ Toolkit docs C Toolkit source browser C Toolkit source browser (2) |
NCBI C++ Toolkit Cross ReferenceC++/src/objtools/readers/rm_reader.cpp |
source navigation diff markup identifier search freetext search file search |
1 /* $Id: rm_reader.cpp 112468 2007-10-18 15:39:21Z ivanovp $ 2 * =========================================================================== 3 * 4 * PUBLIC DOMAIN NOTICE 5 * National Center for Biotechnology Information 6 * 7 * This software/database is a "United States Government Work" under the 8 * terms of the United States Copyright Act. It was written as part of 9 * the author's official duties as a United States Government employee and 10 * thus cannot be copyrighted. This software/database is freely available 11 * to the public for use. The National Library of Medicine and the U.S. 12 * Government have not placed any restriction on its use or reproduction. 13 * 14 * Although all reasonable efforts have been taken to ensure the accuracy 15 * and reliability of the software and data, the NLM and the U.S. 16 * Government do not and cannot warrant the performance or results that 17 * may be obtained by using this software or data. The NLM and the U.S. 18 * Government disclaim all warranties, express or implied, including 19 * warranties of performance, merchantability or fitness for any particular 20 * purpose. 21 * 22 * Please cite the author in any work or product based on this material. 23 * 24 * =========================================================================== 25 * 26 * Author: Frank Ludwig 27 * 28 * File Description: 29 * Repeat Masker file reader 30 * 31 */ 32 33 #include <ncbi_pch.hpp> 34 #include <corelib/ncbistd.hpp> 35 #include <corelib/ncbithr.hpp> 36 #include <corelib/ncbiutil.hpp> 37 #include <corelib/ncbiexpt.hpp> 38 39 #include <util/static_map.hpp> 40 41 #include <serial/iterator.hpp> 42 #include <serial/objistrasn.hpp> 43 44 // Objects includes 45 #include <objects/general/Int_fuzz.hpp> 46 #include <objects/general/Object_id.hpp> 47 #include <objects/general/User_object.hpp> 48 #include <objects/general/User_field.hpp> 49 #include <objects/general/Dbtag.hpp> 50 51 #include <objects/seqloc/Seq_id.hpp> 52 #include <objects/seqloc/Seq_loc.hpp> 53 #include <objects/seqloc/Seq_interval.hpp> 54 #include <objects/seqloc/Seq_point.hpp> 55 56 #include <objects/seq/Seq_annot.hpp> 57 #include <objects/seq/Annotdesc.hpp> 58 #include <objects/seq/Annot_descr.hpp> 59 #include <objects/seqfeat/SeqFeatData.hpp> 60 61 #include <objects/seqfeat/Seq_feat.hpp> 62 #include <objects/seqfeat/BioSource.hpp> 63 #include <objects/seqfeat/Org_ref.hpp> 64 #include <objects/seqfeat/OrgName.hpp> 65 #include <objects/seqfeat/SubSource.hpp> 66 #include <objects/seqfeat/OrgMod.hpp> 67 #include <objects/seqfeat/Gene_ref.hpp> 68 #include <objects/seqfeat/Cdregion.hpp> 69 #include <objects/seqfeat/Code_break.hpp> 70 #include <objects/seqfeat/Genetic_code.hpp> 71 #include <objects/seqfeat/Genetic_code_table.hpp> 72 #include <objects/seqfeat/RNA_ref.hpp> 73 #include <objects/seqfeat/Trna_ext.hpp> 74 #include <objects/seqfeat/Imp_feat.hpp> 75 #include <objects/seqfeat/Gb_qual.hpp> 76 77 #include <objtools/readers/reader_exception.hpp> 78 #include <objtools/readers/rm_reader.hpp> 79 #include <objtools/error_codes.hpp> 80 81 #include <algorithm> 82 83 84 #define NCBI_USE_ERRCODE_X Objtools_Rd_RepMask 85 86 BEGIN_NCBI_SCOPE 87 88 BEGIN_objects_SCOPE // namespace ncbi::objects:: 89 90 struct CMaskData; 91 92 //----------------------------------------------------------------------------- 93 class CRmOutReader: public CRmReader 94 //----------------------------------------------------------------------------- 95 { 96 friend CRmReader* CRmReader::OpenReader( CNcbiIstream& ); 97 98 // 99 // object management: 100 // 101 protected: 102 CRmOutReader( CNcbiIstream& ); 103 public: 104 virtual ~CRmOutReader(); 105 106 // 107 // interface: 108 // 109 public: 110 virtual void Read( CRef<CSeq_annot>, TFlags flags = fDefaults); 111 112 // 113 // internal helpers: 114 // 115 protected: 116 virtual bool IsHeaderLine( const string& ); 117 virtual bool IsIgnoredLine( const string& ); 118 119 virtual bool ParseRecord( const string& record, CMaskData& ); 120 virtual bool VerifyData( const CMaskData& ); 121 virtual bool MakeFeature( const CMaskData&, CRef<CSeq_feat>&, TFlags flags); 122 123 // 124 // data: 125 // 126 protected: 127 static const unsigned long BUFFERSIZE = 256; 128 char pReadBuffer[ BUFFERSIZE ]; 129 }; 130 131 //----------------------------------------------------------------------------- 132 struct CMaskData 133 //----------------------------------------------------------------------------- 134 { 135 unsigned long sw_score; 136 unsigned long outer_pos_begin; 137 unsigned long outer_pos_end; 138 double perc_div; 139 double perc_del; 140 double perc_ins; 141 string query_sequence; 142 string strand; 143 string matching_repeat; 144 string repeat_class_family; 145 }; 146 147 148 CRmReader::CRmReader( CNcbiIstream& InStream ) 149 : 150 m_InStream( InStream ) 151 { 152 } 153 154 155 CRmReader::~CRmReader() 156 { 157 } 158 159 160 CRmOutReader::CRmOutReader( CNcbiIstream& InStream ) 161 : 162 CRmReader( InStream ) 163 { 164 } 165 166 167 CRmOutReader::~CRmOutReader() 168 { 169 } 170 171 172 void CRmOutReader::Read( CRef<CSeq_annot> entry, TFlags flags ) 173 { 174 const size_t MAX_ERROR_COUNT = 5; 175 176 string line; 177 CSeq_annot::C_Data::TFtable& ftable = entry->SetData().SetFtable(); 178 CRef<CSeq_feat> feat; 179 180 size_t line_counter = 0; 181 size_t record_counter = 0; 182 size_t error_counter = 0; 183 184 while ( ! m_InStream.eof() ) { 185 186 NcbiGetlineEOL( m_InStream, line ); 187 ++line_counter; 188 189 if ( IsHeaderLine( line ) || IsIgnoredLine( line ) ) { 190 continue; 191 } 192 ++record_counter; 193 194 CMaskData mask_data; 195 if ( ! ParseRecord( line, mask_data ) ) { 196 ++error_counter; 197 LOG_POST_X( 1, Error << "Rmo Reader: Parse error in record " 198 << record_counter << " (line " << line_counter 199 << "). Record skipped" ); 200 if ( error_counter < MAX_ERROR_COUNT ) { 201 continue; 202 } 203 else { 204 break; 205 } 206 } 207 208 if ( ! VerifyData( mask_data ) ) { 209 ++error_counter; 210 LOG_POST_X( 2, Error << "Rmo Reader: Verification error in record " 211 << record_counter << " (line " << line_counter 212 << "). Record skipped." ); 213 if ( error_counter < MAX_ERROR_COUNT ) { 214 continue; 215 } 216 else { 217 break; 218 } 219 } 220 221 if ( ! MakeFeature( mask_data, feat, flags ) ) { 222 // we don't tolerate even a few errors here! 223 error_counter = MAX_ERROR_COUNT; 224 LOG_POST_X( 3, Error << "Rmo Reader: Unable to create feature table for record " 225 << record_counter << " (line " << line_counter 226 << "). Aborting file import." ); 227 break; 228 } 229 230 ftable.push_back( feat ); 231 } 232 233 if ( error_counter == MAX_ERROR_COUNT ) { 234 LOG_POST_X( 4, Error << "Rmo Reader: File import aborted due to error count or severity." ); 235 throw 0; // upper layer catches everything in sight and reports error to file_loader. 236 } 237 } 238 239 240 bool CRmOutReader::IsHeaderLine( const string& line ) 241 { 242 string labels_1st_line[] = { "SW", "perc", "query", "position", "matching", "" }; 243 string labels_2nd_line[] = { "score", "div.", "del.", "ins.", "sequence", "" }; 244 245 // try to identify 1st line of column labels: 246 size_t current_offset = 0; 247 size_t i = 0; 248 for ( ; labels_1st_line[i] != ""; ++i ) { 249 current_offset = NStr::FindCase( line, labels_1st_line[i], current_offset ); 250 if ( NPOS == current_offset ) { 251 break; 252 } 253 } 254 if ( labels_1st_line[i] == "" ) { 255 return true; 256 } 257 258 // try to identify 2nd line of column labels: 259 current_offset = 0; 260 i = 0; 261 for ( ; labels_2nd_line[i] != ""; ++i ) { 262 current_offset = NStr::FindCase( line, labels_2nd_line[i], current_offset ); 263 if ( NPOS == current_offset ) { 264 return false; 265 } 266 } 267 return true; 268 } 269 270 271 bool CRmOutReader::IsIgnoredLine( const string& line ) 272 { 273 // 274 // Currently, only lines with only whitespace on them are ignored. 275 // 276 return ( NStr::TruncateSpaces( line ).length() == 0 ); 277 } 278 279 280 bool CRmOutReader::ParseRecord( const string& record, CMaskData& mask_data ) 281 { 282 const size_t MIN_VALUE_COUNT = 15; 283 284 string line = NStr::TruncateSpaces( record ); 285 list< string > values; 286 if ( NStr::Split( line, " \t", values ).size() < MIN_VALUE_COUNT ) { 287 return false; 288 } 289 290 try { 291 // 1: "SW score" 292 list<string>::iterator it = values.begin(); 293 mask_data.sw_score = NStr::StringToUInt( *it ); 294 295 // 2: "perc div." 296 ++it; 297 mask_data.perc_div = NStr::StringToDouble( *it ); 298 299 // 3: "perc del." 300 ++it; 301 mask_data.perc_del = NStr::StringToDouble( *it ); 302 303 // 4: "perc ins." 304 ++it; 305 mask_data.perc_ins = NStr::StringToDouble( *it ); 306 307 // 5: "query sequence" 308 ++it; 309 mask_data.query_sequence = *it; 310 311 // 6: "position begin" 312 ++it; 313 mask_data.outer_pos_begin = NStr::StringToUInt( *it ); 314 315 // 7: "in end" 316 ++it; 317 mask_data.outer_pos_end = NStr::StringToUInt( *it ); 318 319 // 8: "query (left)" 320 ++it; 321 /* not used */ 322 323 // 9: "" (meaning "strand") 324 ++it; 325 mask_data.strand = *it; 326 327 // 10: "matching repeat" 328 ++it; 329 mask_data.matching_repeat = *it; 330 331 // 11: "repeat class/family" 332 ++it; 333 mask_data.repeat_class_family = *it; 334 335 // 12: "position in" 336 ++it; 337 /* not used */ 338 339 // 13: "in end" 340 ++it; 341 /* not used */ 342 343 // 14: "repeat left" 344 ++it; 345 /* not used */ 346 347 // 15: "ID" 348 ++it; 349 /* not used */ 350 351 } 352 catch( ... ) { 353 return false; 354 } 355 356 return true; 357 } 358 359 360 bool CRmOutReader::VerifyData( const CMaskData& mask_data ) 361 { 362 // 363 // This would be the place for any higher level checks of the mask data 364 // collected from the record ... 365 // 366 return true; 367 } 368 369 370 bool CRmOutReader::MakeFeature( const CMaskData& mask_data, CRef<CSeq_feat>& feat, 371 TFlags flags ) 372 { 373 feat.Reset( new CSeq_feat ); 374 feat->ResetLocation(); 375 376 // data: 377 CSeqFeatData& sfdata = feat->SetData(); 378 CImp_feat_Base& imp = sfdata.SetImp(); 379 imp.SetKey( "repeat_region" ); 380 381 // location: 382 CRef<CSeq_loc> location( new CSeq_loc ); 383 CSeq_interval& interval = location->SetInt(); 384 interval.SetFrom( min( mask_data.outer_pos_begin, mask_data.outer_pos_end ) -1 ); 385 interval.SetTo( max( mask_data.outer_pos_begin, mask_data.outer_pos_end ) -1 ); 386 interval.SetStrand( strcmp( mask_data.strand.c_str(), "C" ) ? 387 eNa_strand_plus : eNa_strand_minus ); 388 389 CBioseq::TId ids; 390 CSeq_id::ParseFastaIds(ids, mask_data.query_sequence); 391 location->SetId(*FindBestChoice(ids, CSeq_id::Score)); 392 393 feat->SetLocation( *location ); 394 395 // qualifiers: 396 if (flags) { 397 CSeq_feat::TQual& qual_list = feat->SetQual(); 398 399 if (flags & fIncludeRepeatName) { 400 CRef<CGb_qual> repeat( new CGb_qual ); 401 repeat->SetQual( "repeat_region" ); 402 repeat->SetVal( mask_data.matching_repeat ); 403 qual_list.push_back( repeat ); 404 } 405 406 if (flags & fIncludeRepeatClass) { 407 CRef<CGb_qual> rpt_family( new CGb_qual ); 408 rpt_family->SetQual( "rpt_family" ); 409 rpt_family->SetVal( mask_data.repeat_class_family ); 410 qual_list.push_back( rpt_family ); 411 } 412 413 if (flags & fIncludeStatistics) { 414 CRef<CGb_qual> sw_score( new CGb_qual ); 415 sw_score->SetQual( "sw_score" ); 416 sw_score->SetVal( NStr::IntToString( mask_data.sw_score ) ); 417 qual_list.push_back( sw_score ); 418 419 CRef<CGb_qual> perc_div( new CGb_qual ); 420 perc_div->SetQual( "perc_div" ); 421 perc_div->SetVal( NStr::DoubleToString( mask_data.perc_div ) ); 422 qual_list.push_back( perc_div ); 423 424 CRef<CGb_qual> perc_del( new CGb_qual ); 425 perc_del->SetQual( "perc_del" ); 426 perc_del->SetVal( NStr::DoubleToString( mask_data.perc_del ) ); 427 qual_list.push_back( perc_del ); 428 429 CRef<CGb_qual> perc_ins( new CGb_qual ); 430 perc_ins->SetQual( "perc_ins" ); 431 perc_ins->SetVal( NStr::DoubleToString( mask_data.perc_ins ) ); 432 qual_list.push_back( perc_ins ); 433 } 434 } 435 436 return true; 437 } 438 439 440 CRmReader* CRmReader::OpenReader( CNcbiIstream& InStream ) 441 { 442 // 443 // This is the point to make sure we are dealing with the right file type and 444 // to allocate the specialist reader for any subtype (OUT, HTML) we encouter. 445 // When this function returns the file pointer should be past the file header 446 // and at the beginning of the actual mask data. 447 // 448 // Note: 449 // If something goes wrong during header processing then the file pointer will 450 // still be modified. It's the caller's job to restore the file pointer if this 451 // is possible for this type of stream. 452 // 453 454 // 455 // 2006-03-31: Only supported file type at this time: ReadMasker OUT. 456 // 457 return new CRmOutReader( InStream ); 458 } 459 460 461 void CRmReader::CloseReader( CRmReader* pReader ) 462 { 463 delete pReader; 464 } 465 466 467 END_objects_SCOPE 468 END_NCBI_SCOPE 469
This page was automatically generated by the
LXR engine.
Visit the LXR main site for more information. |