00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033 #include <ncbi_pch.hpp>
00034 #include <corelib/ncbistd.hpp>
00035 #include <corelib/ncbithr.hpp>
00036 #include <corelib/ncbiutil.hpp>
00037 #include <corelib/ncbiexpt.hpp>
00038
00039 #include <util/static_map.hpp>
00040
00041 #include <serial/iterator.hpp>
00042 #include <serial/objistrasn.hpp>
00043
00044
00045 #include <objects/general/Int_fuzz.hpp>
00046 #include <objects/general/Object_id.hpp>
00047 #include <objects/general/User_object.hpp>
00048 #include <objects/general/User_field.hpp>
00049 #include <objects/general/Dbtag.hpp>
00050
00051 #include <objects/seqloc/Seq_id.hpp>
00052 #include <objects/seqloc/Seq_loc.hpp>
00053 #include <objects/seqloc/Seq_interval.hpp>
00054 #include <objects/seqloc/Seq_point.hpp>
00055
00056 #include <objects/seq/Seq_annot.hpp>
00057 #include <objects/seq/Annotdesc.hpp>
00058 #include <objects/seq/Annot_descr.hpp>
00059 #include <objects/seqfeat/SeqFeatData.hpp>
00060
00061 #include <objects/seqfeat/Seq_feat.hpp>
00062 #include <objects/seqfeat/BioSource.hpp>
00063 #include <objects/seqfeat/Org_ref.hpp>
00064 #include <objects/seqfeat/OrgName.hpp>
00065 #include <objects/seqfeat/SubSource.hpp>
00066 #include <objects/seqfeat/OrgMod.hpp>
00067 #include <objects/seqfeat/Gene_ref.hpp>
00068 #include <objects/seqfeat/Cdregion.hpp>
00069 #include <objects/seqfeat/Code_break.hpp>
00070 #include <objects/seqfeat/Genetic_code.hpp>
00071 #include <objects/seqfeat/Genetic_code_table.hpp>
00072 #include <objects/seqfeat/RNA_ref.hpp>
00073 #include <objects/seqfeat/Trna_ext.hpp>
00074 #include <objects/seqfeat/Imp_feat.hpp>
00075 #include <objects/seqfeat/Gb_qual.hpp>
00076
00077 #include <objtools/readers/reader_exception.hpp>
00078 #include <objtools/readers/rm_reader.hpp>
00079 #include <objtools/error_codes.hpp>
00080
00081 #include <algorithm>
00082
00083
00084 #define NCBI_USE_ERRCODE_X Objtools_Rd_RepMask
00085
00086 BEGIN_NCBI_SCOPE
00087
00088 BEGIN_objects_SCOPE
00089
00090 struct CMaskData;
00091
00092
00093 class CRmOutReader: public CRmReader
00094
00095 {
00096 friend CRmReader* CRmReader::OpenReader( CNcbiIstream& );
00097
00098
00099
00100
00101 protected:
00102 CRmOutReader( CNcbiIstream& );
00103 public:
00104 virtual ~CRmOutReader();
00105
00106
00107
00108
00109 public:
00110 virtual void Read( CRef<CSeq_annot>, TFlags flags = fDefaults);
00111
00112
00113
00114
00115 protected:
00116 virtual bool IsHeaderLine( const string& );
00117 virtual bool IsIgnoredLine( const string& );
00118
00119 virtual bool ParseRecord( const string& record, CMaskData& );
00120 virtual bool VerifyData( const CMaskData& );
00121 virtual bool MakeFeature( const CMaskData&, CRef<CSeq_feat>&, TFlags flags);
00122
00123
00124
00125
00126 protected:
00127 static const unsigned long BUFFERSIZE = 256;
00128 char pReadBuffer[ BUFFERSIZE ];
00129 };
00130
00131
00132 struct CMaskData
00133
00134 {
00135 unsigned long sw_score;
00136 unsigned long outer_pos_begin;
00137 unsigned long outer_pos_end;
00138 double perc_div;
00139 double perc_del;
00140 double perc_ins;
00141 string query_sequence;
00142 string strand;
00143 string matching_repeat;
00144 string repeat_class_family;
00145 };
00146
00147
00148 CRmReader::CRmReader( CNcbiIstream& InStream )
00149 :
00150 m_InStream( InStream )
00151 {
00152 }
00153
00154
00155 CRmReader::~CRmReader()
00156 {
00157 }
00158
00159
00160 CRmOutReader::CRmOutReader( CNcbiIstream& InStream )
00161 :
00162 CRmReader( InStream )
00163 {
00164 }
00165
00166
00167 CRmOutReader::~CRmOutReader()
00168 {
00169 }
00170
00171
00172 void CRmOutReader::Read( CRef<CSeq_annot> entry, TFlags flags )
00173 {
00174 const size_t MAX_ERROR_COUNT = 5;
00175
00176 string line;
00177 CSeq_annot::C_Data::TFtable& ftable = entry->SetData().SetFtable();
00178 CRef<CSeq_feat> feat;
00179
00180 size_t line_counter = 0;
00181 size_t record_counter = 0;
00182 size_t error_counter = 0;
00183
00184 while ( ! m_InStream.eof() ) {
00185
00186 NcbiGetlineEOL( m_InStream, line );
00187 ++line_counter;
00188
00189 if ( IsHeaderLine( line ) || IsIgnoredLine( line ) ) {
00190 continue;
00191 }
00192 ++record_counter;
00193
00194 CMaskData mask_data;
00195 if ( ! ParseRecord( line, mask_data ) ) {
00196 ++error_counter;
00197 LOG_POST_X( 1, Error << "Rmo Reader: Parse error in record "
00198 << record_counter << " (line " << line_counter
00199 << "). Record skipped" );
00200 if ( error_counter < MAX_ERROR_COUNT ) {
00201 continue;
00202 }
00203 else {
00204 break;
00205 }
00206 }
00207
00208 if ( ! VerifyData( mask_data ) ) {
00209 ++error_counter;
00210 LOG_POST_X( 2, Error << "Rmo Reader: Verification error in record "
00211 << record_counter << " (line " << line_counter
00212 << "). Record skipped." );
00213 if ( error_counter < MAX_ERROR_COUNT ) {
00214 continue;
00215 }
00216 else {
00217 break;
00218 }
00219 }
00220
00221 if ( ! MakeFeature( mask_data, feat, flags ) ) {
00222
00223 error_counter = MAX_ERROR_COUNT;
00224 LOG_POST_X( 3, Error << "Rmo Reader: Unable to create feature table for record "
00225 << record_counter << " (line " << line_counter
00226 << "). Aborting file import." );
00227 break;
00228 }
00229
00230 ftable.push_back( feat );
00231 }
00232
00233 if ( error_counter == MAX_ERROR_COUNT ) {
00234 LOG_POST_X( 4, Error << "Rmo Reader: File import aborted due to error count or severity." );
00235 throw 0;
00236 }
00237 }
00238
00239
00240 bool CRmOutReader::IsHeaderLine( const string& line )
00241 {
00242 string labels_1st_line[] = { "SW", "perc", "query", "position", "matching", "" };
00243 string labels_2nd_line[] = { "score", "div.", "del.", "ins.", "sequence", "" };
00244
00245
00246 size_t current_offset = 0;
00247 size_t i = 0;
00248 for ( ; labels_1st_line[i] != ""; ++i ) {
00249 current_offset = NStr::FindCase( line, labels_1st_line[i], current_offset );
00250 if ( NPOS == current_offset ) {
00251 break;
00252 }
00253 }
00254 if ( labels_1st_line[i] == "" ) {
00255 return true;
00256 }
00257
00258
00259 current_offset = 0;
00260 i = 0;
00261 for ( ; labels_2nd_line[i] != ""; ++i ) {
00262 current_offset = NStr::FindCase( line, labels_2nd_line[i], current_offset );
00263 if ( NPOS == current_offset ) {
00264 return false;
00265 }
00266 }
00267 return true;
00268 }
00269
00270
00271 bool CRmOutReader::IsIgnoredLine( const string& line )
00272 {
00273
00274
00275
00276 return ( NStr::TruncateSpaces( line ).length() == 0 );
00277 }
00278
00279
00280 bool CRmOutReader::ParseRecord( const string& record, CMaskData& mask_data )
00281 {
00282 const size_t MIN_VALUE_COUNT = 15;
00283
00284 string line = NStr::TruncateSpaces( record );
00285 list< string > values;
00286 if ( NStr::Split( line, " \t", values ).size() < MIN_VALUE_COUNT ) {
00287 return false;
00288 }
00289
00290 try {
00291
00292 list<string>::iterator it = values.begin();
00293 mask_data.sw_score = NStr::StringToUInt( *it );
00294
00295
00296 ++it;
00297 mask_data.perc_div = NStr::StringToDouble( *it );
00298
00299
00300 ++it;
00301 mask_data.perc_del = NStr::StringToDouble( *it );
00302
00303
00304 ++it;
00305 mask_data.perc_ins = NStr::StringToDouble( *it );
00306
00307
00308 ++it;
00309 mask_data.query_sequence = *it;
00310
00311
00312 ++it;
00313 mask_data.outer_pos_begin = NStr::StringToUInt( *it );
00314
00315
00316 ++it;
00317 mask_data.outer_pos_end = NStr::StringToUInt( *it );
00318
00319
00320 ++it;
00321
00322
00323
00324 ++it;
00325 mask_data.strand = *it;
00326
00327
00328 ++it;
00329 mask_data.matching_repeat = *it;
00330
00331
00332 ++it;
00333 mask_data.repeat_class_family = *it;
00334
00335
00336 ++it;
00337
00338
00339
00340 ++it;
00341
00342
00343
00344 ++it;
00345
00346
00347
00348 ++it;
00349
00350
00351 }
00352 catch( ... ) {
00353 return false;
00354 }
00355
00356 return true;
00357 }
00358
00359
00360 bool CRmOutReader::VerifyData( const CMaskData& mask_data )
00361 {
00362
00363
00364
00365
00366 return true;
00367 }
00368
00369
00370 bool CRmOutReader::MakeFeature( const CMaskData& mask_data, CRef<CSeq_feat>& feat,
00371 TFlags flags )
00372 {
00373 feat.Reset( new CSeq_feat );
00374 feat->ResetLocation();
00375
00376
00377 CSeqFeatData& sfdata = feat->SetData();
00378 CImp_feat_Base& imp = sfdata.SetImp();
00379 imp.SetKey( "repeat_region" );
00380
00381
00382 CRef<CSeq_loc> location( new CSeq_loc );
00383 CSeq_interval& interval = location->SetInt();
00384 interval.SetFrom( min( mask_data.outer_pos_begin, mask_data.outer_pos_end ) -1 );
00385 interval.SetTo( max( mask_data.outer_pos_begin, mask_data.outer_pos_end ) -1 );
00386 interval.SetStrand( strcmp( mask_data.strand.c_str(), "C" ) ?
00387 eNa_strand_plus : eNa_strand_minus );
00388
00389 CBioseq::TId ids;
00390 CSeq_id::ParseFastaIds(ids, mask_data.query_sequence);
00391 location->SetId(*FindBestChoice(ids, CSeq_id::Score));
00392
00393 feat->SetLocation( *location );
00394
00395
00396 if (flags) {
00397 CSeq_feat::TQual& qual_list = feat->SetQual();
00398
00399 if (flags & fIncludeRepeatName) {
00400 CRef<CGb_qual> repeat( new CGb_qual );
00401 repeat->SetQual( "repeat_region" );
00402 repeat->SetVal( mask_data.matching_repeat );
00403 qual_list.push_back( repeat );
00404 }
00405
00406 if (flags & fIncludeRepeatClass) {
00407 CRef<CGb_qual> rpt_family( new CGb_qual );
00408 rpt_family->SetQual( "rpt_family" );
00409 rpt_family->SetVal( mask_data.repeat_class_family );
00410 qual_list.push_back( rpt_family );
00411 }
00412
00413 if (flags & fIncludeStatistics) {
00414 CRef<CGb_qual> sw_score( new CGb_qual );
00415 sw_score->SetQual( "sw_score" );
00416 sw_score->SetVal( NStr::IntToString( mask_data.sw_score ) );
00417 qual_list.push_back( sw_score );
00418
00419 CRef<CGb_qual> perc_div( new CGb_qual );
00420 perc_div->SetQual( "perc_div" );
00421 perc_div->SetVal( NStr::DoubleToString( mask_data.perc_div ) );
00422 qual_list.push_back( perc_div );
00423
00424 CRef<CGb_qual> perc_del( new CGb_qual );
00425 perc_del->SetQual( "perc_del" );
00426 perc_del->SetVal( NStr::DoubleToString( mask_data.perc_del ) );
00427 qual_list.push_back( perc_del );
00428
00429 CRef<CGb_qual> perc_ins( new CGb_qual );
00430 perc_ins->SetQual( "perc_ins" );
00431 perc_ins->SetVal( NStr::DoubleToString( mask_data.perc_ins ) );
00432 qual_list.push_back( perc_ins );
00433 }
00434 }
00435
00436 return true;
00437 }
00438
00439
00440 CRmReader* CRmReader::OpenReader( CNcbiIstream& InStream )
00441 {
00442
00443
00444
00445
00446
00447
00448
00449
00450
00451
00452
00453
00454
00455
00456
00457 return new CRmOutReader( InStream );
00458 }
00459
00460
00461 void CRmReader::CloseReader( CRmReader* pReader )
00462 {
00463 delete pReader;
00464 }
00465
00466
00467 END_objects_SCOPE
00468 END_NCBI_SCOPE
00469
00470