NCBI C++ Toolkit Cross Reference

C++/src/objtools/readers/rm_reader.cpp


  1 /*  $Id: rm_reader.cpp 112468 2007-10-18 15:39:21Z ivanovp $
  2  * ===========================================================================
  3  *
  4  *                            PUBLIC DOMAIN NOTICE
  5  *               National Center for Biotechnology Information
  6  *
  7  *  This software/database is a "United States Government Work" under the
  8  *  terms of the United States Copyright Act.  It was written as part of
  9  *  the author's official duties as a United States Government employee and
 10  *  thus cannot be copyrighted.  This software/database is freely available
 11  *  to the public for use. The National Library of Medicine and the U.S.
 12  *  Government have not placed any restriction on its use or reproduction.
 13  *
 14  *  Although all reasonable efforts have been taken to ensure the accuracy
 15  *  and reliability of the software and data, the NLM and the U.S.
 16  *  Government do not and cannot warrant the performance or results that
 17  *  may be obtained by using this software or data. The NLM and the U.S.
 18  *  Government disclaim all warranties, express or implied, including
 19  *  warranties of performance, merchantability or fitness for any particular
 20  *  purpose.
 21  *
 22  *  Please cite the author in any work or product based on this material.
 23  *
 24  * ===========================================================================
 25  *
 26  * Author:  Frank Ludwig
 27  *
 28  * File Description:
 29  *   Repeat Masker file reader
 30  *
 31  */
 32 
 33 #include <ncbi_pch.hpp>
 34 #include <corelib/ncbistd.hpp>
 35 #include <corelib/ncbithr.hpp>
 36 #include <corelib/ncbiutil.hpp>
 37 #include <corelib/ncbiexpt.hpp>
 38 
 39 #include <util/static_map.hpp>
 40 
 41 #include <serial/iterator.hpp>
 42 #include <serial/objistrasn.hpp>
 43 
 44 // Objects includes
 45 #include <objects/general/Int_fuzz.hpp>
 46 #include <objects/general/Object_id.hpp>
 47 #include <objects/general/User_object.hpp>
 48 #include <objects/general/User_field.hpp>
 49 #include <objects/general/Dbtag.hpp>
 50 
 51 #include <objects/seqloc/Seq_id.hpp>
 52 #include <objects/seqloc/Seq_loc.hpp>
 53 #include <objects/seqloc/Seq_interval.hpp>
 54 #include <objects/seqloc/Seq_point.hpp>
 55 
 56 #include <objects/seq/Seq_annot.hpp>
 57 #include <objects/seq/Annotdesc.hpp>
 58 #include <objects/seq/Annot_descr.hpp>
 59 #include <objects/seqfeat/SeqFeatData.hpp>
 60 
 61 #include <objects/seqfeat/Seq_feat.hpp>
 62 #include <objects/seqfeat/BioSource.hpp>
 63 #include <objects/seqfeat/Org_ref.hpp>
 64 #include <objects/seqfeat/OrgName.hpp>
 65 #include <objects/seqfeat/SubSource.hpp>
 66 #include <objects/seqfeat/OrgMod.hpp>
 67 #include <objects/seqfeat/Gene_ref.hpp>
 68 #include <objects/seqfeat/Cdregion.hpp>
 69 #include <objects/seqfeat/Code_break.hpp>
 70 #include <objects/seqfeat/Genetic_code.hpp>
 71 #include <objects/seqfeat/Genetic_code_table.hpp>
 72 #include <objects/seqfeat/RNA_ref.hpp>
 73 #include <objects/seqfeat/Trna_ext.hpp>
 74 #include <objects/seqfeat/Imp_feat.hpp>
 75 #include <objects/seqfeat/Gb_qual.hpp>
 76 
 77 #include <objtools/readers/reader_exception.hpp>
 78 #include <objtools/readers/rm_reader.hpp>
 79 #include <objtools/error_codes.hpp>
 80 
 81 #include <algorithm>
 82 
 83 
 84 #define NCBI_USE_ERRCODE_X   Objtools_Rd_RepMask
 85 
 86 BEGIN_NCBI_SCOPE
 87 
 88 BEGIN_objects_SCOPE // namespace ncbi::objects::
 89 
 90 struct CMaskData;
 91 
 92 //-----------------------------------------------------------------------------
 93 class CRmOutReader: public CRmReader
 94 //-----------------------------------------------------------------------------
 95 {
 96     friend CRmReader* CRmReader::OpenReader( CNcbiIstream& );
 97     
 98     //
 99     //  object management:
100     //
101 protected:
102     CRmOutReader( CNcbiIstream& );
103 public:
104     virtual ~CRmOutReader();
105     
106     //
107     //  interface:
108     //
109 public:
110     virtual void Read( CRef<CSeq_annot>, TFlags flags = fDefaults);
111 
112     //
113     //  internal helpers:
114     //
115 protected:
116     virtual bool IsHeaderLine( const string& );
117     virtual bool IsIgnoredLine( const string& ); 
118     
119     virtual bool ParseRecord( const string& record, CMaskData& );
120     virtual bool VerifyData( const CMaskData& );
121     virtual bool MakeFeature( const CMaskData&, CRef<CSeq_feat>&, TFlags flags);
122     
123     //
124     //  data:
125     //
126 protected:
127     static const unsigned long BUFFERSIZE = 256;
128     char pReadBuffer[ BUFFERSIZE ];
129 };
130 
131 //-----------------------------------------------------------------------------
132 struct CMaskData
133 //-----------------------------------------------------------------------------
134 {
135     unsigned long sw_score;
136     unsigned long outer_pos_begin;
137     unsigned long outer_pos_end;
138     double perc_div;
139     double perc_del;
140     double perc_ins;
141     string query_sequence;
142     string strand;
143     string matching_repeat;
144     string repeat_class_family;
145 };
146     
147 
148 CRmReader::CRmReader( CNcbiIstream& InStream )
149     :
150     m_InStream( InStream )
151 {
152 }
153 
154 
155 CRmReader::~CRmReader()
156 {
157 }
158 
159 
160 CRmOutReader::CRmOutReader( CNcbiIstream& InStream )
161     :
162     CRmReader( InStream )
163 {
164 }
165 
166 
167 CRmOutReader::~CRmOutReader()
168 {
169 }
170 
171 
172 void CRmOutReader::Read( CRef<CSeq_annot> entry, TFlags flags )
173 {
174     const size_t MAX_ERROR_COUNT = 5;
175     
176     string line;
177     CSeq_annot::C_Data::TFtable& ftable = entry->SetData().SetFtable();
178     CRef<CSeq_feat> feat;
179     
180     size_t line_counter = 0;
181     size_t record_counter = 0;
182     size_t error_counter = 0;
183     
184     while ( ! m_InStream.eof() ) {
185 
186         NcbiGetlineEOL( m_InStream, line );
187         ++line_counter;
188         
189         if ( IsHeaderLine( line ) || IsIgnoredLine( line ) ) {
190             continue;
191         }
192         ++record_counter;
193         
194         CMaskData mask_data;
195         if ( ! ParseRecord( line, mask_data ) ) {
196             ++error_counter;
197             LOG_POST_X( 1, Error << "Rmo Reader: Parse error in record " 
198                 << record_counter << " (line " << line_counter 
199                 << "). Record skipped" );
200             if ( error_counter < MAX_ERROR_COUNT ) {
201                 continue;
202             }
203             else {
204                 break;
205             }
206         }
207         
208         if ( ! VerifyData( mask_data ) ) {
209             ++error_counter;
210             LOG_POST_X( 2, Error << "Rmo Reader: Verification error in record " 
211                 << record_counter << " (line " << line_counter 
212                 << "). Record skipped." );
213             if ( error_counter < MAX_ERROR_COUNT ) {
214                 continue;
215             }
216             else {
217                 break;
218             }
219         }
220         
221         if ( ! MakeFeature( mask_data, feat, flags ) ) {
222             // we don't tolerate even a few errors here!
223             error_counter = MAX_ERROR_COUNT;
224             LOG_POST_X( 3, Error << "Rmo Reader: Unable to create feature table for record " 
225                 << record_counter << " (line " << line_counter 
226                 << "). Aborting file import." );
227             break;
228         }
229         
230         ftable.push_back( feat );
231     }
232     
233     if ( error_counter == MAX_ERROR_COUNT ) {
234         LOG_POST_X( 4, Error << "Rmo Reader: File import aborted due to error count or severity." );
235         throw 0; // upper layer catches everything in sight and reports error to file_loader.
236     }
237 }
238 
239 
240 bool CRmOutReader::IsHeaderLine( const string& line )
241 {
242     string labels_1st_line[] = { "SW", "perc", "query", "position", "matching", "" };
243     string labels_2nd_line[] = { "score", "div.", "del.", "ins.", "sequence", "" };
244 
245     // try to identify 1st line of column labels:
246     size_t current_offset = 0;
247     size_t i = 0;
248     for ( ; labels_1st_line[i] != ""; ++i ) {
249         current_offset = NStr::FindCase( line, labels_1st_line[i], current_offset );
250         if ( NPOS == current_offset ) {
251             break;
252         }
253     }
254     if ( labels_1st_line[i] == "" ) {
255         return true;
256     }
257     
258     // try to identify 2nd line of column labels:
259     current_offset = 0;
260     i = 0;
261     for ( ; labels_2nd_line[i] != ""; ++i ) {
262         current_offset = NStr::FindCase( line, labels_2nd_line[i], current_offset );
263         if ( NPOS == current_offset ) {
264             return false;
265         }
266     }
267     return true;
268 }
269 
270 
271 bool CRmOutReader::IsIgnoredLine( const string& line )
272 {
273     //
274     //  Currently, only lines with only whitespace on them are ignored.
275     //
276     return ( NStr::TruncateSpaces( line ).length() == 0 );
277 }
278 
279 
280 bool CRmOutReader::ParseRecord( const string& record, CMaskData& mask_data )
281 {
282     const size_t MIN_VALUE_COUNT = 15;
283     
284     string line = NStr::TruncateSpaces( record );
285     list< string > values;
286     if ( NStr::Split( line, " \t", values ).size() < MIN_VALUE_COUNT ) {
287         return false;
288     }
289     
290     try {
291         // 1: "SW score"
292         list<string>::iterator it = values.begin();
293         mask_data.sw_score = NStr::StringToUInt( *it );
294         
295         // 2: "perc div."
296         ++it;
297         mask_data.perc_div = NStr::StringToDouble( *it );
298         
299         // 3: "perc del."
300         ++it;
301         mask_data.perc_del = NStr::StringToDouble( *it );
302         
303         // 4: "perc ins."
304         ++it;
305         mask_data.perc_ins = NStr::StringToDouble( *it );
306         
307         // 5: "query sequence"
308         ++it;
309         mask_data.query_sequence = *it;
310         
311         // 6: "position begin"
312         ++it;
313         mask_data.outer_pos_begin = NStr::StringToUInt( *it );
314         
315         // 7: "in end"
316         ++it;
317         mask_data.outer_pos_end = NStr::StringToUInt( *it );
318         
319         // 8: "query (left)"
320         ++it;
321         /* not used */
322         
323         // 9: "" (meaning "strand")
324         ++it;
325         mask_data.strand = *it;
326         
327         // 10: "matching repeat"
328         ++it;
329         mask_data.matching_repeat = *it;
330         
331         // 11: "repeat class/family"
332         ++it;
333         mask_data.repeat_class_family = *it;
334         
335         // 12: "position in"
336         ++it;
337         /* not used */
338         
339         // 13: "in end"
340         ++it;
341         /* not used */
342         
343         // 14: "repeat left"
344         ++it;
345         /* not used */
346         
347         // 15: "ID"
348         ++it;
349         /* not used */
350         
351     }
352     catch( ... ) {
353         return false;
354     }
355     
356     return true;
357 }
358 
359 
360 bool CRmOutReader::VerifyData( const CMaskData& mask_data )
361 {
362     //
363     //  This would be the place for any higher level checks of the mask data
364     //  collected from the record ...
365     // 
366     return true;
367 }
368 
369 
370 bool CRmOutReader::MakeFeature( const CMaskData& mask_data, CRef<CSeq_feat>& feat,
371                                 TFlags flags )
372 {
373     feat.Reset( new CSeq_feat );
374     feat->ResetLocation();
375     
376     //  data:
377     CSeqFeatData& sfdata = feat->SetData();
378     CImp_feat_Base& imp = sfdata.SetImp();
379     imp.SetKey( "repeat_region" );
380     
381     //  location:
382     CRef<CSeq_loc> location( new CSeq_loc );
383     CSeq_interval& interval = location->SetInt();
384     interval.SetFrom( min( mask_data.outer_pos_begin, mask_data.outer_pos_end ) -1 );
385     interval.SetTo( max( mask_data.outer_pos_begin, mask_data.outer_pos_end ) -1 );
386     interval.SetStrand( strcmp( mask_data.strand.c_str(), "C" ) ? 
387         eNa_strand_plus : eNa_strand_minus );
388 
389     CBioseq::TId ids;
390     CSeq_id::ParseFastaIds(ids, mask_data.query_sequence);
391     location->SetId(*FindBestChoice(ids, CSeq_id::Score));
392 
393     feat->SetLocation( *location );
394 
395     //  qualifiers:
396     if (flags) {
397         CSeq_feat::TQual& qual_list = feat->SetQual();
398         
399         if (flags & fIncludeRepeatName) {
400             CRef<CGb_qual> repeat( new CGb_qual );
401             repeat->SetQual( "repeat_region" );
402             repeat->SetVal( mask_data.matching_repeat );
403             qual_list.push_back( repeat );
404         }
405 
406         if (flags & fIncludeRepeatClass) {
407             CRef<CGb_qual> rpt_family( new CGb_qual );
408             rpt_family->SetQual( "rpt_family" );
409             rpt_family->SetVal( mask_data.repeat_class_family );
410             qual_list.push_back( rpt_family );
411         }
412 
413         if (flags & fIncludeStatistics) {
414             CRef<CGb_qual> sw_score( new CGb_qual );
415             sw_score->SetQual( "sw_score" );
416             sw_score->SetVal( NStr::IntToString( mask_data.sw_score ) );
417             qual_list.push_back( sw_score );
418             
419             CRef<CGb_qual> perc_div( new CGb_qual );
420             perc_div->SetQual( "perc_div" );
421             perc_div->SetVal( NStr::DoubleToString( mask_data.perc_div ) );
422             qual_list.push_back( perc_div );
423             
424             CRef<CGb_qual> perc_del( new CGb_qual );
425             perc_del->SetQual( "perc_del" );
426             perc_del->SetVal( NStr::DoubleToString( mask_data.perc_del ) );
427             qual_list.push_back( perc_del );
428             
429             CRef<CGb_qual> perc_ins( new CGb_qual );
430             perc_ins->SetQual( "perc_ins" );
431             perc_ins->SetVal( NStr::DoubleToString( mask_data.perc_ins ) );
432             qual_list.push_back( perc_ins );
433         }
434     }
435     
436     return true;
437 }
438 
439 
440 CRmReader* CRmReader::OpenReader( CNcbiIstream& InStream )
441 {
442     //
443     //  This is the point to make sure we are dealing with the right file type and
444     //  to allocate the specialist reader for any subtype (OUT, HTML) we encouter.
445     //  When this function returns the file pointer should be past the file header
446     //  and at the beginning of the actual mask data.
447     //
448     //  Note:
449     //  If something goes wrong during header processing then the file pointer will
450     //  still be modified. It's the caller's job to restore the file pointer if this
451     //  is possible for this type of stream.
452     //
453     
454     //
455     //  2006-03-31: Only supported file type at this time: ReadMasker OUT.
456     //
457     return new CRmOutReader( InStream );
458 }
459 
460 
461 void CRmReader::CloseReader( CRmReader* pReader )
462 {
463     delete pReader;
464 }
465 
466 
467 END_objects_SCOPE
468 END_NCBI_SCOPE
469 

source navigation ]   [ diff markup ]   [ identifier search ]   [ freetext search ]   [ file search ]  

This page was automatically generated by the LXR engine.
Visit the LXR main site for more information.