src/objtools/readers/rm_reader.cpp

Go to the documentation of this file.
00001 /*  $Id: rm_reader.cpp 112468 2007-10-18 15:39:21Z ivanovp $
00002  * ===========================================================================
00003  *
00004  *                            PUBLIC DOMAIN NOTICE
00005  *               National Center for Biotechnology Information
00006  *
00007  *  This software/database is a "United States Government Work" under the
00008  *  terms of the United States Copyright Act.  It was written as part of
00009  *  the author's official duties as a United States Government employee and
00010  *  thus cannot be copyrighted.  This software/database is freely available
00011  *  to the public for use. The National Library of Medicine and the U.S.
00012  *  Government have not placed any restriction on its use or reproduction.
00013  *
00014  *  Although all reasonable efforts have been taken to ensure the accuracy
00015  *  and reliability of the software and data, the NLM and the U.S.
00016  *  Government do not and cannot warrant the performance or results that
00017  *  may be obtained by using this software or data. The NLM and the U.S.
00018  *  Government disclaim all warranties, express or implied, including
00019  *  warranties of performance, merchantability or fitness for any particular
00020  *  purpose.
00021  *
00022  *  Please cite the author in any work or product based on this material.
00023  *
00024  * ===========================================================================
00025  *
00026  * Author:  Frank Ludwig
00027  *
00028  * File Description:
00029  *   Repeat Masker file reader
00030  *
00031  */
00032 
00033 #include <ncbi_pch.hpp>
00034 #include <corelib/ncbistd.hpp>
00035 #include <corelib/ncbithr.hpp>
00036 #include <corelib/ncbiutil.hpp>
00037 #include <corelib/ncbiexpt.hpp>
00038 
00039 #include <util/static_map.hpp>
00040 
00041 #include <serial/iterator.hpp>
00042 #include <serial/objistrasn.hpp>
00043 
00044 // Objects includes
00045 #include <objects/general/Int_fuzz.hpp>
00046 #include <objects/general/Object_id.hpp>
00047 #include <objects/general/User_object.hpp>
00048 #include <objects/general/User_field.hpp>
00049 #include <objects/general/Dbtag.hpp>
00050 
00051 #include <objects/seqloc/Seq_id.hpp>
00052 #include <objects/seqloc/Seq_loc.hpp>
00053 #include <objects/seqloc/Seq_interval.hpp>
00054 #include <objects/seqloc/Seq_point.hpp>
00055 
00056 #include <objects/seq/Seq_annot.hpp>
00057 #include <objects/seq/Annotdesc.hpp>
00058 #include <objects/seq/Annot_descr.hpp>
00059 #include <objects/seqfeat/SeqFeatData.hpp>
00060 
00061 #include <objects/seqfeat/Seq_feat.hpp>
00062 #include <objects/seqfeat/BioSource.hpp>
00063 #include <objects/seqfeat/Org_ref.hpp>
00064 #include <objects/seqfeat/OrgName.hpp>
00065 #include <objects/seqfeat/SubSource.hpp>
00066 #include <objects/seqfeat/OrgMod.hpp>
00067 #include <objects/seqfeat/Gene_ref.hpp>
00068 #include <objects/seqfeat/Cdregion.hpp>
00069 #include <objects/seqfeat/Code_break.hpp>
00070 #include <objects/seqfeat/Genetic_code.hpp>
00071 #include <objects/seqfeat/Genetic_code_table.hpp>
00072 #include <objects/seqfeat/RNA_ref.hpp>
00073 #include <objects/seqfeat/Trna_ext.hpp>
00074 #include <objects/seqfeat/Imp_feat.hpp>
00075 #include <objects/seqfeat/Gb_qual.hpp>
00076 
00077 #include <objtools/readers/reader_exception.hpp>
00078 #include <objtools/readers/rm_reader.hpp>
00079 #include <objtools/error_codes.hpp>
00080 
00081 #include <algorithm>
00082 
00083 
00084 #define NCBI_USE_ERRCODE_X   Objtools_Rd_RepMask
00085 
00086 BEGIN_NCBI_SCOPE
00087 
00088 BEGIN_objects_SCOPE // namespace ncbi::objects::
00089 
00090 struct CMaskData;
00091 
00092 //-----------------------------------------------------------------------------
00093 class CRmOutReader: public CRmReader
00094 //-----------------------------------------------------------------------------
00095 {
00096     friend CRmReader* CRmReader::OpenReader( CNcbiIstream& );
00097     
00098     //
00099     //  object management:
00100     //
00101 protected:
00102     CRmOutReader( CNcbiIstream& );
00103 public:
00104     virtual ~CRmOutReader();
00105     
00106     //
00107     //  interface:
00108     //
00109 public:
00110     virtual void Read( CRef<CSeq_annot>, TFlags flags = fDefaults);
00111 
00112     //
00113     //  internal helpers:
00114     //
00115 protected:
00116     virtual bool IsHeaderLine( const string& );
00117     virtual bool IsIgnoredLine( const string& ); 
00118     
00119     virtual bool ParseRecord( const string& record, CMaskData& );
00120     virtual bool VerifyData( const CMaskData& );
00121     virtual bool MakeFeature( const CMaskData&, CRef<CSeq_feat>&, TFlags flags);
00122     
00123     //
00124     //  data:
00125     //
00126 protected:
00127     static const unsigned long BUFFERSIZE = 256;
00128     char pReadBuffer[ BUFFERSIZE ];
00129 };
00130 
00131 //-----------------------------------------------------------------------------
00132 struct CMaskData
00133 //-----------------------------------------------------------------------------
00134 {
00135     unsigned long sw_score;
00136     unsigned long outer_pos_begin;
00137     unsigned long outer_pos_end;
00138     double perc_div;
00139     double perc_del;
00140     double perc_ins;
00141     string query_sequence;
00142     string strand;
00143     string matching_repeat;
00144     string repeat_class_family;
00145 };
00146     
00147 
00148 CRmReader::CRmReader( CNcbiIstream& InStream )
00149     :
00150     m_InStream( InStream )
00151 {
00152 }
00153 
00154 
00155 CRmReader::~CRmReader()
00156 {
00157 }
00158 
00159 
00160 CRmOutReader::CRmOutReader( CNcbiIstream& InStream )
00161     :
00162     CRmReader( InStream )
00163 {
00164 }
00165 
00166 
00167 CRmOutReader::~CRmOutReader()
00168 {
00169 }
00170 
00171 
00172 void CRmOutReader::Read( CRef<CSeq_annot> entry, TFlags flags )
00173 {
00174     const size_t MAX_ERROR_COUNT = 5;
00175     
00176     string line;
00177     CSeq_annot::C_Data::TFtable& ftable = entry->SetData().SetFtable();
00178     CRef<CSeq_feat> feat;
00179     
00180     size_t line_counter = 0;
00181     size_t record_counter = 0;
00182     size_t error_counter = 0;
00183     
00184     while ( ! m_InStream.eof() ) {
00185 
00186         NcbiGetlineEOL( m_InStream, line );
00187         ++line_counter;
00188         
00189         if ( IsHeaderLine( line ) || IsIgnoredLine( line ) ) {
00190             continue;
00191         }
00192         ++record_counter;
00193         
00194         CMaskData mask_data;
00195         if ( ! ParseRecord( line, mask_data ) ) {
00196             ++error_counter;
00197             LOG_POST_X( 1, Error << "Rmo Reader: Parse error in record " 
00198                 << record_counter << " (line " << line_counter 
00199                 << "). Record skipped" );
00200             if ( error_counter < MAX_ERROR_COUNT ) {
00201                 continue;
00202             }
00203             else {
00204                 break;
00205             }
00206         }
00207         
00208         if ( ! VerifyData( mask_data ) ) {
00209             ++error_counter;
00210             LOG_POST_X( 2, Error << "Rmo Reader: Verification error in record " 
00211                 << record_counter << " (line " << line_counter 
00212                 << "). Record skipped." );
00213             if ( error_counter < MAX_ERROR_COUNT ) {
00214                 continue;
00215             }
00216             else {
00217                 break;
00218             }
00219         }
00220         
00221         if ( ! MakeFeature( mask_data, feat, flags ) ) {
00222             // we don't tolerate even a few errors here!
00223             error_counter = MAX_ERROR_COUNT;
00224             LOG_POST_X( 3, Error << "Rmo Reader: Unable to create feature table for record " 
00225                 << record_counter << " (line " << line_counter 
00226                 << "). Aborting file import." );
00227             break;
00228         }
00229         
00230         ftable.push_back( feat );
00231     }
00232     
00233     if ( error_counter == MAX_ERROR_COUNT ) {
00234         LOG_POST_X( 4, Error << "Rmo Reader: File import aborted due to error count or severity." );
00235         throw 0; // upper layer catches everything in sight and reports error to file_loader.
00236     }
00237 }
00238 
00239 
00240 bool CRmOutReader::IsHeaderLine( const string& line )
00241 {
00242     string labels_1st_line[] = { "SW", "perc", "query", "position", "matching", "" };
00243     string labels_2nd_line[] = { "score", "div.", "del.", "ins.", "sequence", "" };
00244 
00245     // try to identify 1st line of column labels:
00246     size_t current_offset = 0;
00247     size_t i = 0;
00248     for ( ; labels_1st_line[i] != ""; ++i ) {
00249         current_offset = NStr::FindCase( line, labels_1st_line[i], current_offset );
00250         if ( NPOS == current_offset ) {
00251             break;
00252         }
00253     }
00254     if ( labels_1st_line[i] == "" ) {
00255         return true;
00256     }
00257     
00258     // try to identify 2nd line of column labels:
00259     current_offset = 0;
00260     i = 0;
00261     for ( ; labels_2nd_line[i] != ""; ++i ) {
00262         current_offset = NStr::FindCase( line, labels_2nd_line[i], current_offset );
00263         if ( NPOS == current_offset ) {
00264             return false;
00265         }
00266     }
00267     return true;
00268 }
00269 
00270 
00271 bool CRmOutReader::IsIgnoredLine( const string& line )
00272 {
00273     //
00274     //  Currently, only lines with only whitespace on them are ignored.
00275     //
00276     return ( NStr::TruncateSpaces( line ).length() == 0 );
00277 }
00278 
00279 
00280 bool CRmOutReader::ParseRecord( const string& record, CMaskData& mask_data )
00281 {
00282     const size_t MIN_VALUE_COUNT = 15;
00283     
00284     string line = NStr::TruncateSpaces( record );
00285     list< string > values;
00286     if ( NStr::Split( line, " \t", values ).size() < MIN_VALUE_COUNT ) {
00287         return false;
00288     }
00289     
00290     try {
00291         // 1: "SW score"
00292         list<string>::iterator it = values.begin();
00293         mask_data.sw_score = NStr::StringToUInt( *it );
00294         
00295         // 2: "perc div."
00296         ++it;
00297         mask_data.perc_div = NStr::StringToDouble( *it );
00298         
00299         // 3: "perc del."
00300         ++it;
00301         mask_data.perc_del = NStr::StringToDouble( *it );
00302         
00303         // 4: "perc ins."
00304         ++it;
00305         mask_data.perc_ins = NStr::StringToDouble( *it );
00306         
00307         // 5: "query sequence"
00308         ++it;
00309         mask_data.query_sequence = *it;
00310         
00311         // 6: "position begin"
00312         ++it;
00313         mask_data.outer_pos_begin = NStr::StringToUInt( *it );
00314         
00315         // 7: "in end"
00316         ++it;
00317         mask_data.outer_pos_end = NStr::StringToUInt( *it );
00318         
00319         // 8: "query (left)"
00320         ++it;
00321         /* not used */
00322         
00323         // 9: "" (meaning "strand")
00324         ++it;
00325         mask_data.strand = *it;
00326         
00327         // 10: "matching repeat"
00328         ++it;
00329         mask_data.matching_repeat = *it;
00330         
00331         // 11: "repeat class/family"
00332         ++it;
00333         mask_data.repeat_class_family = *it;
00334         
00335         // 12: "position in"
00336         ++it;
00337         /* not used */
00338         
00339         // 13: "in end"
00340         ++it;
00341         /* not used */
00342         
00343         // 14: "repeat left"
00344         ++it;
00345         /* not used */
00346         
00347         // 15: "ID"
00348         ++it;
00349         /* not used */
00350         
00351     }
00352     catch( ... ) {
00353         return false;
00354     }
00355     
00356     return true;
00357 }
00358 
00359 
00360 bool CRmOutReader::VerifyData( const CMaskData& mask_data )
00361 {
00362     //
00363     //  This would be the place for any higher level checks of the mask data
00364     //  collected from the record ...
00365     // 
00366     return true;
00367 }
00368 
00369 
00370 bool CRmOutReader::MakeFeature( const CMaskData& mask_data, CRef<CSeq_feat>& feat,
00371                                 TFlags flags )
00372 {
00373     feat.Reset( new CSeq_feat );
00374     feat->ResetLocation();
00375     
00376     //  data:
00377     CSeqFeatData& sfdata = feat->SetData();
00378     CImp_feat_Base& imp = sfdata.SetImp();
00379     imp.SetKey( "repeat_region" );
00380     
00381     //  location:
00382     CRef<CSeq_loc> location( new CSeq_loc );
00383     CSeq_interval& interval = location->SetInt();
00384     interval.SetFrom( min( mask_data.outer_pos_begin, mask_data.outer_pos_end ) -1 );
00385     interval.SetTo( max( mask_data.outer_pos_begin, mask_data.outer_pos_end ) -1 );
00386     interval.SetStrand( strcmp( mask_data.strand.c_str(), "C" ) ? 
00387         eNa_strand_plus : eNa_strand_minus );
00388 
00389     CBioseq::TId ids;
00390     CSeq_id::ParseFastaIds(ids, mask_data.query_sequence);
00391     location->SetId(*FindBestChoice(ids, CSeq_id::Score));
00392 
00393     feat->SetLocation( *location );
00394 
00395     //  qualifiers:
00396     if (flags) {
00397         CSeq_feat::TQual& qual_list = feat->SetQual();
00398         
00399         if (flags & fIncludeRepeatName) {
00400             CRef<CGb_qual> repeat( new CGb_qual );
00401             repeat->SetQual( "repeat_region" );
00402             repeat->SetVal( mask_data.matching_repeat );
00403             qual_list.push_back( repeat );
00404         }
00405 
00406         if (flags & fIncludeRepeatClass) {
00407             CRef<CGb_qual> rpt_family( new CGb_qual );
00408             rpt_family->SetQual( "rpt_family" );
00409             rpt_family->SetVal( mask_data.repeat_class_family );
00410             qual_list.push_back( rpt_family );
00411         }
00412 
00413         if (flags & fIncludeStatistics) {
00414             CRef<CGb_qual> sw_score( new CGb_qual );
00415             sw_score->SetQual( "sw_score" );
00416             sw_score->SetVal( NStr::IntToString( mask_data.sw_score ) );
00417             qual_list.push_back( sw_score );
00418             
00419             CRef<CGb_qual> perc_div( new CGb_qual );
00420             perc_div->SetQual( "perc_div" );
00421             perc_div->SetVal( NStr::DoubleToString( mask_data.perc_div ) );
00422             qual_list.push_back( perc_div );
00423             
00424             CRef<CGb_qual> perc_del( new CGb_qual );
00425             perc_del->SetQual( "perc_del" );
00426             perc_del->SetVal( NStr::DoubleToString( mask_data.perc_del ) );
00427             qual_list.push_back( perc_del );
00428             
00429             CRef<CGb_qual> perc_ins( new CGb_qual );
00430             perc_ins->SetQual( "perc_ins" );
00431             perc_ins->SetVal( NStr::DoubleToString( mask_data.perc_ins ) );
00432             qual_list.push_back( perc_ins );
00433         }
00434     }
00435     
00436     return true;
00437 }
00438 
00439 
00440 CRmReader* CRmReader::OpenReader( CNcbiIstream& InStream )
00441 {
00442     //
00443     //  This is the point to make sure we are dealing with the right file type and
00444     //  to allocate the specialist reader for any subtype (OUT, HTML) we encouter.
00445     //  When this function returns the file pointer should be past the file header
00446     //  and at the beginning of the actual mask data.
00447     //
00448     //  Note:
00449     //  If something goes wrong during header processing then the file pointer will
00450     //  still be modified. It's the caller's job to restore the file pointer if this
00451     //  is possible for this type of stream.
00452     //
00453     
00454     //
00455     //  2006-03-31: Only supported file type at this time: ReadMasker OUT.
00456     //
00457     return new CRmOutReader( InStream );
00458 }
00459 
00460 
00461 void CRmReader::CloseReader( CRmReader* pReader )
00462 {
00463     delete pReader;
00464 }
00465 
00466 
00467 END_objects_SCOPE
00468 END_NCBI_SCOPE
00469 
00470 

Generated on Sun Feb 15 02:30:07 2009 for NCBI C++ ToolKit by  doxygen 1.4.6
Modified on Sun Feb 15 15:27:33 2009 by modify_doxy.py rev. 117643