src/objtools/validator/validerror_feat.cpp

Go to the documentation of this file.
00001 /*  $Id: validerror_feat.cpp 153919 2009-03-05 16:26:54Z bollin $
00002  * ===========================================================================
00003  *
00004  *                            PUBLIC DOMAIN NOTICE
00005  *               National Center for Biotechnology Information
00006  *
00007  *  This software/database is a "United States Government Work" under the
00008  *  terms of the United States Copyright Act.  It was written as part of
00009  *  the author's official duties as a United States Government employee and
00010  *  thus cannot be copyrighted.  This software/database is freely available
00011  *  to the public for use. The National Library of Medicine and the U.S.
00012  *  Government have not placed any restriction on its use or reproduction.
00013  *
00014  *  Although all reasonable efforts have been taken to ensure the accuracy
00015  *  and reliability of the software and data, the NLM and the U.S.
00016  *  Government do not and cannot warrant the performance or results that
00017  *  may be obtained by using this software or data. The NLM and the U.S.
00018  *  Government disclaim all warranties, express or implied, including
00019  *  warranties of performance, merchantability or fitness for any particular
00020  *  purpose.
00021  *
00022  *  Please cite the author in any work or product based on this material.
00023  *
00024  * ===========================================================================
00025  *
00026  * Author:  Jonathan Kans, Clifford Clausen, Aaron Ucko......
00027  *
00028  * File Description:
00029  *   validation of Seq_feat
00030  *   .......
00031  *
00032  */
00033 #include <ncbi_pch.hpp>
00034 #include <corelib/ncbistd.hpp>
00035 #include <corelib/ncbistr.hpp>
00036 #include "validatorp.hpp"
00037 #include "utilities.hpp"
00038 
00039 #include <serial/serialbase.hpp>
00040 
00041 #include <objmgr/bioseq_handle.hpp>
00042 #include <objmgr/seq_entry_handle.hpp>
00043 #include <objmgr/feat_ci.hpp>
00044 #include <objmgr/seqdesc_ci.hpp>
00045 #include <objmgr/seq_vector.hpp>
00046 #include <objmgr/scope.hpp>
00047 #include <objmgr/util/sequence.hpp>
00048 #include <objmgr/util/feature.hpp>
00049 
00050 #include <objects/seqfeat/Seq_feat.hpp>
00051 #include <objects/seqfeat/BioSource.hpp>
00052 #include <objects/seqfeat/Cdregion.hpp>
00053 #include <objects/seqfeat/Code_break.hpp>
00054 #include <objects/seqfeat/Gb_qual.hpp>
00055 #include <objects/seqfeat/Genetic_code.hpp>
00056 #include <objects/seqfeat/Genetic_code_table.hpp>
00057 #include <objects/seqfeat/Imp_feat.hpp>
00058 #include <objects/seqfeat/Org_ref.hpp>
00059 #include <objects/seqfeat/Prot_ref.hpp>
00060 #include <objects/seqfeat/RNA_ref.hpp>
00061 #include <objects/seqfeat/SubSource.hpp>
00062 #include <objects/seqfeat/Trna_ext.hpp>
00063 
00064 #include <objects/seqloc/Seq_loc.hpp>
00065 #include <objects/seqloc/Seq_interval.hpp>
00066 #include <objects/seqloc/Seq_point.hpp>
00067 #include <objects/seqloc/Textseq_id.hpp>
00068 
00069 #include <objects/seqset/Seq_entry.hpp>
00070 #include <objects/seqset/Bioseq_set.hpp>
00071 
00072 #include <objects/seq/MolInfo.hpp>
00073 #include <objects/seq/Bioseq.hpp>
00074 #include <objects/seq/seqport_util.hpp>
00075 
00076 #include <objects/pub/Pub.hpp>
00077 #include <objects/pub/Pub_set.hpp>
00078 
00079 #include <objects/general/Dbtag.hpp>
00080 
00081 #include <util/static_set.hpp>
00082 #include <util/sequtil/sequtil_convert.hpp>
00083 
00084 
00085 #include <algorithm>
00086 #include <string>
00087 
00088 
00089 BEGIN_NCBI_SCOPE
00090 BEGIN_SCOPE(objects)
00091 BEGIN_SCOPE(validator)
00092 using namespace sequence;
00093 
00094 
00095 // =============================================================================
00096 //                                     Public
00097 // =============================================================================
00098 
00099 
00100 CValidError_feat::CValidError_feat(CValidError_imp& imp) :
00101     CValidError_base(imp),
00102     m_NumGenes(0),
00103     m_NumGeneXrefs(0)
00104 {
00105 }
00106 
00107 
00108 CValidError_feat::~CValidError_feat(void)
00109 {
00110 }
00111 
00112 
00113 void CValidError_feat::ValidateSeqFeat(const CSeq_feat& feat, bool is_insd_in_sep)
00114 {
00115     if ( !feat.CanGetLocation() ) {
00116         PostErr(eDiag_Critical, eErr_SEQ_FEAT_MissingLocation,
00117             "The feature is missing a location", feat);
00118         return;
00119     }
00120     _ASSERT(feat.CanGetLocation());
00121 
00122     CBioseq_Handle bsh = m_Scope->GetBioseqHandle(feat.GetLocation());
00123     m_Imp.ValidateSeqLoc(feat.GetLocation(), bsh, "Location", feat);
00124     x_ValidateSeqFeatLoc(feat);
00125 
00126     if ( feat.CanGetProduct() ) {
00127         ValidateSeqFeatProduct(feat.GetProduct(), feat);
00128     }
00129     
00130     ValidateFeatPartialness(feat);
00131     
00132     ValidateExcept(feat);
00133     
00134     ValidateSeqFeatData(feat.GetData(), feat, is_insd_in_sep);
00135   
00136     ValidateBothStrands (feat);
00137     
00138     if (feat.CanGetDbxref ()) {
00139         m_Imp.ValidateDbxref (feat.GetDbxref (), feat);
00140     }
00141     
00142     if ( feat.CanGetComment() ) {
00143         ValidateFeatComment(feat.GetComment(), feat);
00144     }
00145 
00146     if ( feat.CanGetCit() ) {
00147         ValidateFeatCit(feat.GetCit(), feat);
00148     }
00149 
00150     const CGene_ref* gene_xref = feat.GetGeneXref();
00151     if ( gene_xref != 0  &&  !gene_xref->IsSuppressed() ) {
00152         ++m_NumGeneXrefs;
00153     }
00154 }
00155 
00156 
00157 // =============================================================================
00158 //                                     Private
00159 // =============================================================================
00160 
00161 
00162 // static member initializations
00163 const string s_PlastidTxt[] = {
00164   "",
00165   "",
00166   "chloroplast",
00167   "chromoplast",
00168   "",
00169   "",
00170   "plastid",
00171   "",
00172   "",
00173   "",
00174   "",
00175   "",
00176   "cyanelle",
00177   "",
00178   "",
00179   "",
00180   "apicoplast",
00181   "leucoplast",
00182   "proplastid",
00183   ""
00184 };
00185 
00186 
00187 static string s_LegalRepeatTypes[] = {
00188   "tandem", "inverted", "flanking", "terminal",
00189   "direct", "dispersed", "other"
00190 };
00191 
00192 
00193 static string s_LegalConsSpliceStrings[] = {
00194   "(5'site:YES, 3'site:YES)",
00195   "(5'site:YES, 3'site:NO)",
00196   "(5'site:YES, 3'site:ABSENT)",
00197   "(5'site:NO, 3'site:YES)",
00198   "(5'site:NO, 3'site:NO)",
00199   "(5'site:NO, 3'site:ABSENT)",
00200   "(5'site:ABSENT, 3'site:YES)",
00201   "(5'site:ABSENT, 3'site:NO)",
00202   "(5'site:ABSENT, 3'site:ABSENT)"
00203 };
00204 
00205 
00206 static bool s_IsLocRefSeqMrna(const CSeq_loc& loc, CScope& scope)
00207 {
00208     CBioseq_Handle bsh = scope.GetBioseqHandle(loc);
00209     if ( bsh ) {
00210         FOR_EACH_SEQID_ON_BIOSEQ (it, *(bsh.GetBioseqCore())) {
00211             if ( (*it)->IdentifyAccession() == CSeq_id::eAcc_refseq_mrna ) {
00212                 return true;
00213             }
00214         }
00215     }
00216 
00217     return false;
00218 }
00219 
00220 
00221 static bool s_IsLocGEDL(const CSeq_loc& loc, CScope& scope)
00222 {
00223     CBioseq_Handle bsh = scope.GetBioseqHandle(loc);
00224     if ( bsh ) {
00225         FOR_EACH_SEQID_ON_BIOSEQ (it, *(bsh.GetBioseqCore())) {
00226             CSeq_id::EAccessionInfo acc_info = (*it)->IdentifyAccession();
00227             if ( acc_info == CSeq_id::eAcc_gb_embl_ddbj  ||
00228                  acc_info == CSeq_id::eAcc_local ) {
00229                 return true;
00230             }
00231         }
00232     }
00233 
00234     return false;
00235 }
00236 
00237 
00238 // private member functions:
00239 
00240 void CValidError_feat::ValidateSeqFeatData
00241 (const CSeqFeatData& data,
00242  const CSeq_feat& feat,
00243  bool is_insd_in_sep)
00244 {
00245     switch ( data.Which () ) {
00246     case CSeqFeatData::e_Gene:
00247         // Validate CGene_ref
00248         ValidateGene(data.GetGene (), feat);
00249         break;
00250     case CSeqFeatData::e_Cdregion:
00251         // Validate CCdregion
00252         ValidateCdregion(data.GetCdregion (), feat);
00253         break;
00254     case CSeqFeatData::e_Prot:
00255         // Validate CProt_ref
00256         ValidateProt(data.GetProt (), feat);
00257         break;
00258     case CSeqFeatData::e_Rna:
00259         // Validate CRNA_ref
00260         ValidateRna(data.GetRna (), feat);
00261         break;
00262     case CSeqFeatData::e_Pub:
00263         // Validate CPubdesc
00264         m_Imp.ValidatePubdesc(data.GetPub (), feat);
00265         break;
00266     case CSeqFeatData::e_Imp:
00267         // Validate CPubdesc
00268         ValidateImp(data.GetImp (), feat, is_insd_in_sep);
00269         break;
00270     case CSeqFeatData::e_Biosrc:
00271         // Validate CBioSource
00272         ValidateFeatBioSource(data.GetBiosrc(), feat);
00273         break;
00274 
00275     case CSeqFeatData::e_Org:
00276     case CSeqFeatData::e_Region:
00277     case CSeqFeatData::e_Seq:
00278     case CSeqFeatData::e_Comment:
00279     case CSeqFeatData::e_Bond:
00280     case CSeqFeatData::e_Site:
00281     case CSeqFeatData::e_Rsite:
00282     case CSeqFeatData::e_User:
00283     case CSeqFeatData::e_Txinit:
00284     case CSeqFeatData::e_Num:
00285     case CSeqFeatData::e_Psec_str:
00286     case CSeqFeatData::e_Non_std_residue:
00287     case CSeqFeatData::e_Het:
00288         break;
00289 
00290     default:
00291         PostErr(eDiag_Error, eErr_SEQ_FEAT_InvalidType,
00292             "Invalid SeqFeat type [" + CSeqFeatData::SelectionName(data.Which()) + "]",
00293             feat);
00294         break;
00295     }
00296 
00297     if ( !data.IsGene() ) {
00298         ValidateGeneXRef(feat);
00299     } else {
00300         ValidateOperon(feat);
00301     }
00302     
00303     if ( !data.IsBond() ) {
00304         ValidateBondLocs(feat);
00305     }
00306 }
00307 
00308 
00309 void CValidError_feat::ValidateSeqFeatProduct
00310 (const CSeq_loc& prod, const CSeq_feat& feat)
00311 {
00312     CBioseq_Handle bsh = m_Scope->GetBioseqHandle(feat.GetProduct());
00313     m_Imp.ValidateSeqLoc(feat.GetProduct(), bsh, "Product", feat);
00314     
00315     if ( IsOneBioseq(prod, m_Scope) ) {
00316         const CSeq_id& sid = GetId(prod, m_Scope);
00317     
00318         switch ( sid.Which() ) {
00319         case CSeq_id::e_Genbank:
00320         case CSeq_id::e_Embl:
00321         case CSeq_id::e_Ddbj:
00322         case CSeq_id::e_Tpg:
00323         case CSeq_id::e_Tpe:
00324         case CSeq_id::e_Tpd:
00325             {
00326                 const CTextseq_id* tsid = sid.GetTextseq_Id();
00327                 if ( tsid != NULL ) {
00328                     if ( !tsid->CanGetAccession()  &&  tsid->CanGetName() ) {
00329                         if ( m_Imp.IsNucAcc(tsid->GetName()) ) {
00330                             PostErr(eDiag_Warning, eErr_SEQ_FEAT_BadProductSeqId,
00331                                 "Feature product should not use "
00332                                 "Textseq-id 'name' slot", feat);
00333                         }
00334                     }
00335                 }
00336             }
00337             break;
00338             
00339         default:
00340             break;
00341         }
00342     }
00343 }
00344 
00345 
00346 bool CValidError_feat::IsPlastid(int genome)
00347 {
00348     if ( genome == CBioSource::eGenome_chloroplast  ||
00349          genome == CBioSource::eGenome_chromoplast  ||
00350          genome == CBioSource::eGenome_plastid      ||
00351          genome == CBioSource::eGenome_cyanelle     ||
00352          genome == CBioSource::eGenome_apicoplast   ||
00353          genome == CBioSource::eGenome_leucoplast   ||
00354          genome == CBioSource::eGenome_proplastid  ) { 
00355         return true;
00356     }
00357 
00358     return false;
00359 }
00360 
00361 
00362 bool CValidError_feat::IsOverlappingGenePseudo(const CSeq_feat& feat)
00363 {
00364     const CGene_ref* grp = feat.GetGeneXref();
00365     if ( grp  ) {
00366         return (grp->CanGetPseudo()  &&  grp->GetPseudo());
00367     }
00368 
00369     // !!! DEBUG {
00370     // For testing purposes. Remove when test is done.
00371     if ( m_Imp.AvoidPerfBottlenecks() ) {
00372         return false;
00373     }
00374     // }
00375 
00376     // check overlapping gene
00377     CConstRef<CSeq_feat> overlap = 
00378         GetOverlappingGene(feat.GetLocation(), *m_Scope);
00379     if ( overlap ) {
00380         if ( (overlap->CanGetPseudo()  &&  overlap->GetPseudo())  ||
00381              (overlap->GetData().GetGene().CanGetPseudo()  &&
00382               overlap->GetData().GetGene().GetPseudo()) ) {
00383             return true;
00384         }
00385     }
00386 
00387     return false;
00388 }
00389 
00390 
00391 bool CValidError_feat::SuppressCheck(const string& except_text)
00392 {
00393     static string exceptions[] = {
00394         "ribosomal slippage",
00395         "artificial frameshift",
00396         "nonconsensus splice site"
00397     };
00398 
00399     for ( size_t i = 0; i < sizeof(exceptions) / sizeof(string); ++i ) {
00400     if ( NStr::FindNoCase(except_text, exceptions[i] ) != string::npos )
00401          return true;
00402     }
00403     return false;
00404 }
00405 
00406 
00407 
00408 
00409 
00410 unsigned char CValidError_feat::Residue(unsigned char res)
00411 {
00412     return res == 255 ? '?' : res;
00413 }
00414 
00415 
00416 int CValidError_feat::CheckForRaggedEnd
00417 (const CSeq_loc& loc, 
00418  const CCdregion& cdregion)
00419 {
00420     size_t len = GetLength(loc, m_Scope);
00421     if ( cdregion.GetFrame() > CCdregion::eFrame_one ) {
00422         len -= cdregion.GetFrame() - 1;
00423     }
00424 
00425     int ragged = len % 3;
00426     if ( ragged > 0 ) {
00427         len = GetLength(loc, m_Scope);
00428         size_t last_pos = 0;
00429 
00430         CSeq_loc::TRange range = CSeq_loc::TRange::GetEmpty();
00431         FOR_EACH_CODEBREAK_ON_CDREGION (cbr, cdregion) {
00432             SRelLoc rl(loc, (*cbr)->GetLoc(), m_Scope);
00433             ITERATE (SRelLoc::TRanges, rit, rl.m_Ranges) {
00434                 if ((*rit)->GetTo() > last_pos) {
00435                     last_pos = (*rit)->GetTo();
00436                 }
00437             }
00438         }
00439 
00440         // allowing a partial codon at the end
00441         TSeqPos codon_length = range.GetLength();
00442         if ( (codon_length == 0 || codon_length == 1)  && 
00443             last_pos == len - 1 ) {
00444             ragged = 0;
00445         }
00446     }
00447     return ragged;
00448 }
00449 
00450 
00451 string CValidError_feat::MapToNTCoords
00452 (const CSeq_feat& feat,
00453  const CSeq_loc& product,
00454  TSeqPos pos)
00455 {
00456     string result;
00457 
00458     CSeq_point pnt;
00459     pnt.SetPoint(pos);
00460     pnt.SetStrand( GetStrand(product, m_Scope) );
00461 
00462     try {
00463         pnt.SetId().Assign(GetId(product, m_Scope));
00464     } catch (CObjmgrUtilException) {}
00465 
00466     CSeq_loc tmp;
00467     tmp.SetPnt(pnt);
00468     CRef<CSeq_loc> loc = ProductToSource(feat, tmp, 0, m_Scope);
00469     
00470     loc->GetLabel(&result);
00471 
00472     return result;
00473 }
00474 
00475 
00476 void CValidError_feat::ValidateFeatPartialness(const CSeq_feat& feat)
00477 {
00478     static const string parterr[2] = { "PartialProduct", "PartialLocation" };
00479     static const string parterrs[4] = {
00480         "Start does not include first/last residue of sequence",
00481         "Stop does not include first/last residue of sequence",
00482         "Internal partial intervals do not include first/last residue of sequence",
00483         "Improper use of partial (greater than or less than)"
00484     };
00485 
00486     unsigned int  partial_prod = eSeqlocPartial_Complete, 
00487                   partial_loc  = eSeqlocPartial_Complete;
00488 
00489     bool is_partial = feat.CanGetPartial()  &&  feat.GetPartial();
00490     partial_loc  = SeqLocPartialCheck(feat.GetLocation(), m_Scope );
00491     if (feat.CanGetProduct ()) {
00492         partial_prod = SeqLocPartialCheck(feat.GetProduct (), m_Scope );
00493     }
00494     
00495     if ( (partial_loc  != eSeqlocPartial_Complete)  ||
00496          (partial_prod != eSeqlocPartial_Complete)  ||   
00497          is_partial ) {
00498 
00499         // a feature on a partial sequence should be partial -- it often isn't
00500         if ( !is_partial &&
00501             partial_loc != eSeqlocPartial_Complete  &&
00502             feat.GetLocation ().IsWhole() ) {
00503             PostErr(eDiag_Info, eErr_SEQ_FEAT_PartialProblem,
00504                 "On partial Bioseq, SeqFeat.partial should be TRUE", feat);
00505         }
00506         // a partial feature, with complete location, but partial product
00507         else if ( is_partial                        &&
00508             partial_loc == eSeqlocPartial_Complete  &&
00509             feat.CanGetProduct ()                   &&
00510             feat.GetProduct ().IsWhole()            &&
00511             partial_prod != eSeqlocPartial_Complete ) {
00512             PostErr(eDiag_Warning, eErr_SEQ_FEAT_PartialProblem,
00513                 "When SeqFeat.product is a partial Bioseq, SeqFeat.location "
00514                 "should also be partial", feat);
00515         }
00516         // gene on segmented set is now 'order', should also be partial
00517         else if ( feat.GetData ().IsGene ()  &&
00518             !is_partial                      &&
00519             partial_loc == eSeqlocPartial_Internal ) {
00520             PostErr(eDiag_Warning, eErr_SEQ_FEAT_PartialProblem,
00521                 "Gene of 'order' with otherwise complete location should "
00522                 "have partial flag set", feat);
00523         }
00524         // inconsistent combination of partial/complete product,location,partial flag - part 1
00525         else if (partial_prod == eSeqlocPartial_Complete  &&  feat.CanGetProduct()) {
00526             // if not local bioseq product, lower severity
00527             EDiagSev sev = eDiag_Warning;
00528             if ( IsOneBioseq(feat.GetProduct(), m_Scope) ) {
00529                 const CSeq_id& prod_id = GetId(feat.GetProduct(), m_Scope);
00530                 CBioseq_Handle prod =
00531                     m_Scope->GetBioseqHandleFromTSE(prod_id, m_Imp.GetTSE());
00532                 if ( !prod ) {
00533                     sev = eDiag_Info;
00534                 }
00535             }
00536                         
00537             string str("Inconsistent: Product= complete, Location= ");
00538             str += (partial_loc != eSeqlocPartial_Complete) ? "partial, " : "complete, ";
00539             str += "Feature.partial= ";
00540             str += is_partial ? "TRUE" : "FALSE";
00541             PostErr(sev, eErr_SEQ_FEAT_PartialsInconsistent, str, feat);
00542         }
00543         // inconsistent combination of partial/complete product,location,partial flag - part 2
00544         else if ( partial_loc == eSeqlocPartial_Complete  ||  !is_partial ) {
00545             string str("Inconsistent: ");
00546             if ( feat.CanGetProduct() ) {
00547                 str += "Product= ";
00548                 str += (partial_prod != eSeqlocPartial_Complete) ? "partial, " : "complete, ";
00549             }
00550             str += "Location= ";
00551             str += (partial_loc != eSeqlocPartial_Complete) ? "partial, " : "complete, ";
00552             str += "Feature.partial= ";
00553             str += is_partial ? "TRUE" : "FALSE";
00554             PostErr(eDiag_Warning, eErr_SEQ_FEAT_PartialsInconsistent, str, feat);
00555         }
00556         // 5' or 3' partial location giving unclassified partial product
00557         else if ( (((partial_loc & eSeqlocPartial_Start) != 0)  ||
00558                    ((partial_loc & eSeqlocPartial_Stop) != 0))  &&
00559                   ((partial_prod & eSeqlocPartial_Other) != 0)  &&
00560                   is_partial ) {
00561             PostErr(eDiag_Warning, eErr_SEQ_FEAT_PartialProblem,
00562                 "5' or 3' partial location should not have unclassified"
00563                 " partial in product molinfo descriptor", feat);
00564         }
00565         
00566         // may have other error bits set as well 
00567         unsigned int partials[2] = { partial_prod, partial_loc };
00568         for ( int i = 0; i < 2; ++i ) {
00569             unsigned int errtype = eSeqlocPartial_Nostart;
00570 
00571             for ( int j = 0; j < 4; ++j ) {
00572                 if (partials[i] & errtype) {
00573                     if ( i == 1  &&  j < 2  &&  IsCDDFeat(feat) ) {
00574                         // supress warning
00575                     } else if ( i == 1  &&  j < 2  &&
00576                         IsPartialAtSpliceSite(feat.GetLocation(), errtype) ) {
00577                         PostErr(eDiag_Info, eErr_SEQ_FEAT_PartialProblem,
00578                             parterr[i] + ": " + parterrs[j] + 
00579                             " (but is at consensus splice site)", feat);
00580                     } else if ( i == 1  &&  j < 2  &&
00581                         (feat.GetData().Which() == CSeqFeatData::e_Gene  ||
00582                         feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_mRNA) &&
00583                          IsSameAsCDS(feat) ) {
00584                         PostErr(eDiag_Info, eErr_SEQ_FEAT_PartialProblem,
00585                             parterr[i] + ": " + parterrs[j], feat);
00586                     } else if (feat.GetData().Which() == CSeqFeatData::e_Cdregion && j == 0) {
00587                         PostErr (eDiag_Warning, eErr_SEQ_FEAT_PartialProblem,
00588                             parterr[i] + 
00589                             ": 5' partial is not at start AND is not at consensus splice site",
00590                             feat); 
00591                     } else if (feat.GetData().Which() == CSeqFeatData::e_Cdregion && j == 1) {
00592                         PostErr (eDiag_Warning, eErr_SEQ_FEAT_PartialProblem,
00593                             parterr[i] + 
00594                             ": 3' partial is not at stop AND is not at consensus splice site",
00595                             feat);
00596                     } else {
00597                         PostErr (eDiag_Warning, eErr_SEQ_FEAT_PartialProblem,
00598                             parterr[i] + ": " + parterrs[j], feat);
00599                     }
00600                 }
00601                 errtype <<= 1;
00602             }
00603         }
00604     }
00605 }
00606 
00607 
00608 static int s_GetStrictGenCode(const CBioSource& src)
00609 {
00610     int gencode = 0;
00611 
00612     try {
00613       CBioSource::TGenome genome = src.IsSetGenome() ? src.GetGenome() : CBioSource::eGenome_unknown;
00614 
00615         if ( src.IsSetOrg()  &&  src.GetOrg().IsSetOrgname() ) {
00616             const COrgName& orn = src.GetOrg().GetOrgname();
00617 
00618             switch ( genome ) {
00619                 case CBioSource::eGenome_kinetoplast:
00620                 case CBioSource::eGenome_mitochondrion:
00621                     // bacteria and plant organelle code
00622                     gencode = orn.GetMgcode();
00623                     break;
00624                 case CBioSource::eGenome_chloroplast:
00625                 case CBioSource::eGenome_chromoplast:
00626                 case CBioSource::eGenome_plastid:
00627                 case CBioSource::eGenome_cyanelle:
00628                 case CBioSource::eGenome_apicoplast:
00629                 case CBioSource::eGenome_leucoplast:
00630                 case CBioSource::eGenome_proplastid:
00631                     // bacteria and plant plastids are code 11.
00632                     gencode = 11;
00633                     break;
00634                 default:
00635                     gencode = orn.GetGcode();
00636                     break;
00637             }
00638         }
00639     } catch (...) {
00640     }
00641     return gencode;
00642 }
00643 
00644 
00645 void CValidError_feat::ValidateGene(const CGene_ref& gene, const CSeq_feat& feat)
00646 {
00647     ++m_NumGenes;
00648 
00649     if ( (! gene.IsSetLocus()      ||  gene.GetLocus().empty())   &&
00650          (! gene.IsSetAllele()     ||  gene.GetAllele().empty())  &&
00651          (! gene.IsSetDesc()       ||  gene.GetDesc().empty())    &&
00652          (! gene.IsSetMaploc()     ||  gene.GetMaploc().empty())  &&
00653          (! gene.IsSetDb()         ||  gene.GetDb().empty())      &&
00654          (! gene.IsSetSyn()        ||  gene.GetSyn().empty())     &&
00655          (! gene.IsSetLocus_tag()  ||  gene.GetLocus_tag().empty()) ) {
00656         PostErr(eDiag_Warning, eErr_SEQ_FEAT_GeneRefHasNoData,
00657                 "There is a gene feature where all fields are empty", feat);
00658     }
00659     if ( gene.CanGetLocus()  &&  !gene.GetLocus().empty() ) {
00660         ITERATE (string, it, gene.GetLocus() ) {
00661             if ( isspace((unsigned char)(*it)) != 0 ) {
00662                 PostErr(eDiag_Warning, eErr_SEQ_FEAT_LocusTagProblem,
00663                     "Gene locus_tag '" + gene.GetLocus() + 
00664                     "' should be a single word without any spaces", feat);
00665                 break;
00666             }
00667         }         
00668 
00669     }
00670     if ( gene.CanGetDb () ) {
00671         m_Imp.ValidateDbxref(gene.GetDb(), feat);
00672     }
00673 }
00674 
00675 
00676 void CValidError_feat::ValidateCdregion (
00677     const CCdregion& cdregion, 
00678     const CSeq_feat& feat
00679 ) 
00680 {
00681     bool pseudo = (feat.CanGetPseudo()  &&  feat.GetPseudo())  ||
00682         IsOverlappingGenePseudo(feat);
00683     
00684     bool transl_except = false;
00685 
00686     FOR_EACH_GBQUAL_ON_FEATURE (it, feat) {
00687         const CGb_qual& qual = **it;
00688         if ( qual.CanGetQual() ) {
00689             const string& key = qual.GetQual();
00690             if ( NStr::EqualNocase(key, "pseudo") ) {
00691                 pseudo = true;
00692             } else if ( NStr::EqualNocase(key, "exception") ) {
00693                 if ( !feat.IsSetExcept() ) {
00694                     PostErr(eDiag_Warning, eErr_SEQ_FEAT_ExceptInconsistent,
00695                         "Exception flag should be set in coding region", feat);
00696                 }
00697             } else if ( NStr::EqualNocase(key, "transl_except") ) {
00698                 transl_except = true;
00699             } else if ( NStr::EqualNocase(key, "codon") ) {
00700                 PostErr(eDiag_Warning, eErr_SEQ_FEAT_WrongQualOnImpFeat,
00701                     "Use the proper genetic code, if available, "
00702                     "or set transl_excepts on specific codons", feat);
00703             } else if ( NStr::EqualNocase(key, "protein_id") ) {
00704                 PostErr(eDiag_Warning, eErr_SEQ_FEAT_WrongQualOnImpFeat,
00705                     "protein_id should not be a gbqual on a CDS feature", feat);
00706             } else if ( NStr::EqualNocase(key, "transcript_id") ) {
00707                 PostErr(eDiag_Warning, eErr_SEQ_FEAT_WrongQualOnImpFeat,
00708                     "transcript_id should not be a gbqual on a CDS feature", feat);
00709             }
00710         }
00711     }
00712     if ( transl_except  &&  feat.CanGetExcept_text()  &&
00713          NStr::FindNoCase(feat.GetExcept_text(), "RNA editing") != NPOS ) {
00714         PostErr(eDiag_Warning, eErr_SEQ_FEAT_TranslExceptAndRnaEditing,
00715             "CDS has both RNA editing /exception and /transl_except qualifiers", feat);
00716     }
00717     
00718     bool conflict = cdregion.CanGetConflict()  &&  cdregion.GetConflict();
00719     if ( !pseudo  &&  !conflict ) {
00720         ValidateCdTrans(feat);
00721         ValidateSplice(feat, false);
00722     } else if ( conflict ) {
00723         ValidateCdConflict(cdregion, feat);
00724     }
00725     ValidateCdsProductId(feat);
00726 
00727     x_ValidateCdregionCodebreak(cdregion, feat);
00728 
00729     if ( cdregion.IsSetOrf()  &&  cdregion.GetOrf ()  &&
00730          feat.IsSetProduct () ) {
00731         PostErr (eDiag_Warning, eErr_SEQ_FEAT_OrfCdsHasProduct,
00732             "An ORF coding region should not have a product", feat);
00733     }
00734 
00735     if ( pseudo && feat.CanGetProduct () ) {
00736         PostErr(eDiag_Warning, eErr_SEQ_FEAT_PseudoCdsHasProduct,
00737             "A pseudo coding region should not have a product", feat);
00738     }
00739     
00740     CBioseq_Handle bsh = m_Scope->GetBioseqHandle(feat.GetLocation());
00741     if ( bsh ) {
00742         // debug
00743         string label;
00744         (*(bsh.GetCompleteBioseq())).GetLabel(&label, CBioseq::eBoth);
00745 
00746         CSeqdesc_CI diter (bsh, CSeqdesc::e_Source);
00747         if ( diter ) {
00748             const CBioSource& src = diter->GetSource();
00749             int biopgencode = s_GetStrictGenCode(src);
00750             
00751             if ( cdregion.CanGetCode() ) {
00752                 int cdsgencode = cdregion.GetCode().GetId();
00753                 if ( biopgencode != cdsgencode ) {
00754                     int genome = 0;
00755                     
00756                     if ( src.CanGetGenome() ) {
00757                         genome = src.GetGenome();
00758                     }
00759                     
00760                     if ( IsPlastid(genome) ) {
00761                         PostErr(eDiag_Warning, eErr_SEQ_FEAT_GenCodeMismatch,
00762                             "Genetic code conflict between CDS (code " +
00763                             NStr::IntToString (cdsgencode) +
00764                             ") and BioSource.genome biological context (" +
00765                             s_PlastidTxt[genome] + ") (uses code 11)", feat);
00766                     } else {
00767                         PostErr (eDiag_Warning, eErr_SEQ_FEAT_GenCodeMismatch,
00768                             "Genetic code conflict between CDS (code " +
00769                             NStr::IntToString(cdsgencode) +
00770                             ") and BioSource (code " +
00771                             NStr::IntToString(biopgencode) + ")", feat);
00772                     }
00773                 }
00774             }
00775         }
00776     }
00777 
00778     ValidateBadGeneOverlap(feat);
00779     ValidateBadMRNAOverlap(feat);
00780     ValidateCommonCDSProduct(feat);
00781     ValidateCDSPartial(feat);
00782 }
00783 
00784 
00785 void CValidError_feat::x_ValidateCdregionCodebreak
00786 (const CCdregion& cds,
00787  const CSeq_feat& feat)
00788 {
00789     const CSeq_loc& feat_loc = feat.GetLocation();
00790     const CCode_break* prev_cbr = 0;
00791 
00792     FOR_EACH_CODEBREAK_ON_CDREGION (it, cds) {
00793         const CCode_break& cbr = **it;
00794         const CSeq_loc& cbr_loc = cbr.GetLoc();
00795         ECompare comp = Compare(cbr_loc, feat_loc, m_Scope);
00796         if ( (comp != eContained) && (comp != eSame)) {
00797             PostErr (eDiag_Error, eErr_SEQ_FEAT_Range, 
00798                 "Code-break location not in coding region", feat);
00799         }
00800         if ( prev_cbr != 0 ) {
00801             if ( Compare(cbr_loc, prev_cbr->GetLoc(), m_Scope) == eSame ) {
00802                 string msg = "Multiple code-breaks at same location";
00803                 string str;
00804                 cbr_loc.GetLabel(&str);
00805                 if ( !str.empty() ) {
00806                     msg += "[" + str + "]";
00807                 }
00808                 PostErr(eDiag_Error, eErr_SEQ_FEAT_DuplicateTranslExcept,
00809                    msg, feat);
00810             }
00811         }
00812         prev_cbr = &cbr;
00813     }
00814 }
00815 
00816 
00817 // non-pseudo CDS must have product
00818 void CValidError_feat::ValidateCdsProductId(const CSeq_feat& feat)
00819 {
00820     // bail if product exists
00821     if ( feat.CanGetProduct() ) {
00822         return;
00823     }
00824     // bail if pseudo
00825     bool pseudo = (feat.CanGetPseudo()  &&  feat.GetPseudo())  ||
00826         IsOverlappingGenePseudo(feat);
00827     if ( pseudo ) {
00828         return;
00829     }
00830     // bail if location has just stop
00831     if ( feat.CanGetLocation() ) {
00832         const CSeq_loc& loc = feat.GetLocation();
00833         if ( loc.IsPartialStart(eExtreme_Biological)  &&  !loc.IsPartialStop(eExtreme_Biological) ) {
00834             if ( GetLength(loc, m_Scope) <= 5 ) {
00835                 return;
00836             }
00837         }
00838     }
00839     // supress in case of the appropriate exception
00840     if ( feat.CanGetExcept()  &&  feat.CanGetExcept_text()  &&
00841          !IsBlankString(feat.GetExcept_text()) ) {
00842         if ( NStr::Find(feat.GetExcept_text(),
00843             "rearrangement required for product") != NPOS ) {
00844            return;
00845         }
00846     }
00847     
00848     // non-pseudo CDS must have /product
00849     PostErr(eDiag_Warning, eErr_SEQ_FEAT_MissingCDSproduct,
00850         "Expected CDS product absent", feat);
00851 }
00852 
00853 
00854 void CValidError_feat::ValidateCdConflict
00855 (const CCdregion& cdregion,
00856  const CSeq_feat& feat)
00857 {
00858     CBioseq_Handle nuc  = m_Scope->GetBioseqHandle(feat.GetLocation());
00859     CBioseq_Handle prot = m_Scope->GetBioseqHandle(feat.GetProduct());
00860     
00861     // translate the coding region
00862     string transl_prot;
00863     try {
00864         CCdregion_translate::TranslateCdregion(
00865             transl_prot, 
00866             nuc, 
00867             feat.GetLocation(), 
00868             cdregion,
00869             false,   // do not include stop codons
00870             false);  // do not remove trailing X/B/Z
00871     } catch ( const runtime_error& ) {
00872     }
00873 
00874     CSeqVector prot_vec = prot.GetSeqVector(CBioseq_Handle::eCoding_Iupac);
00875     prot_vec.SetCoding(CSeq_data::e_Ncbieaa);
00876 
00877     string prot_seq;
00878     prot_vec.GetSeqData(0, prot_vec.size(), prot_seq);
00879 
00880     if ( transl_prot.empty()  ||  prot_seq.empty()  ||  transl_prot == prot_seq ) {
00881         PostErr(eDiag_Error, eErr_SEQ_FEAT_BadConflictFlag,
00882             "Coding region conflict flag should not be set", feat);
00883     } else {
00884         PostErr(eDiag_Warning, eErr_SEQ_FEAT_ConflictFlagSet,
00885             "Coding region conflict flag is set", feat);
00886     }
00887 }
00888 
00889 
00890 void CValidError_feat::ValidateSplice(const CSeq_feat& feat, bool check_all)
00891 {
00892     // !!! suppress if NCBISubValidate
00893     //if (GetAppProperty ("NcbiSubutilValidation") != NULL)
00894     //    return;
00895 
00896     bool report_errors = true, has_errors = false;
00897 
00898     // specific biological exceptions suppress check
00899     if ( feat.CanGetExcept()  &&  feat.GetExcept()  &&
00900          feat.CanGetExcept_text() ) {
00901         if ( SuppressCheck(feat.GetExcept_text()) ) {
00902             report_errors = false;
00903         }
00904     }
00905         
00906     size_t num_of_parts = 0;
00907     ENa_strand  strand = eNa_strand_unknown;
00908 
00909     // !!! The C version treated seq_loc equiv as one whereas the iterator
00910     // treats it as many. 
00911     const CSeq_loc& location = feat.GetLocation ();
00912 
00913     for (CSeq_loc_CI citer(location); citer; ++citer) {
00914         ++num_of_parts;
00915         if ( num_of_parts == 1 ) {  // first part
00916             strand = citer.GetStrand();
00917         } else {
00918             if ( strand != citer.GetStrand() ) {
00919                 return;         //bail on mixed strand
00920             }
00921         }
00922     }
00923 
00924     if ( num_of_parts == 0 ) {
00925         return;
00926     }
00927     if ( !check_all  &&  num_of_parts == 1 ) {
00928         return;
00929     }
00930     
00931     bool partial_first = location.IsPartialStart(eExtreme_Biological);
00932     bool partial_last = location.IsPartialStop(eExtreme_Biological);
00933 
00934 
00935     size_t counter = 0;
00936     const CSeq_id* last_id = 0;
00937     CBioseq_Handle bsh;
00938     size_t seq_len = 0;
00939     CSeqVector seq_vec;
00940     string label;
00941 
00942     for (CSeq_loc_CI citer(location); citer; ++citer) {
00943         ++counter;
00944 
00945         const CSeq_id& seq_id = citer.GetSeq_id();
00946         if ( last_id == 0  ||  !last_id->Match(seq_id) ) {
00947             bsh = m_Scope->GetBioseqHandle(seq_id);
00948             if ( !bsh ) {
00949                 break;
00950             }
00951             seq_vec = 
00952                 bsh.GetSeqVector(CBioseq_Handle::eCoding_Iupac,
00953                                  citer.GetStrand());
00954             seq_len = seq_vec.size();
00955 
00956             // get the label. if m_SuppressContext flag in true, 
00957             // get the worst label.
00958             const CBioseq& bsq = *bsh.GetCompleteBioseq();
00959             bsq.GetLabel(&label, CBioseq::eContent, m_Imp.IsSuppressContext());
00960 
00961             last_id = &seq_id;
00962         }
00963 
00964         TSeqPos acceptor = citer.GetRange().GetFrom();
00965         TSeqPos donor = citer.GetRange().GetTo();
00966 
00967         TSeqPos start = acceptor;
00968         TSeqPos stop = donor;
00969 
00970         if ( citer.GetStrand() == eNa_strand_minus ) {
00971             swap(acceptor, donor);
00972             stop = seq_len - donor - 1;
00973             start = seq_len - acceptor - 1;
00974         }
00975 
00976         // set severity level
00977         // genomic product set or NT_ contig always relaxes to SEV_WARNING
00978         EDiagSev sev = eDiag_Warning;
00979         if ( m_Imp.IsSpliceErr()                   &&
00980              !(m_Imp.IsGPS() || m_Imp.IsRefSeq())  &&
00981              !check_all ) {
00982             sev = eDiag_Error;
00983         }
00984 
00985         // check donor on all but last exon and on sequence
00986         if ( ((check_all && !partial_last)  ||  counter < num_of_parts)  &&
00987              (stop < seq_len - 2) ) {
00988             try {
00989                 CSeqVector::TResidue res1 = seq_vec[stop + 1];    
00990                 CSeqVector::TResidue res2 = seq_vec[stop + 2];
00991 
00992                 if ( IsResidue(res1)  &&  IsResidue(res2) ) {
00993                     if ( (res1 != 'G')  || (res2 != 'T' ) ) {
00994                         has_errors = true;
00995                         string msg;
00996                         if ( (res1 == 'G')  && (res2 == 'C' ) ) { // GC minor splice site
00997                             sev = eDiag_Info;
00998                             msg = "Rare splice donor consensus (GC) found instead of "
00999                                 "(GT) after exon ending at position " +
01000                                 NStr::IntToString(donor + 1) + " of " + label;
01001                         } else {
01002                             msg = "Splice donor consensus (GT) not found after exon"
01003                                 " ending at position " + 
01004                                 NStr::IntToString(donor + 1) + " of " + label;
01005                         }
01006                         if (report_errors) {
01007                             PostErr(sev, eErr_SEQ_FEAT_NotSpliceConsensus, msg, feat);
01008                         }
01009                     }
01010                 }
01011             } catch ( exception& ) {
01012                 break;
01013             }
01014         }
01015 
01016         if ( ((check_all && !partial_first)  ||  counter != 1)  &&
01017              (start > 1) ) {
01018             try {
01019                 CSeqVector::TResidue res1 = seq_vec[start - 2];
01020                 CSeqVector::TResidue res2 = seq_vec[start - 1];
01021                 
01022                 if ( IsResidue(res1)  &&  IsResidue(res2) ) {
01023                     if ( (res1 != 'A')  ||  (res2 != 'G') ) {
01024                         has_errors = true;
01025                         if (report_errors) {
01026                             PostErr(sev, eErr_SEQ_FEAT_NotSpliceConsensus,
01027                                 "Splice acceptor consensus (AG) not found before "
01028                                 "exon starting at position " + 
01029                                 NStr::IntToString(acceptor + 1) + " of " + label, feat);
01030                         }
01031                     }
01032                 }
01033             } catch ( exception& ) {
01034             }
01035         }
01036     } // end of for loop
01037 
01038     if (!report_errors  &&  !has_errors) {
01039         PostErr(eDiag_Warning, eErr_SEQ_FEAT_UnnecessaryException,
01040             "feature has exception but passes splice site test", feat);
01041     }
01042 }
01043 
01044 
01045 void CValidError_feat::ValidateProt(const CProt_ref& prot, const CSerialObject& obj) 
01046 {
01047     CProt_ref::EProcessed processed = CProt_ref::eProcessed_not_set;
01048 
01049     if ( prot.CanGetProcessed() ) {
01050         processed = prot.GetProcessed();
01051     }
01052 
01053     bool empty = true;
01054     if ( processed != CProt_ref::eProcessed_signal_peptide  &&
01055          processed != CProt_ref::eProcessed_transit_peptide ) {
01056         if ( prot.IsSetName()  &&
01057             (!prot.GetName().empty()  ||  !prot.GetName().front().empty()) ) {
01058             empty = false;
01059         }
01060         if ( prot.CanGetDesc()  &&  !prot.GetDesc().empty() ) {
01061             empty = false;
01062         }
01063         if ( prot.CanGetEc()  &&  !prot.GetEc().empty() ) {
01064             empty = false;
01065         }
01066         if ( prot.CanGetActivity()  &&  !prot.GetActivity().empty() ) {
01067             empty = false;
01068         }
01069         if ( prot.CanGetDb()  &&  !prot.GetDb().empty() ) {
01070             empty = false;
01071         }
01072 
01073         if ( empty ) {
01074             PostErr(eDiag_Warning, eErr_SEQ_FEAT_ProtRefHasNoData,
01075                 "There is a protein feature where all fields are empty", obj);
01076         }
01077     }
01078     if ( prot.CanGetDb () ) {
01079         m_Imp.ValidateDbxref(prot.GetDb(), obj);
01080     }
01081     if ( prot.CanGetDesc()  &&  !prot.GetDesc().empty()  &&
01082          prot.GetName().empty() ) {
01083         PostErr(eDiag_Warning, eErr_SEQ_FEAT_NoNameForProtein,
01084             "Protein feature has description but no name", obj);
01085     }
01086 }
01087 
01088 
01089 void CValidError_feat::ValidateRna(const CRNA_ref& rna, const CSeq_feat& feat) 
01090 {
01091     const CRNA_ref::EType& rna_type = rna.GetType ();
01092 
01093     if ( rna_type == CRNA_ref::eType_mRNA ) {
01094         bool pseudo = (feat.CanGetPseudo()  &&  feat.GetPseudo())  ||  IsOverlappingGenePseudo(feat);
01095         
01096         if ( !pseudo ) {
01097             ValidateMrnaTrans(feat);      /* transcription check */
01098             ValidateSplice(feat, false);
01099         }
01100         ValidateBadGeneOverlap(feat);
01101         ValidateCommonMRNAProduct(feat);
01102     }
01103 
01104     if ( rna.CanGetExt()  &&
01105          rna.GetExt().Which() == CRNA_ref::C_Ext::e_TRNA ) {
01106         const CTrna_ext& trna = rna.GetExt ().GetTRNA ();
01107         if ( trna.CanGetAnticodon () ) {
01108             const CSeq_loc& anticodon = trna.GetAnticodon();
01109             size_t anticodon_len = 0;
01110             bool bad_anticodon = false;
01111             for ( CSeq_loc_CI it(anticodon); it; ++it ) {
01112                 anticodon_len += GetLength(anticodon, m_Scope);
01113                 ECompare comp = sequence::Compare(anticodon,
01114                                                   feat.GetLocation(),
01115                                                   m_Scope);
01116                 if ( comp != eContained  &&  comp != eSame ) {
01117                     bad_anticodon = true;
01118                 }
01119             }
01120             if ( bad_anticodon ) {
01121                 PostErr (eDiag_Error, eErr_SEQ_FEAT_Range,
01122                     "Anticodon location not in tRNA", feat);
01123             }
01124             if ( anticodon_len != 3 ) {
01125                 PostErr (eDiag_Warning, eErr_SEQ_FEAT_Range,
01126                     "Anticodon is not 3 bases in length", feat);
01127             }
01128             ValidateAnticodon(anticodon, feat);
01129         }
01130         ValidateTrnaCodons(trna, feat);
01131     }
01132 
01133     if ( rna_type == CRNA_ref::eType_tRNA ) {
01134         FOR_EACH_GBQUAL_ON_FEATURE (gbqual, feat) {
01135             if ( NStr::CompareNocase((**gbqual).GetVal (), "anticodon") == 0 ) {
01136                 PostErr (eDiag_Error, eErr_SEQ_FEAT_InvalidQualifierValue,
01137                     "Unparsed anticodon qualifier in tRNA", feat);
01138                 break;
01139             }
01140         }
01141         /* tRNA with string extension */
01142         if ( rna.CanGetExt()  &&  
01143              rna.GetExt().Which () == CRNA_ref::C_Ext::e_Name ) {
01144             PostErr (eDiag_Error, eErr_SEQ_FEAT_InvalidQualifierValue,
01145                 "Unparsed product qualifier in tRNA", feat);
01146         }
01147     }
01148 
01149     if ( rna_type == CRNA_ref::eType_unknown ) {
01150         PostErr(eDiag_Warning, eErr_SEQ_FEAT_RNAtype0,
01151             "RNA type 0 (unknown) not supported", feat);
01152     }
01153 
01154     if ( feat.CanGetProduct() ) {
01155         ValidateRnaProductType(rna, feat);
01156     }
01157 }
01158 
01159 
01160 void CValidError_feat::ValidateAnticodon(const CSeq_loc& anticodon, const CSeq_feat& feat)
01161 {
01162     bool ordered = true;
01163     bool adjacent = false;
01164     bool unmarked_strand = false;
01165     bool mixed_strand = false;
01166 
01167     CSeq_loc_CI prev;
01168     for (CSeq_loc_CI curr(anticodon); curr; ++curr) {
01169         if ( !curr.GetSeq_loc().IsInt()  &&  !curr.GetSeq_loc().IsPnt() ) {
01170             continue;
01171         }
01172         
01173         if ( prev  &&  curr  &&
01174              IsSameBioseq(curr.GetSeq_id(), prev.GetSeq_id(), m_Scope) ) {
01175             CSeq_loc_CI::TRange prev_range = prev.GetRange();
01176             CSeq_loc_CI::TRange curr_range = curr.GetRange();
01177             if ( ordered ) {
01178                 if ( curr.GetStrand() == eNa_strand_minus ) {
01179                     if (prev_range.GetTo() < curr_range.GetTo()) {
01180                         ordered = false;
01181                     }
01182                     if (curr_range.GetTo() + 1 == prev_range.GetFrom()) {
01183                         adjacent = true;
01184                     }
01185                 } else {
01186                     if (prev_range.GetTo() > curr_range.GetTo()) {
01187                         ordered = false;
01188                     }
01189                     if (prev_range.GetTo() + 1 == curr_range.GetFrom()) {
01190                         adjacent = true;
01191                     }
01192                 }
01193             }
01194             ENa_strand curr_strand = curr.GetStrand();
01195             ENa_strand prev_strand = prev.GetStrand();
01196             if ( curr_range == prev_range  &&  curr_strand == prev_strand ) {
01197                 PostErr(eDiag_Warning, eErr_SEQ_FEAT_DuplicateInterval,
01198                     "Duplicate anticodon exons in location", feat);
01199             }
01200             if ( curr_strand != prev_strand ) {
01201                 if (curr_strand == eNa_strand_plus  &&  prev_strand == eNa_strand_unknown) {
01202                     unmarked_strand = true;
01203                 } else if (curr_strand == eNa_strand_unknown  &&  prev_strand == eNa_strand_plus) {
01204                     unmarked_strand = true;
01205                 } else {
01206                     mixed_strand = true;
01207                 }
01208             }
01209         }
01210         prev = curr;
01211     }
01212     if (adjacent) {
01213         PostErr(eDiag_Warning, eErr_SEQ_FEAT_AbuttingIntervals,
01214             "Adjacent intervals in Anticodon", feat);
01215     }
01216 
01217     // trans splicing exception turns off both mixed_strand and out_of_order messages
01218     bool trans_splice = false;
01219     if (feat.CanGetExcept()  &&  feat.GetExcept()  && feat.CanGetExcept_text()) {
01220         if (NStr::FindNoCase(feat.GetExcept_text(), "trans-splicing") != NPOS) {
01221             trans_splice = true;
01222         }
01223     }
01224     if (!trans_splice) {
01225         if (mixed_strand) {
01226             PostErr(eDiag_Warning, eErr_SEQ_FEAT_MixedStrand,
01227                 "Mixed strands in Anticodon", feat);
01228         }
01229         if (unmarked_strand) {
01230             PostErr(eDiag_Warning, eErr_SEQ_FEAT_MixedStrand,
01231                 "Mixed plus and unknown strands in Anticodon", feat);
01232         }
01233         if (!ordered) {
01234             PostErr(eDiag_Warning, eErr_SEQ_FEAT_SeqLocOrder,
01235                 "Intervals out of order in Anticodon", feat);
01236         }
01237     }
01238 }
01239 
01240 
01241 void CValidError_feat::ValidateRnaProductType
01242 (const CRNA_ref& rna,
01243  const CSeq_feat& feat)
01244 {
01245     CBioseq_Handle bsh = m_Scope->GetBioseqHandle(feat.GetProduct());
01246     if ( !bsh ) {
01247         return;
01248     }
01249     CSeqdesc_CI di(bsh, CSeqdesc::e_Molinfo);
01250     if ( !di ) {
01251         return;
01252     }
01253     const CMolInfo& mol_info = di->GetMolinfo();
01254     if ( !mol_info.CanGetBiomol() ) {
01255         return;
01256     }
01257     int biomol = mol_info.GetBiomol();
01258     
01259     switch ( rna.GetType() ) {
01260 
01261     case CRNA_ref::eType_mRNA:
01262         if ( biomol == CMolInfo::eBiomol_mRNA ) {
01263             return;
01264         }        
01265         break;
01266 
01267     case CRNA_ref::eType_tRNA:
01268         if ( biomol == CMolInfo::eBiomol_tRNA ) {
01269             return;
01270         }
01271         break;
01272 
01273     case CRNA_ref::eType_rRNA:
01274         if ( biomol == CMolInfo::eBiomol_rRNA ) {
01275             return;
01276         }
01277         break;
01278 
01279     default:
01280         return;
01281     }
01282 
01283     PostErr(eDiag_Error, eErr_SEQ_FEAT_RnaProductMismatch,
01284         "Type of RNA does not match MolInfo of product Bioseq", feat);
01285 }
01286 
01287 
01288 int s_LegalNcbieaaValues[] = { 42, 65, 66, 67, 68, 69, 70, 71, 72, 73,
01289                                75, 76, 77, 78, 80, 81, 82, 83, 84, 85,
01290                                86, 87, 88, 89, 90 };
01291 
01292 // in Ncbistdaa order
01293 static const char* kAANames[] = {
01294     "---", "Ala", "Asx", "Cys", "Asp", "Glu", "Phe", "Gly", "His", "Ile",
01295     "Lys", "Leu", "Met", "Asn", "Pro", "Gln", "Arg", "Ser", "Thr", "Val",
01296     "Trp", "OTHER", "Tyr", "Glx", "Sec", "TERM"
01297 };
01298 
01299 
01300 const char* GetAAName(unsigned char aa, bool is_ascii)
01301 {
01302     try {
01303         if (is_ascii) {
01304             aa = CSeqportUtil::GetMapToIndex
01305                 (CSeq_data::e_Ncbieaa, CSeq_data::e_Ncbistdaa, aa);
01306         }
01307         return (aa < sizeof(kAANames)/sizeof(*kAANames)) ? kAANames[aa] : "OTHER";
01308     } catch (...) {
01309         return "OTHER";
01310     }
01311 }
01312 
01313 
01314 static string GetGeneticCodeName (int gcode)
01315 {
01316     const CGenetic_code_table& code_table = CGen_code_table::GetCodeTable();
01317     const list<CRef<CGenetic_code> >& codes = code_table.Get();
01318 
01319     for ( list<CRef<CGenetic_code> >::const_iterator code_it = codes.begin(), code_it_end = codes.end();  code_it != code_it_end;  ++code_it ) {
01320         if ((*code_it)->GetId() == gcode) {
01321             return (*code_it)->GetName();
01322         }
01323     }
01324     return "unknown";
01325 }
01326 
01327 void CValidError_feat::ValidateTrnaCodons(const CTrna_ext& trna, const CSeq_feat& feat)
01328 {
01329     if (!trna.IsSetAa()) {
01330         return;
01331     }
01332 
01333     vector<char> seqData;
01334     string str = "";
01335     
01336     switch (trna.GetAa().Which()) {
01337         case CTrna_ext::C_Aa::e_Iupacaa:
01338             str = trna.GetAa().GetIupacaa();
01339             CSeqConvert::Convert(str, CSeqUtil::e_Iupacaa, 0, str.size(), seqData, CSeqUtil::e_Ncbieaa);
01340             break;
01341         case CTrna_ext::C_Aa::e_Ncbi8aa:
01342             str = trna.GetAa().GetNcbi8aa();
01343             CSeqConvert::Convert(str, CSeqUtil::e_Ncbi8aa, 0, str.size(), seqData, CSeqUtil::e_Ncbieaa);
01344             break;
01345         case CTrna_ext::C_Aa::e_Ncbistdaa:
01346             str = trna.GetAa().GetNcbi8aa();
01347             CSeqConvert::Convert(str, CSeqUtil::e_Ncbistdaa, 0, str.size(), seqData, CSeqUtil::e_Ncbieaa);
01348             break;
01349         case CTrna_ext::C_Aa::e_Ncbieaa:
01350             seqData.push_back(trna.GetAa().GetNcbieaa());
01351             break;
01352         default:
01353             NCBI_THROW (CCoreException, eCore, "Unrecognized tRNA aa coding");
01354             break;
01355     }
01356     unsigned char aa = seqData[0];
01357 
01358     // make sure the amino acid is valid
01359     bool found = false;
01360     for ( int i = 0; i < 25; ++i ) {
01361         if ( aa == s_LegalNcbieaaValues[i] ) {
01362             found = true;
01363             break;
01364         }
01365     }
01366     if ( !found ) {
01367         PostErr(eDiag_Error, eErr_SEQ_FEAT_BadTrnaAA, 
01368             "Invalid tRNA amino acid (" + NStr::IntToString(aa) + ")", feat);
01369         aa = ' ';
01370     }
01371 
01372     // Retrive the Genetic code id for the tRNA
01373     int gcode = 1;
01374     CBioseq_Handle bsh = m_Scope->GetBioseqHandle(feat.GetLocation());
01375     if ( bsh ) {
01376         // need only the closest biosoure.
01377         CSeqdesc_CI diter(bsh, CSeqdesc::e_Source);
01378         if ( diter ) {
01379             gcode = diter->GetSource().GetGenCode();
01380         }
01381     }
01382     
01383     const string& ncbieaa = CGen_code_table::GetNcbieaa(gcode);
01384     if ( ncbieaa.length() != 64 ) {
01385         return;
01386     }
01387 
01388     string codename = GetGeneticCodeName (gcode);
01389     char buf[2];
01390     buf[0] = aa;
01391     buf[1] = 0;
01392     string aaname = buf;
01393     aaname += "/";
01394     aaname += GetAAName (aa, true);
01395         
01396     EDiagSev sev = (aa == 'U') ? eDiag_Warning : eDiag_Error;
01397 
01398     bool modified_codon_recognition = false;
01399     if ( feat.CanGetExcept_text()  &&
01400          NStr::FindNoCase(feat.GetExcept_text(), "modified codon recognition") != NPOS ) {
01401         modified_codon_recognition = true;
01402     }
01403 
01404     ITERATE( CTrna_ext::TCodon, iter, trna.GetCodon() ) {
01405         // test that codon value is in range 0 - 63
01406         if ( *iter < 0  ||  *iter > 63 ) {
01407             PostErr(sev, eErr_SEQ_FEAT_TrnaCodonWrong,
01408                 "tRNA codon value (" + NStr::IntToString(*iter) + 
01409                 ") out of range", feat);
01410             continue;
01411         }
01412 
01413         if ( !modified_codon_recognition ) {
01414             unsigned char taa = ncbieaa[*iter];
01415             if ( taa != aa ) {
01416                 if ( (aa == 'U')  &&  (taa == '*')  &&  (*iter == 14) ) {
01417                     // selenocysteine normally uses TGA (14), so ignore without requiring exception in record
01418                     // TAG (11) is used for pyrrolysine in archaebacteria
01419                     // TAA (10) is not yet known to be used for an exceptional amino acid
01420                 } else {
01421                     string codon = CGen_code_table::IndexToCodon(*iter);
01422                     NStr::ReplaceInPlace (codon, "T", "U");
01423 
01424                     PostErr(sev, eErr_SEQ_FEAT_TrnaCodonWrong,
01425                       "tRNA codon (" + codon + ") does not match amino acid (" 
01426                        + aaname + ") specified by genetic code ("
01427                        + NStr::IntToString (gcode) + "/" + codename + ")", feat);
01428                 }
01429             }
01430         }
01431     }
01432 }
01433 
01434 
01435 void CValidError_feat::ValidateImp(const CImp_feat& imp, const CSeq_feat& feat, bool is_insd_in_sep)
01436 {
01437     CSeqFeatData::ESubtype subtype = feat.GetData().GetSubtype();
01438     const string& key = imp.GetKey();
01439 
01440     switch ( subtype ) {
01441     case CSeqFeatData::eSubtype_exon:
01442         if ( m_Imp.IsValidateExons() ) {
01443             ValidateSplice(feat, true);
01444         }
01445         break;
01446 
01447     case CSeqFeatData::eSubtype_bad:
01448         PostErr(eDiag_Warning, eErr_SEQ_FEAT_UnknownImpFeatKey, 
01449             "Unknown feature key " + key, feat);
01450         break;
01451 
01452     case CSeqFeatData::eSubtype_virion:
01453     case CSeqFeatData::eSubtype_mutation:
01454     case CSeqFeatData::eSubtype_allele:
01455         PostErr(eDiag_Error, eErr_SEQ_FEAT_UnknownImpFeatKey,
01456             "Feature key " + key + " is no longer legal", feat);
01457         break;
01458 
01459     case CSeqFeatData::eSubtype_polyA_site:
01460         {
01461             CSeq_loc::TRange range = feat.GetLocation().GetTotalRange();
01462             if ( range.GetFrom() != range.GetTo() ) {
01463                 PostErr(eDiag_Warning, eErr_SEQ_FEAT_PolyAsiteNotPoint,
01464                     "PolyA_site should be a single point", feat);
01465             }
01466         }
01467         break;
01468 
01469     case CSeqFeatData::eSubtype_mat_peptide:
01470     case CSeqFeatData::eSubtype_sig_peptide:
01471     case CSeqFeatData::eSubtype_transit_peptide:
01472         PostErr(eDiag_Warning, eErr_SEQ_FEAT_InvalidForType,
01473             "Peptide processing feature should be converted to the "
01474             "appropriate protein feature subtype", feat);
01475         ValidatePeptideOnCodonBoundry(feat, key);
01476         break;
01477         
01478     case CSeqFeatData::eSubtype_mRNA:
01479     case CSeqFeatData::eSubtype_tRNA:
01480     case CSeqFeatData::eSubtype_rRNA:
01481     case CSeqFeatData::eSubtype_snRNA:
01482     case CSeqFeatData::eSubtype_scRNA:
01483     case CSeqFeatData::eSubtype_snoRNA:
01484     case CSeqFeatData::eSubtype_misc_RNA:
01485     case CSeqFeatData::eSubtype_precursor_RNA:
01486     // !!! what about other RNA types (preRNA, otherRNA)?
01487         PostErr(eDiag_Error, eErr_SEQ_FEAT_InvalidForType,
01488               "RNA feature should be converted to the appropriate RNA feature "
01489               "subtype, location should be converted manually", feat);
01490         break;
01491 
01492     case CSeqFeatData::eSubtype_Imp_CDS:
01493         {
01494             // impfeat CDS must be pseudo; fail if not
01495             bool pseudo = (feat.CanGetPseudo()  &&  feat.GetPseudo())  ||
01496                           IsOverlappingGenePseudo(feat);
01497             if ( !pseudo ) {
01498                 PostErr(eDiag_Info, eErr_SEQ_FEAT_ImpCDSnotPseudo,
01499                     "ImpFeat CDS should be pseudo", feat);
01500             }
01501 
01502             FOR_EACH_GBQUAL_ON_FEATURE (gbqual, feat) {
01503                 if ( NStr::CompareNocase( (*gbqual)->GetQual(), "translation") == 0 ) {
01504                     PostErr(eDiag_Error, eErr_SEQ_FEAT_ImpCDShasTranslation,
01505                         "ImpFeat CDS with /translation found", feat);
01506                 }
01507             }
01508         }
01509         break;
01510     case CSeqFeatData::eSubtype_imp:
01511         PostErr(eDiag_Error, eErr_SEQ_FEAT_UnknownImpFeatKey,
01512             "Unknown feature key " + key, feat);
01513         break;
01514     default:
01515         break;
01516     }// end of switch statement  
01517     
01518     // validate the feature's location
01519     if ( imp.CanGetLoc() ) {
01520         const string& imp_loc = imp.GetLoc();
01521         if ( imp_loc.find("one-of") != string::npos ) {
01522             PostErr(eDiag_Error, eErr_SEQ_FEAT_ImpFeatBadLoc,
01523                 "ImpFeat loc " + imp_loc + 
01524                 " has obsolete 'one-of' text for feature " + key, feat);
01525         } else if ( feat.GetLocation().IsInt() ) {
01526             const CSeq_interval& seq_int = feat.GetLocation().GetInt();
01527             string temp_loc = NStr::IntToString(seq_int.GetFrom() + 1) +
01528                 ".." + NStr::IntToString(seq_int.GetTo() + 1);
01529             if ( imp_loc != temp_loc ) {
01530                 PostErr(eDiag_Error, eErr_SEQ_FEAT_ImpFeatBadLoc,
01531                     "ImpFeat loc " + imp_loc + " does not equal feature location " +
01532                     temp_loc + "for feature " + key, feat);
01533             }
01534         }
01535     }
01536 
01537     if ( feat.CanGetQual() ) {
01538         ValidateImpGbquals(imp, feat, is_insd_in_sep);
01539     }
01540     
01541     // Make sure a feature has its mandatory qualifiers
01542     bool found = false;
01543     switch (subtype) {
01544     case CSeqFeatData::eSubtype_conflict:
01545     case CSeqFeatData::eSubtype_old_sequence:
01546         if (!feat.IsSetCit()  &&  NStr::IsBlank(feat.GetNamedQual("compare"))) {
01547             PostErr(eDiag_Warning, eErr_SEQ_FEAT_MissingQualOnImpFeat,
01548                 "Feature " + key + " requires either /compare or /citation (or both)",
01549                 feat);
01550         }
01551         break;
01552     default:
01553         ITERATE (CFeatQualAssoc::TGBQualTypeVec, required, CFeatQualAssoc::GetMandatoryGbquals(subtype)) {
01554             found = false;
01555             FOR_EACH_GBQUAL_ON_FEATURE (qual, feat) {
01556                 if (CGbqualType::GetType(**qual) == *required) {
01557                     found = true;
01558                     break;
01559                 }
01560             }
01561             if (!found) {
01562                 PostErr(eDiag_Warning, eErr_SEQ_FEAT_MissingQualOnImpFeat,
01563                     "Missing qualifier " + CGbqualType::GetString(*required) +
01564                     " for feature " + key, feat);
01565             }
01566         }
01567         break;
01568     }
01569 }
01570 
01571 
01572 static bool s_RptUnitIsBaseRange (string str, int& from, int& to)
01573 
01574 {
01575     if (str.length() > 25) {
01576         return false;
01577     }
01578     SIZE_TYPE pos = NStr::Find (str, "..");
01579     if (pos == string::npos) {
01580         return false;
01581     }
01582     try {
01583         from = NStr::StringToInt (str.substr(0, pos));
01584         to = NStr::StringToInt (str.substr (pos + 2));
01585     } catch (...) {
01586         return false;
01587     }
01588     if (from < 0 || to < 0) {
01589         return false;
01590     }
01591     return true;
01592 }
01593 
01594 typedef enum {
01595   eAccessionFormat_valid = 0,
01596   eAccessionFormat_no_start_letters,
01597   eAccessionFormat_wrong_number_of_digits,
01598   eAccessionFormat_null,
01599   eAccessionFormat_too_long,
01600   eAccessionFormat_missing_version,
01601   eAccessionFormat_bad_version } EAccessionFormatError;
01602 
01603 static EAccessionFormatError s_ValidateAccessionString (string accession, bool require_version)
01604 {
01605     if (NStr::IsBlank (accession)) {
01606         return eAccessionFormat_null;
01607     } else if (accession.length() >= 16) {
01608         return eAccessionFormat_too_long;
01609     } else if (accession.length() < 3 
01610                || ! isalpha (accession.c_str()[0]) 
01611                || ! isupper (accession.c_str()[0])
01612                || ! isalpha (accession.c_str()[1])
01613                || ! isupper (accession.c_str()[1])) {
01614         return eAccessionFormat_no_start_letters;
01615     }
01616     
01617     string str = accession;
01618     if (NStr::StartsWith (str, "NZ_")) {
01619         str = str.substr(3);
01620     }
01621     
01622     const char *cp = str.c_str();
01623     int numAlpha = 0;
01624 
01625     while (isalpha (*cp)) {
01626         numAlpha++;
01627         cp++;
01628     }
01629 
01630     int numUndersc = 0;
01631 
01632     while (*cp == '_') {
01633         numUndersc++;
01634         cp++;
01635     }
01636 
01637     int numDigits = 0;
01638     while (isdigit (*cp)) {
01639         numDigits++;
01640         cp++;
01641     }
01642 
01643     if ((*cp != '\0' && *cp != ' ' && *cp != '.') || numUndersc > 1) {
01644         return eAccessionFormat_wrong_number_of_digits;
01645     }
01646 
01647     EAccessionFormatError rval = eAccessionFormat_valid;
01648 
01649     if (require_version) {
01650         if (*cp != '.') {
01651             rval = eAccessionFormat_missing_version;
01652         }
01653         cp++;
01654         int numVersion = 0;
01655         while (isdigit (*cp)) {
01656             numVersion++;
01657             cp++;
01658         }
01659         if (numVersion < 1) {
01660             rval = eAccessionFormat_missing_version;
01661         } else if (*cp != '\0' && *cp != ' ') {
01662             rval = eAccessionFormat_bad_version;
01663         }
01664     }
01665 
01666 
01667     if (numUndersc == 0) {
01668         if ((numAlpha == 1 && numDigits == 5) 
01669             || (numAlpha == 2 && numDigits == 6)
01670             || (numAlpha == 3 && numDigits == 5)
01671             || (numAlpha == 4 && numDigits == 8)
01672             || (numAlpha == 5 && numDigits == 7)) {
01673             return rval;
01674         } 
01675     } else if (numUndersc == 1) {
01676         if (numAlpha != 2 || (numDigits != 6 && numDigits != 8 && numDigits != 9)) {
01677             return eAccessionFormat_wrong_number_of_digits;
01678         }
01679         char first_letter = accession.c_str()[0];
01680         char second_letter = accession.c_str()[1];
01681         if (first_letter == 'N' || first_letter == 'X' || first_letter == 'Z') { 
01682             if (second_letter == 'M' || second_letter == 'C'
01683                 || second_letter == 'T' || second_letter == 'P'
01684                 || second_letter == 'G' || second_letter == 'R'
01685                 || second_letter == 'S' || second_letter == 'W'
01686                 || second_letter == 'W' || second_letter == 'Z') {
01687                 return rval;
01688             }
01689         }
01690         if ((first_letter == 'A' || first_letter == 'Y')
01691             && second_letter == 'P') {
01692             return rval;
01693         }
01694     }
01695 
01696     return eAccessionFormat_wrong_number_of_digits;
01697 }
01698 
01699 
01700 static bool s_StringConsistsOf (string str, string consist)
01701 {
01702     const char *src = str.c_str();
01703     const char *find = consist.c_str();
01704     bool rval = true;
01705 
01706     while (*src != 0 && rval) {
01707         if (strchr (find, *src) == NULL) {
01708             rval = false;
01709         }
01710         src++;
01711     }
01712     return rval;
01713 }
01714 
01715 
01716 void CValidError_feat::ValidateImpGbquals
01717 (const CImp_feat& imp,
01718  const CSeq_feat& feat,
01719  bool is_insd_in_sep)
01720 {
01721     CSeqFeatData::ESubtype ftype = feat.GetData().GetSubtype();
01722     const string& key = imp.GetKey();
01723 
01724     FOR_EACH_GBQUAL_ON_FEATURE (qual, feat) {
01725         const string& qual_str = (*qual)->GetQual();
01726 
01727         if ( qual_str == "gsdb_id" ) {
01728             continue;
01729         }
01730 
01731         CGbqualType::EType gbqual = CGbqualType::GetType(qual_str);
01732         
01733         if ( gbqual == CGbqualType::e_Bad ) {
01734             if ( !qual_str.empty() ) {
01735                 PostErr(eDiag_Warning, eErr_SEQ_FEAT_UnknownImpFeatQual,
01736                     "Unknown qualifier " + qual_str, feat);
01737             } else {
01738                 PostErr(eDiag_Warning, eErr_SEQ_FEAT_UnknownImpFeatQual,
01739                     "Empty qualifier", feat);
01740             }
01741         } else {
01742             if ( !CFeatQualAssoc::IsLegalGbqual(ftype, gbqual) ) {
01743                 PostErr(eDiag_Warning, eErr_SEQ_FEAT_WrongQualOnImpFeat,
01744                     "Wrong qualifier " + qual_str + " for feature " + 
01745                     key, feat);
01746             }
01747             
01748             string val = (*qual)->GetVal();
01749             
01750             bool error = false;
01751             switch ( gbqual ) {
01752             case CGbqualType::e_Rpt_type:
01753                 {{
01754                     for ( size_t i = 0; 
01755                     i < sizeof(s_LegalRepeatTypes) / sizeof(string); 
01756                     ++i ) {
01757                         if ( val.find(s_LegalRepeatTypes[i]) != string::npos ) {
01758                             bool left = false, right = false;
01759                             if ( i > 0 ) {
01760                                 left = val[i-1] == ','  ||  val[i-1] == '(';
01761                             }
01762                             if ( i < val.length() - 1 ) {
01763                                 right = val[i+1] == ','  ||  val[i+1] == ')';
01764                             }
01765                             if ( left  &&  right ) {
01766                                 error = true;
01767                             }
01768                             break;
01769                         }
01770                     }
01771                 }}
01772                 break;
01773                 
01774             case CGbqualType::e_Rpt_unit:
01775             case CGbqualType::e_Rpt_unit_seq:
01776                 {{
01777                     bool found = false, multiple_rpt_unit = true;
01778                     ITERATE(string, it, val) {
01779                         if ( *it <= ' ' ) {
01780                             found = true;
01781                         } else if ( *it == '('  ||  *it == ')'  ||
01782                             *it == ','  ||  *it == '.'  ||
01783                             isdigit((unsigned char)(*it)) ) {
01784                         } else {
01785                             multiple_rpt_unit = false;
01786                         }
01787                     }
01788                     /*
01789                     if ( found || 
01790                     (!multiple_rpt_unit && val.length() > 48) ) {
01791                     error = true;
01792                     }
01793                     */
01794                     if ( NStr::CompareNocase(key, "repeat_region") == 0  &&
01795                          !multiple_rpt_unit ) {
01796                         if (val.length() <= GetLength(feat.GetLocation(), m_Scope) ) {
01797                             bool just_nuc_letters = true;
01798                             static const string nuc_letters = "ACGTNacgtn";
01799                             
01800                             ITERATE(string, it, val) {
01801                                 if ( nuc_letters.find(*it) == NPOS ) {
01802                                     just_nuc_letters = false;
01803                                     break;
01804                                 }
01805                             }
01806                             
01807                             if ( just_nuc_letters ) {
01808                                 CSeqVector vec = GetSequenceFromFeature(feat, *m_Scope);
01809                                 if ( !vec.empty() ) {
01810                                     string vec_data;
01811                                     vec.GetSeqData(0, vec.size(), vec_data);
01812                                     if (NStr::FindNoCase (vec_data, val) == string::npos) {
01813                                         PostErr(eDiag_Warning, eErr_SEQ_FEAT_InvalidQualifierValue,
01814                                             "repeat_region /rpt_unit and underlying "
01815                                             "sequence do not match", feat);
01816                                     }
01817                                 }
01818                                 
01819                                 
01820                             } else {
01821                                 PostErr(eDiag_Info, eErr_SEQ_FEAT_InvalidQualifierValue,
01822                                     "rpt_unit_seq qualifier contains invalid characters", feat);
01823                             }
01824                         } else {
01825                             PostErr(eDiag_Info, eErr_SEQ_FEAT_InvalidQualifierValue,
01826                                 "Length of rpt_unit_seq is greater than feature length", feat);
01827                         }                            
01828                     }
01829                 }}
01830                 break;
01831             case CGbqualType::e_Rpt_unit_range:
01832                 {{
01833                     int from, to;
01834                     if (!s_RptUnitIsBaseRange(val, from, to)) {
01835                         PostErr (eDiag_Warning, eErr_SEQ_FEAT_InvalidQualifierValue,
01836                                  "/rpt_unit_range is not a base range", feat);
01837                     } else {
01838                         CSeq_loc::TRange range = feat.GetLocation().GetTotalRange();
01839                         if (from < range.GetFrom() || from > range.GetTo() || to < range.GetFrom() || to > range.GetTo()) {
01840                             PostErr (eDiag_Warning, eErr_SEQ_FEAT_InvalidQualifierValue,
01841                                      "/rpt_unit_range is not within sequence length", feat);   
01842                         }
01843                     }
01844                 }}
01845                 break;
01846             case CGbqualType::e_Label:
01847                 {{
01848                     bool only_digits = true,
01849                         has_spaces = false;
01850                     
01851                     ITERATE(string, it, val) {
01852                         if ( isspace((unsigned char)(*it)) ) {
01853                             has_spaces = true;
01854                         }
01855                         if ( !isdigit((unsigned char)(*it)) ) {
01856                             only_digits = false;
01857                         }
01858                     }
01859                     if (only_digits  ||  has_spaces) {
01860                         PostErr (eDiag_Error, eErr_SEQ_FEAT_InvalidQualifierValue, "Illegal value for qualifier " + qual_str, feat);
01861                     }
01862                 }}
01863                 break;
01864             case CGbqualType::e_Replace:
01865                 {{
01866                     CBioseq_Handle bh = m_Scope->GetBioseqHandle(feat.GetLocation());
01867                     if (bh) {
01868                         if (bh.IsNa()) {
01869                             if (NStr::Equal(key, "variation")) {
01870                                 if (!s_StringConsistsOf (val, "acgt")) {
01871                                     PostErr (eDiag_Error, eErr_SEQ_FEAT_InvalidQualifierValue,
01872                                              val + "  is not a legal value for qualifier " + qual_str
01873                                              + " - should only be composed of acgt unambiguous nucleotide bases",
01874                                              feat);
01875                                 }
01876                             } else if (!s_StringConsistsOf (val, "acgtmrwsykvhdbn")) {
01877                                   PostErr (eDiag_Error, eErr_SEQ_FEAT_InvalidQualifierValue,
01878                                            val + "  is not a legal value for qualifier " + qual_str
01879                                            + " - should only be composed of acgtmrwsykvhdbn nucleotide bases",
01880                                            feat);
01881                             }
01882                         } else if (bh.IsAa()) {
01883                             if (!s_StringConsistsOf (val, "acdefghiklmnpqrstuvwy*")) {
01884                                 PostErr (eDiag_Error, eErr_SEQ_FEAT_InvalidQualifierValue,
01885                                          val + "  is not a legal value for qualifier " + qual_str
01886                                          + " - should only be composed of acdefghiklmnpqrstuvwy* amino acids",
01887                                          feat);
01888                             }
01889                         }
01890 
01891                         // if no point in location with fuzz, info if text matches sequence
01892                         bool has_fuzz = false;
01893                         for( objects::CSeq_loc_CI it(feat.GetLocation()); it && !has_fuzz; ++it) {
01894                             if (it.IsPoint() && it.GetSeq_loc().GetPnt().IsSetFuzz()) {
01895                                 has_fuzz = true;
01896                             }
01897                         }
01898                         if (!has_fuzz && val.length() == GetLength (feat.GetLocation(), m_Scope)) {
01899                             string bases = "";
01900                             bool match = false;
01901                             try {
01902                               CCdregion_translate::ReadSequenceByLocation (bases, bh, feat.GetLocation(), CCdregion_translate::eTruncate);
01903                               if (NStr::EqualNocase(val, bases)) {
01904                                   PostErr (eDiag_Info, eErr_SEQ_FEAT_SuspiciousQualifierValue,
01905                                            "/replace already matches underlying sequence (" + val + ")",
01906                                            feat);
01907                               }
01908                             } catch ( ... ) {
01909                             }
01910                         }
01911                     }
01912                 }}
01913                 break;
01914 
01915             case CGbqualType::e_Cons_splice:
01916                 {{
01917                     error = true;
01918                     for (size_t i = 0; 
01919                          i < sizeof(s_LegalConsSpliceStrings) / sizeof(string);
01920                          ++i) {
01921                         if ( NStr::CompareNocase(val, s_LegalConsSpliceStrings[i]) == 0 ) {
01922                             error = false;
01923                             break;
01924                         }
01925                     }
01926                 }}
01927                 break;
01928 
01929             case CGbqualType::e_Mobile_element:
01930                 {{
01931                 }}
01932                 break;
01933 
01934             case CGbqualType::e_Compare:
01935                 {{
01936                     if (!NStr::StartsWith (val, "(")) {
01937                         EAccessionFormatError valid_accession = s_ValidateAccessionString (val, true);  
01938                         if (valid_accession == eAccessionFormat_missing_version) {
01939                             PostErr (eDiag_Error, eErr_SEQ_FEAT_InvalidQualifierValue,
01940                                      val + " accession missing version for qualifier " + qual_str, feat);
01941                         } else if (valid_accession == eAccessionFormat_bad_version) {
01942                             PostErr (eDiag_Error, eErr_SEQ_FEAT_InvalidQualifierValue,
01943                                      val + " accession has bad version for qualifier " + qual_str, feat);
01944                         } else if (valid_accession != eAccessionFormat_valid) {
01945                             PostErr (eDiag_Error, eErr_SEQ_FEAT_InvalidQualifierValue,
01946                                      val + " is not a legal accession for qualifier " + qual_str, feat);
01947                         } else if (is_insd_in_sep && NStr::Find (val, "_") == string::npos) {
01948                             PostErr (eDiag_Error, eErr_SEQ_FEAT_InvalidQualifierValue,
01949                                      "RefSeq accession " + val + " cannot be used for qualifier " + qual_str, feat);
01950                         }
01951                     }
01952                 }}
01953                 break;
01954 
01955             default:
01956                 break;
01957             } // end of switch statement
01958             if ( error ) {
01959                 PostErr(eDiag_Error, eErr_SEQ_FEAT_InvalidQualifierValue,
01960                     val + " is not a legal value for qualifier " + qual_str, feat);
01961             }
01962         }
01963     }  // end of ITERATE 
01964 }
01965 
01966 
01967 void CValidError_feat::ValidatePeptideOnCodonBoundry
01968 (const CSeq_feat& feat, 
01969  const string& key)
01970 {
01971     const CSeq_loc& loc = feat.GetLocation();
01972 
01973     // !!! DEBUG {
01974     if( m_Imp.AvoidPerfBottlenecks() ) {
01975         return;
01976     } 
01977     // } DEBUG
01978 
01979     CConstRef<CSeq_feat> cds = GetOverlappingCDS(loc, *m_Scope);
01980     if ( !cds ) {
01981         return;
01982     }
01983     const CCdregion& cdr = cds->GetData().GetCdregion();
01984 
01985     TSeqPos pos1 = LocationOffset(cds->GetLocation(), loc, eOffset_FromStart);
01986     TSeqPos pos2 = pos1 + GetLength(loc, m_Scope);
01987     unsigned int frame = 0;
01988     switch (cdr.GetFrame()) {
01989         case CCdregion::eFrame_not_set:
01990         case CCdregion::eFrame_one:
01991             frame = 0;
01992             break;
01993         case CCdregion::eFrame_two:
01994             frame = 1;
01995             break;
01996         case CCdregion::eFrame_three:
01997             frame = 2;
01998             break;
01999     }
02000     // note - have to add 3 to prevent negative result from subtraction
02001     TSeqPos mod1 = (pos1 + 3 - frame) %3;
02002     TSeqPos mod2 = (pos2 + 3 - frame) %3;
02003 
02004     if ( mod1 != 0 && loc.IsPartialStart(eExtreme_Biological) 
02005          && cds->GetLocation().IsPartialStart(eExtreme_Biological) 
02006          && pos1 == 0) {
02007         mod1 = 0;
02008     }
02009     if ( mod2 != 0 && loc.IsPartialStop(eExtreme_Biological) 
02010          && cds->GetLocation().IsPartialStop(eExtreme_Biological) 
02011          && pos2 == GetLength (cds->GetLocation(), m_Scope)) {
02012         mod2 = 0;
02013     }
02014 
02015     if ( (mod1 != 0)  &&  (mod2 != 0) ) {
02016         PostErr(eDiag_Warning, eErr_SEQ_FEAT_PeptideFeatOutOfFrame,
02017             "Start and stop of " + key + " are out of frame with CDS codons",
02018             feat);
02019     } else if (mod1 != 0) {
02020         PostErr(eDiag_Warning, eErr_SEQ_FEAT_PeptideFeatOutOfFrame, 
02021             "Start of " + key + " is out of frame with CDS codons", feat);
02022     } else if (mod2 != 0) {
02023         PostErr(eDiag_Warning, eErr_SEQ_FEAT_PeptideFeatOutOfFrame,
02024             "Stop of " + key + " is out of frame with CDS codons", feat);
02025     }
02026 }
02027 
02028 
02029 static const string sc_BypassMrnaTransCheckText[] = {
02030     "RNA editing",
02031     "artificial frameshift",
02032     "mismatches in transcription"
02033     "reasons given in citation",
02034     "unclassified transcription discrepancy",    
02035 };
02036 DEFINE_STATIC_ARRAY_MAP(CStaticArraySet<string>, sc_BypassMrnaTransCheck, sc_BypassMrnaTransCheckText);
02037 
02038 
02039 void CValidError_feat::ValidateMrnaTrans(const CSeq_feat& feat)
02040 {
02041     bool has_errors = false, unclassified_except = false,
02042         mismatch_except = false, report_errors = true;
02043     string farstr;
02044 
02045     if (feat.CanGetPseudo()  &&  feat.GetPseudo()) {
02046         return;
02047     }
02048     if (!feat.CanGetProduct()) {
02049         return;
02050     }
02051 
02052     if (feat.CanGetExcept()  &&  feat.GetExcept()  &&
02053         feat.CanGetExcept_text()) {
02054         const string& except_text = feat.GetExcept_text();
02055         ITERATE (CStaticArraySet<string>, it, sc_BypassMrnaTransCheck) {
02056             if (NStr::FindNoCase(except_text, *it) != NPOS) {
02057                 report_errors = false;  // biological exception
02058             }
02059             if (NStr::FindNoCase(except_text, "unclassified transcription discrepancy") != NPOS) {
02060                 unclassified_except = true;
02061             }
02062             if (NStr::FindNoCase(except_text, "mismatches in transcription") != NPOS) {
02063                 mismatch_except = true;
02064                 report_errors = true;
02065             }
02066         }
02067     }
02068 
02069     CConstRef<CSeq_id> product_id;
02070     try {
02071         product_id.Reset(&GetId(feat.GetProduct(), m_Scope));
02072     } catch (CException&) {
02073     }
02074     if (!product_id) {
02075         return;
02076     }
02077     
02078     CBioseq_Handle nuc = m_Scope->GetBioseqHandle(feat.GetLocation());
02079     if (!nuc) {
02080         has_errors = true;
02081         if (report_errors  ||  unclassified_except) {
02082             PostErr(eDiag_Error, eErr_SEQ_FEAT_MrnaTransFail,
02083                 "Unable to transcribe mRNA", feat);
02084         };
02085     }
02086     if (nuc) {
02087         EDiagSev sev = eDiag_Error;
02088         CBioseq_Handle rna = m_Scope->GetBioseqHandleFromTSE(*product_id, nuc);
02089         if (!rna) {
02090             // if not local bioseq product, lower severity (with the exception of Refseq)
02091             sev = m_Imp.IsRefSeq() ? eDiag_Error : eDiag_Warning;
02092             if (m_Imp.IsFarFetchMRNAproducts()) {
02093                 rna = m_Scope->GetBioseqHandle(feat.GetProduct());
02094             }
02095             if (!rna) {
02096                 return;
02097             }
02098             farstr = "(far)";
02099         }
02100         _ASSERT(nuc  &&  rna);
02101     
02102         CSeqVector nuc_vec(feat.GetLocation(), *m_Scope,
02103                            CBioseq_Handle::eCoding_Iupac);
02104         CSeqVector rna_vec(feat.GetProduct(), *m_Scope,
02105                            CBioseq_Handle::eCoding_Iupac);
02106 
02107         size_t nuc_len = nuc_vec.size();
02108         size_t rna_len = rna_vec.size();
02109 
02110         if (nuc_len != rna_len) {
02111             has_errors = true;
02112             if (nuc_len < rna_len) {
02113                 size_t count_a = 0, count_no_a = 0;
02114                 // count 'A's in the tail
02115                 for (CSeqVector_CI iter(rna_vec, nuc_len); iter; ++iter) {
02116                     if ((*iter == 'A')  ||  (*iter == 'a')) {
02117                         ++count_a;
02118                     } else {
02119                         ++count_no_a;
02120                     }
02121                 }
02122                 if (count_a < (19 * count_no_a)) { // less then 5%
02123                     if (report_errors) {
02124                         PostErr(sev, eErr_SEQ_FEAT_TranscriptLen,
02125                             "Transcript length [" + NStr::IntToString(nuc_len) + 
02126                             "] less than " + farstr + "product length [" +
02127                             NStr::IntToString(rna_len) + "], and tail < 95% polyA",
02128                             feat);
02129                     }
02130                 } else {
02131                     if (report_errors) {
02132                         PostErr(eDiag_Info, eErr_SEQ_FEAT_TranscriptLen,
02133                             "Transcript length [" + NStr::IntToString(nuc_len) + 
02134                             "] less than " + farstr + "product length [" +
02135                             NStr::IntToString(rna_len) + "], but tail >= 95% polyA",
02136                             feat);
02137                     }
02138                 }            
02139             } else {
02140                 if (report_errors) {
02141                     PostErr(sev, eErr_SEQ_FEAT_TranscriptLen,
02142                         "Transcript length [" + NStr::IntToString(nuc_vec.size()) + "] " +
02143                         "greater than " + farstr +"product length [" + 
02144                         NStr::IntToString(rna_vec.size()) + "]", feat);
02145                 }
02146             }
02147             // allow base-by-base comparison on common length
02148             rna_len = nuc_len = min(nuc_len, rna_len);
02149         }
02150         _ASSERT(nuc_len == rna_len);
02151 
02152         if (nuc_len > 0) {
02153             CSeqVector_CI nuc_ci(nuc_vec);
02154             CSeqVector_CI rna_ci(rna_vec);
02155             size_t mismatches = 0;
02156 
02157             // compare content of common length
02158             while ((nuc_ci  &&  rna_ci)  &&  (nuc_ci.GetPos() < nuc_len)) {
02159                 if (*nuc_ci != *rna_ci) {
02160                     ++mismatches;
02161                 }
02162                 ++nuc_ci;
02163                 ++rna_ci;
02164             }
02165             if (mismatches > 0) {
02166                 has_errors = true;
02167                 if (report_errors  &&  !mismatch_except) {
02168                     PostErr(eDiag_Error, eErr_SEQ_FEAT_TranscriptMismatches,
02169                         "There are " + NStr::IntToString(mismatches) + 
02170                         " mismatches out of " + NStr::IntToString(nuc_len) +
02171                         " bases between the transcript and " + farstr + "product sequence",
02172                         feat);
02173                 }
02174             }
02175         }
02176     }
02177 
02178     if (!report_errors  &&  !has_errors) {
02179         PostErr(eDiag_Warning, eErr_SEQ_FEAT_UnnecessaryException,
02180             "mRNA has exception but passes transcription test", feat);
02181     }
02182 }
02183 
02184 
02185 void CValidError_feat::ValidateCommonMRNAProduct(const CSeq_feat& feat)
02186 {
02187     if ( (feat.CanGetPseudo()  &&  feat.GetPseudo())  ||
02188          IsOverlappingGenePseudo(feat) ) {
02189         return;
02190     }
02191 
02192     if ( !feat.CanGetProduct() ) {
02193         return;
02194     }
02195 
02196     CBioseq_Handle bsh = m_Scope->GetBioseqHandle(feat.GetProduct());
02197     if ( !bsh ) {
02198         const CSeq_id& sid = GetId(feat.GetProduct(), m_Scope);
02199         if ( sid.IsLocal() ) {
02200             if ( m_Imp.IsGPS()  ||  m_Imp.IsRefSeq() ) {
02201                 PostErr(eDiag_Error, eErr_SEQ_FEAT_MissingMRNAproduct,
02202                     "Product Bioseq of mRNA feature is not "
02203                     "packaged in the record", feat);
02204             }
02205         }
02206     } else {
02207 
02208         // !!! DEBUG {
02209         if( m_Imp.AvoidPerfBottlenecks() ) {
02210             return;
02211         } 
02212         // } DEBUG
02213 
02214         CFeat_CI mrna(
02215             bsh, SAnnotSelector(CSeqFeatData::e_Rna)
02216             .SetOverlapTotalRange()
02217             .SetResolveNone()
02218             .SetByProduct());
02219         while ( mrna ) {
02220             if ( &(mrna->GetOriginalFeature()) != &feat ) {
02221                     PostErr(eDiag_Critical, eErr_SEQ_FEAT_MultipleMRNAproducts,
02222                         "Same product Bioseq from multiple mRNA features", feat);
02223                     break;
02224             }
02225             ++mrna;
02226         }
02227     }
02228 }
02229 
02230 
02231 void CValidError_feat::ValidateBothStrands(const CSeq_feat& feat)
02232 {
02233     const CSeq_loc& location = feat.GetLocation ();
02234     bool bothstrands = false, bothreverse = false;
02235 
02236     for ( CSeq_loc_CI citer (location); citer; ++citer ) {
02237         ENa_strand strand = citer.GetStrand();
02238         if ( strand == eNa_strand_both ) {
02239             bothstrands = true;
02240         } else if (strand == eNa_strand_both_rev) {
02241             bothreverse = true;
02242         }
02243     }
02244     if (bothstrands || bothreverse) {
02245         EDiagSev sev = eDiag_Warning;
02246         string prefix = "Feature";
02247         if (feat.IsSetData()) {
02248             if (feat.GetData().IsCdregion()) {
02249                 prefix = "CDS";
02250                 sev = eDiag_Error;
02251             } else if (feat.GetData().IsRna() && feat.GetData().GetRna().IsSetType()
02252                        && feat.GetData().GetRna().GetType() == CRNA_ref::eType_mRNA) {
02253                 prefix = "mRNA";
02254                 sev = eDiag_Error;
02255             }
02256         }
02257         string suffix = "";
02258         if (bothstrands && bothreverse) {
02259             suffix = "(forward and reverse)";
02260         } else if (bothstrands) {
02261             suffix = "(forward)";
02262         } else if (bothreverse) {
02263             suffix = "(reverse)";
02264         }
02265 
02266         PostErr (sev, eErr_SEQ_FEAT_BothStrands, 
02267                 prefix + " may not be on both " + suffix + " strands", feat);  
02268     }
02269 }
02270 
02271 
02272 // Precondition: feat is a coding region
02273 void CValidError_feat::ValidateCommonCDSProduct
02274 (const CSeq_feat& feat)
02275 {
02276     if ( (feat.CanGetPseudo()  &&  feat.GetPseudo())  ||
02277           IsOverlappingGenePseudo(feat) ) {
02278         return;
02279     }
02280     
02281     if ( !feat.CanGetProduct() ) {
02282         return;
02283     }
02284     
02285     const CCdregion& cdr = feat.GetData().GetCdregion();
02286     if ( cdr.CanGetOrf() ) {
02287         return;
02288     }
02289 
02290     CBioseq_Handle prod = m_Scope->GetBioseqHandle(feat.GetProduct());
02291     if ( !prod ) {
02292         const CSeq_id* sid = 0;
02293         try {
02294             sid = &(GetId(feat.GetProduct(), m_Scope));
02295         } catch (const CObjmgrUtilException&) {}
02296 
02297         // okay to have far RefSeq product, but only if genomic product set
02298         if ( sid == 0  ||  !sid->IsOther() ) {
02299             if ( m_Imp.IsGPS() ) {
02300                 return;
02301             }
02302         }
02303         // or just a bioseq
02304         if ( m_Imp.GetTSE().IsSeq() ) {
02305             return;
02306         }
02307 
02308         // or in a standalone Seq-annot
02309         if ( m_Imp.IsStandaloneAnnot() ) {
02310             return;
02311         }
02312 
02313         PostErr(eDiag_Warning, eErr_SEQ_FEAT_MultipleCDSproducts,
02314             "Unable to find product Bioseq from CDS feature", feat);
02315         return;
02316     }
02317     CBioseq_Handle nuc  = m_Scope->GetBioseqHandle(feat.GetLocation());
02318     if ( nuc ) {
02319         CSeq_entry_Handle prod_nps = 
02320             prod.GetExactComplexityLevel(CBioseq_set::eClass_nuc_prot);
02321         CSeq_entry_Handle nuc_nps = 
02322             nuc.GetExactComplexityLevel(CBioseq_set::eClass_nuc_prot);
02323 
02324         if ( (prod_nps != nuc_nps)  &&  (!m_Imp.IsNT())  &&  (!m_Imp.IsGPS()) ) {
02325             PostErr(eDiag_Error, eErr_SEQ_FEAT_CDSproductPackagingProblem,
02326                 "Protein product not packaged in nuc-prot set with nucleotide", 
02327                 feat);
02328         }
02329     }
02330     const CSeq_feat* sfp = GetCDSForProduct(prod);
02331     if ( sfp == 0 ) {
02332         return;
02333     }
02334     
02335     if ( &feat != sfp ) {
02336         // if genomic product set, with one cds on contig and one on cdna,
02337         // do not report.
02338         if ( m_Imp.IsGPS() ) {
02339             // feature packaging test will do final contig vs. cdna check
02340             CBioseq_Handle sfh = m_Scope->GetBioseqHandle(sfp->GetLocation());
02341             if ( nuc != sfh ) {
02342                 return;
02343             }
02344         }
02345         PostErr(eDiag_Critical, eErr_SEQ_FEAT_MultipleCDSproducts, 
02346             "Same product Bioseq from multiple CDS features", feat);
02347     }
02348 }
02349 
02350 
02351 void CValidError_feat::ValidateCDSPartial(const CSeq_feat& feat)
02352 {
02353     if ( !feat.CanGetProduct()  ||  !feat.CanGetLocation() ) {
02354         return;
02355     }
02356 
02357     CBioseq_Handle bsh = m_Scope->GetBioseqHandle(feat.GetProduct());
02358     if ( !bsh ) {
02359         return;
02360     }
02361 
02362     CSeqdesc_CI sd(bsh, CSeqdesc::e_Molinfo);
02363     if ( !sd ) {
02364         return;
02365     }
02366     const CMolInfo& molinfo = sd->GetMolinfo();
02367 
02368     const CSeq_loc& loc = feat.GetLocation();
02369     bool partial5 = loc.IsPartialStart(eExtreme_Biological);
02370     bool partial3 = loc.IsPartialStop(eExtreme_Biological);
02371 
02372     if ( molinfo.CanGetCompleteness() ) {
02373         switch ( molinfo.GetCompleteness() ) {
02374         case CMolInfo::eCompleteness_unknown:
02375             break;
02376 
02377         case CMolInfo::eCompleteness_complete:
02378             if ( partial5 || partial3 ) {
02379                 PostErr(eDiag_Error, eErr_SEQ_FEAT_PartialProblem,
02380                     "CDS is partial but protein is complete", feat);
02381             }
02382             break;
02383 
02384         case CMolInfo::eCompleteness_partial:
02385             break;
02386 
02387         case CMolInfo::eCompleteness_no_left:
02388             if ( !partial5 ) {
02389                 PostErr(eDiag_Error, eErr_SEQ_FEAT_PartialProblem,
02390                     "CDS is 5' complete but protein is NH2 partial", feat);
02391             }
02392             if ( partial3 ) {
02393                 PostErr(eDiag_Error, eErr_SEQ_FEAT_PartialProblem,
02394                     "CDS is 3' partial but protein is NH2 partial", feat);
02395             }
02396             break;
02397 
02398         case CMolInfo::eCompleteness_no_right:
02399             if (! partial3) {
02400                 PostErr(eDiag_Error, eErr_SEQ_FEAT_PartialProblem,
02401                     "CDS is 3' complete but protein is CO2 partial", feat);
02402             }
02403             if (partial5) {
02404                 PostErr(eDiag_Error, eErr_SEQ_FEAT_PartialProblem,
02405                     "CDS is 5' partial but protein is CO2 partial", feat);
02406             }
02407             break;
02408 
02409         case CMolInfo::eCompleteness_no_ends:
02410             if ( partial5 && partial3 ) {
02411             } else if ( partial5 ) {
02412                 PostErr(eDiag_Error, eErr_SEQ_FEAT_PartialProblem,
02413                     "CDS is 5' partial but protein has neither end", feat);
02414             } else if ( partial3 ) {
02415                 PostErr(eDiag_Error, eErr_SEQ_FEAT_PartialProblem,
02416                     "CDS is 3' partial but protein has neither end", feat);
02417             } else {
02418                 PostErr(eDiag_Error, eErr_SEQ_FEAT_PartialProblem,
02419                     "CDS is complete but protein has neither end", feat);
02420             }
02421             break;
02422 
02423         case CMolInfo::eCompleteness_has_left:
02424             break;
02425 
02426         case CMolInfo::eCompleteness_has_right:
02427             break;
02428 
02429         case CMolInfo::eCompleteness_other:
02430             break;
02431 
02432         default:
02433             break;
02434         }
02435     }
02436 }
02437 
02438 
02439 void CValidError_feat::ValidateBadMRNAOverlap(const CSeq_feat& feat)
02440 {
02441     const CSeq_loc& loc = feat.GetLocation();
02442 
02443     // !!! DEBUG {
02444     if( m_Imp.AvoidPerfBottlenecks() ) {
02445         return;
02446     } 
02447     // } DEBUG
02448 
02449     CConstRef<CSeq_feat> mrna = GetBestOverlappingFeat(
02450         loc,
02451         CSeqFeatData::eSubtype_mRNA,
02452         eOverlap_Simple,
02453         *m_Scope);
02454     if ( !mrna ) {
02455         return;
02456     }
02457 
02458     mrna = GetBestOverlappingFeat(
02459         loc,
02460         CSeqFeatData::eSubtype_mRNA,
02461         eOverlap_CheckIntRev,
02462         *m_Scope);
02463     if ( mrna ) {
02464         return;
02465     }
02466 
02467     mrna = GetBestOverlappingFeat(
02468         loc,
02469         CSeqFeatData::eSubtype_mRNA,
02470         eOverlap_Interval,
02471         *m_Scope);
02472     if ( !mrna ) {
02473         return;
02474     }
02475 
02476     mrna = GetBestOverlappingFeat(
02477         loc,
02478         CSeqFeatData::eSubtype_mRNA,
02479         eOverlap_Subset,
02480         *m_Scope);
02481 
02482     EDiagSev sev = eDiag_Error;
02483     if ( m_Imp.IsNC()  ||  m_Imp.IsNT()  ||  
02484          (feat.CanGetExcept()  &&  feat.GetExcept()) ) {
02485         sev = eDiag_Warning;
02486     }
02487     if ( mrna ) {
02488         // ribosomal slippage exception suppresses CDSmRNArange warning
02489         bool supress = false;
02490 
02491         if ( feat.CanGetExcept_text() ) {
02492             const CSeq_feat::TExcept_text& text = feat.GetExcept_text();
02493             if ( NStr::FindNoCase(text, "ribosomal slippage") != NPOS ) {
02494                 supress = true;
02495             }
02496         }
02497         if ( !supress ) {
02498             PostErr(sev, eErr_SEQ_FEAT_CDSmRNArange,
02499                 "mRNA contains CDS but internal intron-exon boundaries "
02500                 "do not match", feat);
02501         }
02502     } else {
02503         PostErr(sev, eErr_SEQ_FEAT_CDSmRNArange,
02504             "mRNA overlaps or contains CDS but does not completely "
02505             "contain intervals", feat);
02506     }
02507 }
02508 
02509 
02510 void CValidError_feat::ValidateBadGeneOverlap(const CSeq_feat& feat)
02511 {
02512     const CGene_ref* grp = feat.GetGeneXref();
02513     if ( grp != 0 ) {
02514         return;
02515     }
02516 
02517     // !!! DEBUG {
02518     if( m_Imp.AvoidPerfBottlenecks() ) {
02519         return;
02520     } 
02521     // } DEBUG
02522     
02523     // look for overlapping genes
02524     if (GetOverlappingGene(feat.GetLocation(), *m_Scope)) {
02525         return;
02526     }
02527 
02528     // look for intersecting genes
02529     if (!GetBestOverlappingFeat(feat.GetLocation(),
02530                                   CSeqFeatData::e_Gene,
02531                                   eOverlap_Simple,
02532                                   *m_Scope)) {
02533         return;
02534     }
02535 
02536     // found an intersecting (but not overlapping) gene
02537     // set severity level
02538     EDiagSev sev = (m_Imp.IsNC()  ||  m_Imp.IsNT()) ? eDiag_Warning : eDiag_Error;
02539 
02540     // report error
02541     if (feat.GetData().IsCdregion()) {
02542         PostErr(sev, eErr_SEQ_FEAT_CDSgeneRange, 
02543             "gene overlaps CDS but does not completely contain it", feat);
02544     } else if (feat.GetData().IsRna()) {
02545         if (GetOverlappingOperon(feat.GetLocation(), *m_Scope)) {
02546             return;
02547         }
02548         PostErr(sev, eErr_SEQ_FEAT_mRNAgeneRange,
02549             "gene overlaps mRNA but does not completely contain it", feat);
02550     }
02551 }
02552 
02553 
02554 static const string s_LegalExceptionStrings[] = {
02555     "RNA editing",
02556     "reasons given in citation",
02557     "rearrangement required for product",
02558     "ribosomal slippage",
02559     "trans-splicing",
02560     "alternative processing",
02561     "artificial frameshift",
02562     "nonconsensus splice site",
02563     "modified codon recognition",
02564     "alternative start codon",
02565     "dicistronic gene",
02566     "transcribed product replaced",
02567     "translated product replaced",
02568     "transcribed pseudogene",
02569 };
02570 
02571 
02572 static const string s_RefseqExceptionStrings [] = {
02573     "unclassified transcription discrepancy",
02574     "unclassified translation discrepancy",
02575     "mismatches in transcription",
02576     "mismatches in translation",
02577     "adjusted for low-quality genome",
02578 };
02579 
02580 
02581 void CValidError_feat::ValidateExcept(const CSeq_feat& feat)
02582 {
02583     if (feat.IsSetExcept_text() && !NStr::IsBlank (feat.GetExcept_text()) && !feat.IsSetExcept()) {
02584         PostErr(eDiag_Warning, eErr_SEQ_FEAT_ExceptInconsistent,
02585             "Exception text is set, but exception flag is not set", feat);
02586     } else if (feat.IsSetExcept() && (!feat.IsSetExcept_text() || NStr::IsBlank (feat.GetExcept_text()))) {
02587         PostErr(eDiag_Warning, eErr_SEQ_FEAT_ExceptInconsistent,
02588             "Exception flag is set, but exception text is empty", feat);
02589     }
02590     if ( feat.CanGetExcept_text()  &&  !feat.GetExcept_text ().empty() ) {
02591         ValidateExceptText(feat.GetExcept_text(), feat);
02592     }
02593 }
02594 
02595 
02596 void CValidError_feat::ValidateExceptText(const string& text, const CSeq_feat& feat)
02597 {
02598     if ( text.empty() ) return;
02599 
02600     EDiagSev sev = eDiag_Error;
02601     bool found = false;
02602 
02603     string str;
02604     string::size_type   begin = 0, end, textlen = text.length();
02605 
02606     const string* except_begin = s_LegalExceptionStrings;
02607     const string* except_end = 
02608         &(s_LegalExceptionStrings[sizeof(s_LegalExceptionStrings) / sizeof(string)]);
02609     const string* refseq_begin = s_RefseqExceptionStrings;
02610     const string* refseq_end = 
02611         &(s_RefseqExceptionStrings[sizeof(s_RefseqExceptionStrings) / sizeof(string)]);
02612     
02613     while ( begin < textlen ) {
02614         found = false;
02615         end = min( text.find_first_of(',', begin), textlen );
02616         str = NStr::TruncateSpaces( text.substr(begin, end) );
02617         if ( find(except_begin, except_end, str) != except_end ) {
02618             found = true;
02619         }
02620         if ( !found  &&  (m_Imp.IsGPS() || m_Imp.IsRefSeq()) ) {
02621             if ( find(refseq_begin, refseq_end, str) != refseq_end ) {
02622                found = true;
02623             }
02624         }
02625         if ( !found ) {
02626             if ( m_Imp.IsNC()  ||  m_Imp.IsNT() ) {
02627                 sev = eDiag_Warning;
02628             }
02629             PostErr(sev, eErr_SEQ_FEAT_InvalidQualifierValue,
02630                 str + " is not legal exception explanation", feat);
02631         }
02632         begin = end + 1;
02633     }
02634 }
02635 
02636 
02637 
02638 
02639 void CValidError_feat::ReportCdTransErrors
02640 (const CSeq_feat& feat,
02641  bool show_stop,
02642  bool got_stop, 
02643  bool no_end,
02644  int ragged,
02645  bool report_errors,
02646  bool& has_errors)
02647 {
02648     if (show_stop) {
02649         if (!got_stop  && !no_end) {
02650             has_errors = true;
02651             if (report_errors) {
02652                 PostErr(eDiag_Error, eErr_SEQ_FEAT_NoStop, 
02653                     "Missing stop codon", feat);
02654             }
02655         } else if (got_stop  &&  no_end) {
02656             has_errors = true;
02657             if (report_errors) {
02658                 PostErr (eDiag_Error, eErr_SEQ_FEAT_PartialProblem,
02659                     "Got stop codon, but 3'end is labeled partial", feat);
02660             }
02661         } else if (got_stop  &&  !no_end  &&  ragged) {
02662             has_errors = true;
02663             if (report_errors) {
02664                 PostErr (eDiag_Error, eErr_SEQ_FEAT_TransLen, 
02665                     "Coding region extends " + NStr::IntToString(ragged) +
02666                     " base(s) past stop codon", feat);
02667             }
02668         }
02669     }
02670 }
02671 
02672 
02673 static const string sc_BypassCdsTransCheckText[] = {
02674   "RNA editing",
02675   "artificial frameshift",
02676   "mismatches in translation",
02677   "rearrangement required for product",
02678   "reasons given in citation",
02679   "unclassified translation discrepancy"  
02680 };
02681 DEFINE_STATIC_ARRAY_MAP(CStaticArraySet<string>, sc_BypassCdsTransCheck, sc_BypassCdsTransCheckText);
02682 
02683 
02684 static void s_LocIdType(const CSeq_loc& loc, CScope& scope, const CSeq_entry& tse,
02685                         bool& is_nt, bool& is_ng, bool& is_nw, bool& is_nc)
02686 {
02687     is_nt = is_ng = is_nw = is_nc = false;
02688     if (!IsOneBioseq(loc, &scope)) {
02689         return;
02690     }
02691     const CSeq_id& id = GetId(loc, &scope);
02692     CBioseq_Handle bsh = scope.GetBioseqHandleFromTSE(id, tse);
02693     if (bsh) {
02694         FOR_EACH_SEQID_ON_BIOSEQ (it, *(bsh.GetBioseqCore())) {
02695             CSeq_id::EAccessionInfo info = (*it)->IdentifyAccession();
02696             is_nt |= (info == CSeq_id::eAcc_refseq_contig);
02697             is_ng |= (info == CSeq_id::eAcc_refseq_genomic);
02698             is_nw |= (info == CSeq_id::eAcc_refseq_wgs_intermed);
02699             is_nc |= (info == CSeq_id::eAcc_refseq_chromosome);
02700         }
02701     }
02702 }
02703 
02704 
02705 void CValidError_feat::ValidateCdTrans(const CSeq_feat& feat)
02706 {
02707     // bail if not CDS
02708     if (!feat.GetData().IsCdregion()) {
02709         return;
02710     }
02711 
02712     bool prot_ok = true;
02713     int  ragged = 0;
02714     bool has_errors = false, unclassified_except = false,
02715         mismatch_except = false, report_errors = true;
02716     string farstr;
02717     
02718     if (feat.CanGetExcept()  &&  feat.GetExcept()  &&
02719         feat.CanGetExcept_text()) {
02720         const string& except_text = feat.GetExcept_text();
02721         ITERATE (CStaticArraySet<string>, it, sc_BypassCdsTransCheck) {
02722             if (NStr::FindNoCase(except_text, *it) != NPOS) {
02723                 report_errors = false;  // biological exception
02724             }
02725             if (NStr::FindNoCase(except_text, "unclassified transcription discrepancy") != NPOS) {
02726                 unclassified_except = true;
02727             }
02728             if (NStr::FindNoCase(except_text, "mismatches in transcription") != NPOS) {
02729                 mismatch_except = true;
02730                 report_errors = true;
02731             }
02732         }
02733     }
02734 
02735     // pseudo gene
02736     FOR_EACH_GBQUAL_ON_FEATURE (it, feat) {
02737         if ((*it)->IsSetQual()  &&  NStr::EqualNocase((*it)->GetQual(), "pseudo")) {
02738             return;
02739         }
02740     }
02741 
02742     const CCdregion& cdregion = feat.GetData().GetCdregion();
02743 
02744     // check for unparsed transl_except
02745     bool transl_except = false;
02746     if (!cdregion.IsSetCode_break()) {
02747         FOR_EACH_GBQUAL_ON_FEATURE (it, feat) {
02748             if ((*it)->IsSetQual()  &&  NStr::Equal((*it)->GetQual(), "transl_except")) {
02749                 transl_except = true;
02750             }
02751         }
02752     }
02753     
02754     const CSeq_loc& location = feat.GetLocation();
02755     
02756     int gc = 0;
02757     if ( cdregion.CanGetCode() ) {
02758         // We assume that the id is set for all Genetic_code
02759         gc = cdregion.GetCode().GetId();
02760     }
02761     string gccode = NStr::IntToString(gc);
02762 
02763     string transl_prot;   // translated protein
02764     bool alt_start = false;
02765     try {
02766         CCdregion_translate::TranslateCdregion(
02767             transl_prot, 
02768             feat, 
02769             *m_Scope,
02770             true,   // include stop codons
02771             false,  // do not remove trailing X/B/Z
02772             &alt_start,
02773             CCdregion_translate::eTruncate);
02774     } catch (CException&) {
02775     }
02776     if (transl_prot.empty()) {
02777         if (report_errors) {
02778             PostErr (eDiag_Error, eErr_SEQ_FEAT_CdTransFail, 
02779                 "Unable to translate", feat);
02780         }
02781     }
02782 
02783     // check alternative start codon
02784     if (alt_start  &&  gc == 1) {
02785         EDiagSev sev = eDiag_Warning;
02786         if (s_IsLocRefSeqMrna(feat.GetLocation(), *m_Scope)) {
02787             sev = eDiag_Error;
02788         } else if (s_IsLocGEDL(feat.GetLocation(), *m_Scope)) {
02789             sev = eDiag_Info;
02790         }
02791         if (sev != eDiag_Info  &&
02792             feat.CanGetExcept()  &&  feat.GetExcept()  &&
02793             feat.CanGetExcept_text()  &&  !feat.GetExcept_text().empty()) {
02794             if (feat.GetExcept_text().find("alternative start codon") != NPOS) {
02795                 sev = eDiag_Info;
02796             }
02797         }
02798         if (sev != eDiag_Info) {
02799             has_errors = true;
02800             if (report_errors) {
02801                 PostErr(sev, eErr_SEQ_FEAT_AltStartCodon,
02802                     "Alternative start codon used", feat);
02803             }
02804         }
02805     }
02806 
02807     bool no_end = false;
02808     unsigned int part_loc = SeqLocPartialCheck(location, m_Scope);
02809     unsigned int part_prod = eSeqlocPartial_Complete;
02810     if (feat.IsSetProduct()) {
02811         part_prod = SeqLocPartialCheck(feat.GetProduct(), m_Scope);
02812         if ( (part_loc & eSeqlocPartial_Stop)  ||
02813             (part_prod & eSeqlocPartial_Stop) ) {
02814             no_end = true;
02815         } else {    
02816             // complete stop, so check for ragged end
02817             ragged = CheckForRaggedEnd(location, cdregion);
02818         }
02819     }
02820         
02821     
02822     // check for code break not on a codon
02823     has_errors = x_ValidateCodeBreakNotOnCodon(feat, location, cdregion, report_errors);
02824     
02825     if (cdregion.GetFrame() > CCdregion::eFrame_one) {
02826         has_errors = true;
02827         EDiagSev sev = s_IsLocRefSeqMrna(feat.GetLocation(), *m_Scope) ?
02828             eDiag_Error : eDiag_Warning;
02829         if (!(part_loc & eSeqlocPartial_Start)) {
02830             if (report_errors) {
02831                 PostErr(sev, eErr_SEQ_FEAT_PartialProblem, 
02832                     "Suspicious CDS location - frame > 1 but not 5' partial",
02833                     feat);
02834             }
02835         } else if ((part_loc & eSeqlocPartial_Nostart)  &&
02836             !IsPartialAtSpliceSite(location, eSeqlocPartial_Nostart)) {
02837             if (report_errors) {
02838                 PostErr(sev, eErr_SEQ_FEAT_PartialProblem, 
02839                     "Suspicious CDS location - frame > 1 and not at consensus splice site",
02840                     feat);
02841             }
02842         }
02843     }
02844     
02845     bool no_beg = 
02846         (part_loc & eSeqlocPartial_Start)  ||  (part_prod & eSeqlocPartial_Start);
02847 
02848     bool reported_bad_start_codon = false;
02849 
02850     // count internal stops
02851     size_t internal_stop_count = 0;
02852     bool got_stop = false;
02853     if (!transl_prot.empty()) {
02854         bool got_dash = (transl_prot[0] == '-');
02855         got_stop = (transl_prot[transl_prot.length() - 1] == '*');
02856     
02857         ITERATE(string, it, transl_prot) {
02858             if ( *it == '*' ) {
02859                 ++internal_stop_count;
02860             }
02861         }
02862         if (got_stop) {
02863             --internal_stop_count;
02864         }
02865     
02866         if (internal_stop_count > 0) {
02867             has_errors = true;
02868             if (got_dash) {
02869                 if (report_errors  ||  unclassified_except) {
02870                     PostErr(eDiag_Error, eErr_SEQ_FEAT_StartCodon,
02871                         "Illegal start codon and " + 
02872                         NStr::IntToString(internal_stop_count) +
02873                         " internal stops. Probably wrong genetic code [" +
02874                         gccode + "]", feat);
02875                 }
02876             } 
02877             if (report_errors  ||  unclassified_except) {
02878                 PostErr(eDiag_Error, eErr_SEQ_FEAT_InternalStop, 
02879                     NStr::IntToString(internal_stop_count) + 
02880                     " internal stops. Genetic code [" + gccode + "]", feat);
02881             }
02882             prot_ok = false;
02883             if (internal_stop_count > 5) {
02884                 return;
02885             }
02886         } else if (got_dash) {
02887             has_errors = true;
02888             if (report_errors) {
02889                 PostErr(eDiag_Error, eErr_SEQ_FEAT_StartCodon, 
02890                     "Illegal start codon used. Wrong genetic code [" +
02891                     gccode + "] or protein should be partial", feat);
02892                 reported_bad_start_codon = true;
02893             }
02894         }
02895     }
02896     
02897     bool show_stop = true;
02898 
02899     const CSeq_id* protid = 0;
02900     CBioseq_Handle prot_handle;
02901     if (feat.IsSetProduct()) {
02902         try {
02903             protid = &GetId(feat.GetProduct(), m_Scope);
02904         } catch (CException&) {}
02905         if (protid != NULL) {
02906             prot_handle = m_Scope->GetBioseqHandleFromTSE(*protid, m_Imp.GetTSE());
02907             if (!prot_handle  &&  m_Imp.IsFarFetchCDSproducts()) {
02908                 prot_handle = m_Scope->GetBioseqHandle(*protid);
02909             }
02910         }
02911     }
02912 
02913     if (!prot_handle) {
02914         if  (!m_Imp.IsStandaloneAnnot()) {
02915             if (transl_prot.length() > 6) {
02916                 bool is_nt, is_ng, is_nw, is_nc;
02917                 s_LocIdType(location, *m_Scope, m_Imp.GetTSE(), is_nt, is_ng, is_nw, is_nc);
02918                 if (!(is_nt || is_ng || is_nw)) {
02919                     EDiagSev sev = eDiag_Error;
02920                     if (IsDeltaOrFarSeg(location, m_Scope)) {
02921                         sev = eDiag_Warning;
02922                     }
02923                     if (is_nc) {
02924                         sev = eDiag_Warning;
02925                         if ( m_Imp.GetTSE().IsSeq()) {
02926                             sev = eDiag_Info;
02927                         }
02928                     }
02929                     if (sev != eDiag_Info) {
02930                         has_errors = true;
02931                         if (report_errors) {
02932                             PostErr(sev, eErr_SEQ_FEAT_NoProtein, 
02933                                 "No protein Bioseq given", feat);
02934                         }
02935                     }
02936                 }
02937             }
02938             ReportCdTransErrors(feat, show_stop, got_stop, no_end, ragged,
02939                 report_errors, has_errors);
02940         }
02941     }
02942 
02943     // can't check for mismatches unless there is a product
02944     if (feat.IsSetProduct()) {
02945         CSeqVector prot_vec = prot_handle.GetSeqVector();
02946         prot_vec.SetCoding(CSeq_data::e_Ncbieaa);
02947         size_t prot_len = prot_vec.size(); 
02948         size_t len = transl_prot.length();
02949 
02950         if (got_stop  &&  (len == prot_len + 1)) { // ok, got stop
02951             --len;
02952         }
02953 
02954         // ignore terminal 'X' from partial last codon if present
02955         while (prot_len > 0) {
02956             if (prot_vec[prot_len - 1] == 'X') {  //remove terminal X
02957                 --prot_len;
02958             } else {
02959                 break;
02960             }
02961         }
02962         
02963         while (len > 0) {
02964             if (transl_prot[len - 1] == 'X') {  //remove terminal X
02965                 --len;
02966             } else {
02967                 break;
02968             }
02969         }
02970 
02971         vector<TSeqPos> mismatches;
02972         if (len == prot_len)  {                // could be identical
02973             for (TSeqPos i = 0; i < len; ++i) {
02974                 if (transl_prot[i] != prot_vec[i]) {
02975                     has_errors = true;
02976                     mismatches.push_back(i);
02977                     prot_ok = false;
02978                 }
02979             }
02980         } else {
02981             has_errors = true;
02982             if (report_errors) {
02983                 PostErr(eDiag_Error, eErr_SEQ_FEAT_TransLen,
02984                     "Given protein length [" + NStr::IntToString(prot_len) + 
02985                     "] does not match " + farstr + "translation length [" + 
02986                     NStr::IntToString(len) + "]", feat);
02987             }
02988         }
02989         
02990         // Mismatch on first residue
02991         string msg;
02992         if (!mismatches.empty()  &&  mismatches.front() == 0) {
02993             if (feat.IsSetPartial() && feat.GetPartial() && (!no_beg) && (!no_end)) {
02994                 if (report_errors) {
02995                     PostErr(eDiag_Error, eErr_SEQ_FEAT_PartialProblem, 
02996                         "Start of location should probably be partial",
02997                         feat);
02998                 }
02999             } else if (transl_prot[mismatches.front()] == '-') {
03000                 if (report_errors && !reported_bad_start_codon) {
03001                     PostErr(eDiag_Error, eErr_SEQ_FEAT_StartCodon,
03002                         "Illegal start codon used. Wrong genetic code [" +
03003                         gccode + "] or protein should be partial", feat);
03004                 }
03005             }
03006         }
03007 
03008         char prot_res, transl_res;
03009         string nuclocstr;
03010         if (mismatches.size() > 10) {
03011             // report total number of mismatches and the details of the 
03012             // first and last.
03013             nuclocstr = MapToNTCoords(feat, feat.GetProduct(), mismatches.front());
03014             prot_res = prot_vec[mismatches.front()];
03015             transl_res = Residue(transl_prot[mismatches.front()]);
03016             msg = 
03017                 NStr::IntToString(mismatches.size()) + " mismatches found. " +
03018                 "First mismatch at " + NStr::IntToString(mismatches.front() + 1) +
03019                 ", residue in protein [" + prot_res + "]" +
03020                 " != translation [" + transl_res + "]";
03021             if (!nuclocstr.empty()) {
03022                 msg += " at " + nuclocstr;
03023             }
03024             nuclocstr = MapToNTCoords(feat, feat.GetProduct(), mismatches.back());
03025             prot_res = prot_vec[mismatches.back()];
03026             transl_res = Residue(transl_prot[mismatches.back()]);
03027             msg += 
03028                 ". Last mismatch at " + NStr::IntToString(mismatches.back() + 1) +
03029                 ", residue in protein [" + prot_res + "]" +
03030                 " != translation [" + transl_res + "]";
03031             if (!nuclocstr.empty()) {
03032                 msg += " at " + nuclocstr;
03033             }
03034             msg += ". Genetic code [" + gccode + "]";
03035             if (report_errors  &&  !mismatch_except) {
03036                 PostErr(eDiag_Error, eErr_SEQ_FEAT_MisMatchAA, msg, feat);
03037             }
03038         } else {
03039             // report individual mismatches
03040             for (size_t i = 0; i < mismatches.size(); ++i) {
03041                 nuclocstr = MapToNTCoords(feat, feat.GetProduct(), mismatches[i]);
03042                 prot_res = prot_vec[mismatches[i]];
03043                 transl_res = Residue(transl_prot[mismatches[i]]);
03044                 msg = farstr + "Residue " + NStr::IntToString(mismatches[i] + 1) +
03045                     " in protein [" + prot_res + "]" +
03046                     " != translation [" + transl_res + "]";
03047                 if (!nuclocstr.empty()) {
03048                     msg += " at " + nuclocstr;
03049                 }
03050                 if (report_errors  &&  !mismatch_except) {
03051                     PostErr(eDiag_Error, eErr_SEQ_FEAT_MisMatchAA, msg, feat);
03052                 }
03053             }
03054         }
03055 
03056         if (feat.CanGetPartial()  &&  feat.GetPartial()  &&
03057             mismatches.empty()) {
03058             if (!no_beg  && !no_end) {
03059                 if (report_errors) {
03060                     if (!got_stop) {
03061                         PostErr(eDiag_Error, eErr_SEQ_FEAT_PartialProblem, 
03062                             "End of location should probably be partial", feat);
03063                     } else {
03064                         PostErr(eDiag_Error, eErr_SEQ_FEAT_PartialProblem,
03065                             "This SeqFeat should not be partial", feat);
03066                     }
03067                 }
03068                 show_stop = false;
03069             }
03070         }
03071     }
03072 
03073     ReportCdTransErrors(feat, show_stop, got_stop, no_end, ragged, 
03074         report_errors, has_errors);
03075 
03076     if (report_errors && transl_except) {
03077         if (prot_ok) {
03078             PostErr(eDiag_Warning, eErr_SEQ_FEAT_TranslExcept,
03079                 "Unparsed transl_except qual (but protein is okay). Skipped", feat);
03080         } else {
03081             PostErr(eDiag_Warning, eErr_SEQ_FEAT_TranslExcept,
03082                 "Unparsed transl_except qual. Skipped", feat);
03083         }
03084     }
03085 
03086     if (!report_errors  &&  has_errors) {
03087         PostErr(eDiag_Warning, eErr_SEQ_FEAT_UnnecessaryException,
03088             "CDS has exception but passes translation test", feat);
03089     }
03090 }
03091 
03092 
03093 bool CValidError_feat::x_ValidateCodeBreakNotOnCodon
03094 (const CSeq_feat& feat,
03095  const CSeq_loc& loc, 
03096  const CCdregion& cdregion,
03097  bool report_errors)
03098 {
03099     TSeqPos len = GetLength(loc, m_Scope);
03100     bool has_errors = false;
03101 
03102     FOR_EACH_CODEBREAK_ON_CDREGION (cbr, cdregion) {
03103         size_t codon_length = GetLength((*cbr)->GetLoc(), m_Scope);
03104         TSeqPos from = LocationOffset(loc, (*cbr)->GetLoc(), 
03105             eOffset_FromStart, m_Scope);
03106         TSeqPos to = from + codon_length - 1;
03107         
03108         // check for code break not on a codon
03109         if (codon_length == 3  ||
03110             ((codon_length == 1  ||  codon_length == 2)  &&  to == len - 1)) {
03111             size_t start_pos;
03112             switch (cdregion.GetFrame()) {
03113             case CCdregion::eFrame_two:
03114                 start_pos = 1;
03115                 break;
03116             case CCdregion::eFrame_three:
03117                 start_pos = 2;
03118                 break;
03119             default:
03120                 start_pos = 0;
03121                 break;
03122             }
03123             if ((from % 3) != start_pos) {
03124                 if (report_errors) {
03125                     PostErr(eDiag_Warning, eErr_SEQ_FEAT_TranslExceptPhase,
03126                         "transl_except qual out of frame.", feat);
03127                 }
03128                 has_errors = true;
03129             }
03130         }
03131     }
03132     return has_errors;
03133 }
03134 
03135 
03136 // Check for redundant gene Xref
03137 void CValidError_feat::ValidateGeneXRef(const CSeq_feat& feat)
03138 {
03139     const CGene_ref* grp = feat.GetGeneXref();
03140     if ( !grp  ||  grp->IsSuppressed() ) {
03141         return;
03142     }
03143 
03144     // !!! DEBUG {
03145     if( m_Imp.AvoidPerfBottlenecks() ) {
03146         return;
03147     } 
03148     // } DEBUG
03149     
03150     CConstRef<CSeq_feat> overlap = 
03151         GetOverlappingGene(feat.GetLocation(), *m_Scope);
03152     if ( !overlap ) {
03153         return;
03154     }
03155     
03156     const CGene_ref& gene = overlap->GetData().GetGene();
03157     if ( !gene.IsSuppressed() ) {
03158         if ( gene.CanGetAllele()  && !gene.GetAllele().empty() ) {
03159             const string& allele = gene.GetAllele();
03160 
03161             FOR_EACH_GBQUAL_ON_FEATURE (qual_iter, feat) {
03162                 const CGb_qual& qual = **qual_iter;
03163                 if ( qual.CanGetQual()  &&
03164                      NStr::Compare(qual.GetQual(), "allele") == 0 ) {
03165                     if ( qual.CanGetVal()  &&
03166                          NStr::CompareNocase(qual.GetVal(), allele) == 0 ) {
03167                         PostErr(eDiag_Warning, eErr_SEQ_FEAT_InvalidQualifierValue,
03168                             "Redundant allele qualifier (" + allele + 
03169                             ") on gene and feature", feat);
03170                     } else if (feat.GetData().GetSubtype() != CSeqFeatData::eSubtype_variation) {
03171                         PostErr(eDiag_Warning, eErr_SEQ_FEAT_InvalidQualifierValue,
03172                             "Mismatched allele qualifier on gene (" + allele + 
03173                             ") and feature (" + qual.GetVal() +")", feat);
03174                     }
03175                 }
03176             }
03177         }
03178     }
03179 
03180     string label, gene_label;
03181     grp->GetLabel(&label);
03182     gene.GetLabel(&gene_label);
03183     
03184     if ( NStr::CompareNocase(label, gene_label) == 0 ) {
03185         PostErr(eDiag_Warning, eErr_SEQ_FEAT_UnnecessaryGeneXref,
03186             "Unnecessary gene cross-reference " + label, feat);
03187     }
03188 }
03189 
03190 
03191 void CValidError_feat::ValidateOperon(const CSeq_feat& gene)
03192 {
03193     CConstRef<CSeq_feat> operon = 
03194         GetOverlappingOperon(gene.GetLocation(), *m_Scope);
03195     if ( !operon  ||  !operon->CanGetQual() ) {
03196         return;
03197     }
03198 
03199     string label;
03200     feature::GetLabel(gene, &label, feature::eContent, m_Scope);
03201     if ( label.empty() ) {
03202         return;
03203     }
03204 
03205     FOR_EACH_GBQUAL_ON_FEATURE (qual_iter, gene) {
03206         const CGb_qual& qual = **qual_iter;
03207         if( qual.CanGetQual()  &&  qual.CanGetVal() ) {
03208             if ( NStr::Compare(qual.GetQual(), "operon") == 0  &&
03209                  NStr::CompareNocase(qual.GetVal(), label) ) {
03210                 PostErr(eDiag_Warning, eErr_SEQ_FEAT_InvalidQualifierValue,
03211                     "Operon is same as gene - " + label, gene);
03212             }
03213         }
03214     }
03215 }
03216 
03217 
03218 bool CValidError_feat::IsPartialAtSpliceSite
03219 (const CSeq_loc& loc,
03220  unsigned int tag)
03221 {
03222     if ( tag != eSeqlocPartial_Nostart && tag != eSeqlocPartial_Nostop ) {
03223         return false;
03224     }
03225 
03226     CSeq_loc_CI first, last;
03227     for ( CSeq_loc_CI sl_iter(loc); sl_iter; ++sl_iter ) { // EQUIV_IS_ONE not supported
03228         if ( !first ) {
03229             first = sl_iter;
03230         }
03231         last = sl_iter;
03232     }
03233 
03234     if ( first.GetStrand() != last.GetStrand() ) {
03235         return false;
03236     }
03237     CSeq_loc_CI temp = (tag == eSeqlocPartial_Nostart) ? first : last;
03238 
03239     CBioseq_Handle bsh = m_Scope->GetBioseqHandle(temp.GetSeq_id());
03240     if ( !bsh ) {
03241         return false;
03242     }
03243     
03244     TSeqPos acceptor = temp.GetRange().GetFrom();
03245     TSeqPos donor = temp.GetRange().GetTo();
03246     TSeqPos start = acceptor;
03247     TSeqPos stop = donor;
03248 
03249     CSeqVector vec = bsh.GetSeqVector(CBioseq_Handle::eCoding_Iupac,
03250         temp.GetStrand());
03251     TSeqPos len = vec.size();
03252 
03253     if ( temp.GetStrand() == eNa_strand_minus ) {
03254         swap(acceptor, donor);
03255         stop = len - donor - 1;
03256         start = len - acceptor - 1;
03257     }
03258 
03259     bool result = false;
03260 
03261     
03262     if ( (tag == eSeqlocPartial_Nostop)  &&  (stop < len - 2) ) {
03263         try {
03264             CSeqVector::TResidue res1 = vec[stop + 1];
03265             CSeqVector::TResidue res2 = vec[stop + 2];
03266 
03267             if ( IsResidue(res1)  &&  IsResidue(res2) ) {
03268                 if ( (res1 == 'G'  &&  res2 == 'T')  || 
03269                     (res1 == 'G'  &&  res2 == 'C') ) {
03270                     result = true;
03271                 }
03272             }
03273         } catch ( exception& ) {
03274             return false;
03275         }
03276     } else if ( (tag == eSeqlocPartial_Nostart)  &&  (start > 1) ) {
03277         try {
03278             CSeqVector::TResidue res1 = vec[start - 2];    
03279             CSeqVector::TResidue res2 = vec[start - 1];
03280         
03281             if ( IsResidue(res1)  &&  IsResidue(res2) ) {
03282                 if ( (res1 == 'A')  &&  (res2 == 'G') ) { 
03283                     result = true;
03284                 }
03285             }
03286         } catch ( exception& ) {
03287             return false;
03288         }
03289     }
03290 
03291     return result;    
03292 }
03293 
03294 
03295 void CValidError_feat::ValidateFeatCit
03296 (const CPub_set& cit,
03297  const CSeq_feat& feat)
03298 {
03299     if ( !feat.CanGetCit() ) {
03300         return;
03301     }
03302 
03303     if ( cit.IsPub() ) {
03304         ITERATE( CPub_set::TPub, pi, cit.GetPub() ) {
03305             if ( (*pi)->IsEquiv() ) {
03306                 PostErr(eDiag_Warning, eErr_SEQ_FEAT_UnnecessaryCitPubEquiv,
03307                     "Citation on feature has unexpected internal Pub-equiv",
03308                     feat);
03309                 return;
03310             }
03311         }
03312     }
03313 }
03314 
03315 void CValidError_feat::ValidateFeatComment
03316 (const string& comment,
03317  const CSeq_feat& feat)
03318 {
03319     if ( m_Imp.IsSerialNumberInComment(comment) ) {
03320         PostErr(eDiag_Info, eErr_SEQ_FEAT_SerialInComment,
03321             "Feature comment may refer to reference by serial number - "
03322             "attach reference specific comments to the reference "
03323             "REMARK instead.", feat);
03324     }
03325 }
03326 
03327 
03328 void CValidError_feat::ValidateFeatBioSource
03329 (const CBioSource& bsrc,
03330  const CSeq_feat& feat)
03331 {
03332     if ( bsrc.IsSetIs_focus() ) {
03333         PostErr(eDiag_Error, eErr_SEQ_FEAT_FocusOnBioSourceFeature,
03334             "Focus must be on BioSource descriptor, not BioSource feature.",
03335             feat);
03336     }
03337 
03338     CBioseq_Handle bsh = m_Scope->GetBioseqHandle(feat.GetLocation());
03339     if ( !bsh ) {
03340         return;
03341     }
03342     CSeqdesc_CI dbsrc_i(bsh, CSeqdesc::e_Source);
03343     if ( !dbsrc_i ) {
03344         return;
03345     }
03346     
03347     const COrg_ref& org = bsrc.GetOrg();           
03348     const CBioSource& dbsrc = dbsrc_i->GetSource();
03349     const COrg_ref& dorg = dbsrc.GetOrg(); 
03350 
03351     if ( org.CanGetTaxname()  &&  !org.GetTaxname().empty()  &&
03352             dorg.CanGetTaxname() ) {
03353         string taxname = org.GetTaxname();
03354         string dtaxname = dorg.GetTaxname();
03355         if ( NStr::CompareNocase(taxname, dtaxname) != 0 ) {
03356             if ( !dbsrc.CanGetIs_focus()  &&  !m_Imp.IsTransgenic(dbsrc) ) {
03357                 PostErr(eDiag_Error, eErr_SEQ_DESCR_BioSourceNeedsFocus,
03358                     "BioSource descriptor must have focus or transgenic "
03359                     "when BioSource feature with different taxname is "
03360                     "present.", feat);
03361             }
03362         }
03363     }
03364 
03365     FOR_EACH_DBXREF_ON_FEATURE (it, feat) {
03366         if ((*it)->GetType() == CDbtag::eDbtagType_taxon) {
03367             PostErr(eDiag_Warning, eErr_SEQ_FEAT_TaxonDbxrefOnFeature,
03368                 "BioSource feature has taxon xref in common feature db_xref list",
03369                 feat);
03370         }
03371     }
03372 
03373     m_Imp.ValidateBioSource(bsrc, feat);
03374 }
03375 
03376 
03377 void CValidError_feat::ValidateBondLocs(const CSeq_feat& feat)
03378 {
03379     _ASSERT( ! feat.GetData().IsBond() );
03380     
03381     // Bond Seq_locs only allowed on Bond features.
03382     CSeq_loc_CI loc_ci(feat.GetLocation());
03383     for (; loc_ci; ++loc_ci) {
03384         if (loc_ci.GetSeq_loc().IsBond()) {
03385             PostErr(eDiag_Warning, eErr_SEQ_FEAT_ImproperBondLocation,
03386                 "Bond location should only be on bond features", feat);
03387             break;
03388         }
03389     }
03390 }
03391 
03392 // REQUIRES: feature is either Gene or mRNA
03393 bool CValidError_feat::IsSameAsCDS(const CSeq_feat& feat)
03394 {
03395     EOverlapType overlap_type;
03396     if ( feat.GetData().IsGene() ) {
03397         overlap_type = eOverlap_Simple;
03398     } else if ( feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_mRNA ) {
03399         overlap_type = eOverlap_CheckIntervals;
03400     } else {
03401         return false;
03402     }
03403 
03404     CConstRef<CSeq_feat> cds = GetBestOverlappingFeat(
03405             feat.GetLocation(),
03406             CSeqFeatData::e_Cdregion,
03407             overlap_type,
03408             *m_Scope);
03409         if ( cds ) {
03410             if ( TestForOverlap(
03411                     cds->GetLocation(),
03412                     feat.GetLocation(),
03413                     eOverlap_Simple) == 0 ) {
03414                 return true;
03415             }
03416         }
03417     return false;
03418 }
03419 
03420 
03421 bool CValidError_feat::IsCDDFeat(const CSeq_feat& feat) const
03422 {
03423     if ( feat.GetData().IsRegion() ) {
03424         if ( feat.CanGetDbxref() ) {
03425             FOR_EACH_DBXREF_ON_FEATURE (db, feat) {
03426                 if ( (*db)->CanGetDb()  &&
03427                     NStr::Compare((*db)->GetDb(), "CDD") == 0 ) {
03428                     return true;
03429                 }
03430             }
03431         }
03432     }
03433     return false;
03434 }
03435 
03436 void CValidError_feat::x_ValidateSeqFeatLoc(const CSeq_feat& feat)
03437 {
03438     // check for location on multiple near non-part bioseqs
03439     //CSeq_entry_Handle tse = m_Scope->GetSeq_entryHandle(m_Imp.GetTSE());
03440     CSeq_entry_Handle tse = m_Imp.GetTSEH();
03441     const CSeq_loc& loc = feat.GetLocation();
03442     if (IsOneBioseq(loc, m_Scope)) {
03443         return;
03444     }
03445 
03446     CBioseq_Handle bsh;
03447     for (CSeq_loc_CI it(loc) ;it; ++it) {
03448         CBioseq_Handle seq = m_Scope->GetBioseqHandleFromTSE(it.GetSeq_id(), tse);
03449         if (seq  &&  !seq.GetExactComplexityLevel(CBioseq_set::eClass_parts)) {
03450             if (bsh  &&  seq != bsh) {
03451                 PostErr(eDiag_Error, eErr_SEQ_FEAT_MultipleBioseqs,
03452                     "Feature location refers to multiple near bioseqs.", feat);
03453             } else {
03454                 bsh = seq;
03455             }
03456         }
03457     }
03458 }
03459 
03460 
03461 END_SCOPE(validator)
03462 END_SCOPE(objects)
03463 END_NCBI_SCOPE
03464 
03465 

Generated on Sun Mar 15 19:46:08 2009 for NCBI C++ ToolKit by  doxygen 1.4.6
Modified on Mon Mar 16 12:50:19 2009 by modify_doxy.py rev. 117643