00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033 #include <ncbi_pch.hpp>
00034 #include <corelib/ncbistd.hpp>
00035 #include <corelib/ncbistr.hpp>
00036 #include "validatorp.hpp"
00037 #include "utilities.hpp"
00038
00039 #include <serial/serialbase.hpp>
00040
00041 #include <objmgr/bioseq_handle.hpp>
00042 #include <objmgr/seq_entry_handle.hpp>
00043 #include <objmgr/feat_ci.hpp>
00044 #include <objmgr/seqdesc_ci.hpp>
00045 #include <objmgr/seq_vector.hpp>
00046 #include <objmgr/scope.hpp>
00047 #include <objmgr/util/sequence.hpp>
00048 #include <objmgr/util/feature.hpp>
00049
00050 #include <objects/seqfeat/Seq_feat.hpp>
00051 #include <objects/seqfeat/BioSource.hpp>
00052 #include <objects/seqfeat/Cdregion.hpp>
00053 #include <objects/seqfeat/Code_break.hpp>
00054 #include <objects/seqfeat/Gb_qual.hpp>
00055 #include <objects/seqfeat/Genetic_code.hpp>
00056 #include <objects/seqfeat/Genetic_code_table.hpp>
00057 #include <objects/seqfeat/Imp_feat.hpp>
00058 #include <objects/seqfeat/Org_ref.hpp>
00059 #include <objects/seqfeat/Prot_ref.hpp>
00060 #include <objects/seqfeat/RNA_ref.hpp>
00061 #include <objects/seqfeat/SubSource.hpp>
00062 #include <objects/seqfeat/Trna_ext.hpp>
00063
00064 #include <objects/seqloc/Seq_loc.hpp>
00065 #include <objects/seqloc/Seq_interval.hpp>
00066 #include <objects/seqloc/Seq_point.hpp>
00067 #include <objects/seqloc/Textseq_id.hpp>
00068
00069 #include <objects/seqset/Seq_entry.hpp>
00070 #include <objects/seqset/Bioseq_set.hpp>
00071
00072 #include <objects/seq/MolInfo.hpp>
00073 #include <objects/seq/Bioseq.hpp>
00074 #include <objects/seq/seqport_util.hpp>
00075
00076 #include <objects/pub/Pub.hpp>
00077 #include <objects/pub/Pub_set.hpp>
00078
00079 #include <objects/general/Dbtag.hpp>
00080
00081 #include <util/static_set.hpp>
00082 #include <util/sequtil/sequtil_convert.hpp>
00083
00084
00085 #include <algorithm>
00086 #include <string>
00087
00088
00089 BEGIN_NCBI_SCOPE
00090 BEGIN_SCOPE(objects)
00091 BEGIN_SCOPE(validator)
00092 using namespace sequence;
00093
00094
00095
00096
00097
00098
00099
00100 CValidError_feat::CValidError_feat(CValidError_imp& imp) :
00101 CValidError_base(imp),
00102 m_NumGenes(0),
00103 m_NumGeneXrefs(0)
00104 {
00105 }
00106
00107
00108 CValidError_feat::~CValidError_feat(void)
00109 {
00110 }
00111
00112
00113 void CValidError_feat::ValidateSeqFeat(const CSeq_feat& feat, bool is_insd_in_sep)
00114 {
00115 if ( !feat.CanGetLocation() ) {
00116 PostErr(eDiag_Critical, eErr_SEQ_FEAT_MissingLocation,
00117 "The feature is missing a location", feat);
00118 return;
00119 }
00120 _ASSERT(feat.CanGetLocation());
00121
00122 CBioseq_Handle bsh = m_Scope->GetBioseqHandle(feat.GetLocation());
00123 m_Imp.ValidateSeqLoc(feat.GetLocation(), bsh, "Location", feat);
00124 x_ValidateSeqFeatLoc(feat);
00125
00126 if ( feat.CanGetProduct() ) {
00127 ValidateSeqFeatProduct(feat.GetProduct(), feat);
00128 }
00129
00130 ValidateFeatPartialness(feat);
00131
00132 ValidateExcept(feat);
00133
00134 ValidateSeqFeatData(feat.GetData(), feat, is_insd_in_sep);
00135
00136 ValidateBothStrands (feat);
00137
00138 if (feat.CanGetDbxref ()) {
00139 m_Imp.ValidateDbxref (feat.GetDbxref (), feat);
00140 }
00141
00142 if ( feat.CanGetComment() ) {
00143 ValidateFeatComment(feat.GetComment(), feat);
00144 }
00145
00146 if ( feat.CanGetCit() ) {
00147 ValidateFeatCit(feat.GetCit(), feat);
00148 }
00149
00150 const CGene_ref* gene_xref = feat.GetGeneXref();
00151 if ( gene_xref != 0 && !gene_xref->IsSuppressed() ) {
00152 ++m_NumGeneXrefs;
00153 }
00154 }
00155
00156
00157
00158
00159
00160
00161
00162
00163 const string s_PlastidTxt[] = {
00164 "",
00165 "",
00166 "chloroplast",
00167 "chromoplast",
00168 "",
00169 "",
00170 "plastid",
00171 "",
00172 "",
00173 "",
00174 "",
00175 "",
00176 "cyanelle",
00177 "",
00178 "",
00179 "",
00180 "apicoplast",
00181 "leucoplast",
00182 "proplastid",
00183 ""
00184 };
00185
00186
00187 static string s_LegalRepeatTypes[] = {
00188 "tandem", "inverted", "flanking", "terminal",
00189 "direct", "dispersed", "other"
00190 };
00191
00192
00193 static string s_LegalConsSpliceStrings[] = {
00194 "(5'site:YES, 3'site:YES)",
00195 "(5'site:YES, 3'site:NO)",
00196 "(5'site:YES, 3'site:ABSENT)",
00197 "(5'site:NO, 3'site:YES)",
00198 "(5'site:NO, 3'site:NO)",
00199 "(5'site:NO, 3'site:ABSENT)",
00200 "(5'site:ABSENT, 3'site:YES)",
00201 "(5'site:ABSENT, 3'site:NO)",
00202 "(5'site:ABSENT, 3'site:ABSENT)"
00203 };
00204
00205
00206 static bool s_IsLocRefSeqMrna(const CSeq_loc& loc, CScope& scope)
00207 {
00208 CBioseq_Handle bsh = scope.GetBioseqHandle(loc);
00209 if ( bsh ) {
00210 FOR_EACH_SEQID_ON_BIOSEQ (it, *(bsh.GetBioseqCore())) {
00211 if ( (*it)->IdentifyAccession() == CSeq_id::eAcc_refseq_mrna ) {
00212 return true;
00213 }
00214 }
00215 }
00216
00217 return false;
00218 }
00219
00220
00221 static bool s_IsLocGEDL(const CSeq_loc& loc, CScope& scope)
00222 {
00223 CBioseq_Handle bsh = scope.GetBioseqHandle(loc);
00224 if ( bsh ) {
00225 FOR_EACH_SEQID_ON_BIOSEQ (it, *(bsh.GetBioseqCore())) {
00226 CSeq_id::EAccessionInfo acc_info = (*it)->IdentifyAccession();
00227 if ( acc_info == CSeq_id::eAcc_gb_embl_ddbj ||
00228 acc_info == CSeq_id::eAcc_local ) {
00229 return true;
00230 }
00231 }
00232 }
00233
00234 return false;
00235 }
00236
00237
00238
00239
00240 void CValidError_feat::ValidateSeqFeatData
00241 (const CSeqFeatData& data,
00242 const CSeq_feat& feat,
00243 bool is_insd_in_sep)
00244 {
00245 switch ( data.Which () ) {
00246 case CSeqFeatData::e_Gene:
00247
00248 ValidateGene(data.GetGene (), feat);
00249 break;
00250 case CSeqFeatData::e_Cdregion:
00251
00252 ValidateCdregion(data.GetCdregion (), feat);
00253 break;
00254 case CSeqFeatData::e_Prot:
00255
00256 ValidateProt(data.GetProt (), feat);
00257 break;
00258 case CSeqFeatData::e_Rna:
00259
00260 ValidateRna(data.GetRna (), feat);
00261 break;
00262 case CSeqFeatData::e_Pub:
00263
00264 m_Imp.ValidatePubdesc(data.GetPub (), feat);
00265 break;
00266 case CSeqFeatData::e_Imp:
00267
00268 ValidateImp(data.GetImp (), feat, is_insd_in_sep);
00269 break;
00270 case CSeqFeatData::e_Biosrc:
00271
00272 ValidateFeatBioSource(data.GetBiosrc(), feat);
00273 break;
00274
00275 case CSeqFeatData::e_Org:
00276 case CSeqFeatData::e_Region:
00277 case CSeqFeatData::e_Seq:
00278 case CSeqFeatData::e_Comment:
00279 case CSeqFeatData::e_Bond:
00280 case CSeqFeatData::e_Site:
00281 case CSeqFeatData::e_Rsite:
00282 case CSeqFeatData::e_User:
00283 case CSeqFeatData::e_Txinit:
00284 case CSeqFeatData::e_Num:
00285 case CSeqFeatData::e_Psec_str:
00286 case CSeqFeatData::e_Non_std_residue:
00287 case CSeqFeatData::e_Het:
00288 break;
00289
00290 default:
00291 PostErr(eDiag_Error, eErr_SEQ_FEAT_InvalidType,
00292 "Invalid SeqFeat type [" + CSeqFeatData::SelectionName(data.Which()) + "]",
00293 feat);
00294 break;
00295 }
00296
00297 if ( !data.IsGene() ) {
00298 ValidateGeneXRef(feat);
00299 } else {
00300 ValidateOperon(feat);
00301 }
00302
00303 if ( !data.IsBond() ) {
00304 ValidateBondLocs(feat);
00305 }
00306 }
00307
00308
00309 void CValidError_feat::ValidateSeqFeatProduct
00310 (const CSeq_loc& prod, const CSeq_feat& feat)
00311 {
00312 CBioseq_Handle bsh = m_Scope->GetBioseqHandle(feat.GetProduct());
00313 m_Imp.ValidateSeqLoc(feat.GetProduct(), bsh, "Product", feat);
00314
00315 if ( IsOneBioseq(prod, m_Scope) ) {
00316 const CSeq_id& sid = GetId(prod, m_Scope);
00317
00318 switch ( sid.Which() ) {
00319 case CSeq_id::e_Genbank:
00320 case CSeq_id::e_Embl:
00321 case CSeq_id::e_Ddbj:
00322 case CSeq_id::e_Tpg:
00323 case CSeq_id::e_Tpe:
00324 case CSeq_id::e_Tpd:
00325 {
00326 const CTextseq_id* tsid = sid.GetTextseq_Id();
00327 if ( tsid != NULL ) {
00328 if ( !tsid->CanGetAccession() && tsid->CanGetName() ) {
00329 if ( m_Imp.IsNucAcc(tsid->GetName()) ) {
00330 PostErr(eDiag_Warning, eErr_SEQ_FEAT_BadProductSeqId,
00331 "Feature product should not use "
00332 "Textseq-id 'name' slot", feat);
00333 }
00334 }
00335 }
00336 }
00337 break;
00338
00339 default:
00340 break;
00341 }
00342 }
00343 }
00344
00345
00346 bool CValidError_feat::IsPlastid(int genome)
00347 {
00348 if ( genome == CBioSource::eGenome_chloroplast ||
00349 genome == CBioSource::eGenome_chromoplast ||
00350 genome == CBioSource::eGenome_plastid ||
00351 genome == CBioSource::eGenome_cyanelle ||
00352 genome == CBioSource::eGenome_apicoplast ||
00353 genome == CBioSource::eGenome_leucoplast ||
00354 genome == CBioSource::eGenome_proplastid ) {
00355 return true;
00356 }
00357
00358 return false;
00359 }
00360
00361
00362 bool CValidError_feat::IsOverlappingGenePseudo(const CSeq_feat& feat)
00363 {
00364 const CGene_ref* grp = feat.GetGeneXref();
00365 if ( grp ) {
00366 return (grp->CanGetPseudo() && grp->GetPseudo());
00367 }
00368
00369
00370
00371 if ( m_Imp.AvoidPerfBottlenecks() ) {
00372 return false;
00373 }
00374
00375
00376
00377 CConstRef<CSeq_feat> overlap =
00378 GetOverlappingGene(feat.GetLocation(), *m_Scope);
00379 if ( overlap ) {
00380 if ( (overlap->CanGetPseudo() && overlap->GetPseudo()) ||
00381 (overlap->GetData().GetGene().CanGetPseudo() &&
00382 overlap->GetData().GetGene().GetPseudo()) ) {
00383 return true;
00384 }
00385 }
00386
00387 return false;
00388 }
00389
00390
00391 bool CValidError_feat::SuppressCheck(const string& except_text)
00392 {
00393 static string exceptions[] = {
00394 "ribosomal slippage",
00395 "artificial frameshift",
00396 "nonconsensus splice site"
00397 };
00398
00399 for ( size_t i = 0; i < sizeof(exceptions) / sizeof(string); ++i ) {
00400 if ( NStr::FindNoCase(except_text, exceptions[i] ) != string::npos )
00401 return true;
00402 }
00403 return false;
00404 }
00405
00406
00407
00408
00409
00410 unsigned char CValidError_feat::Residue(unsigned char res)
00411 {
00412 return res == 255 ? '?' : res;
00413 }
00414
00415
00416 int CValidError_feat::CheckForRaggedEnd
00417 (const CSeq_loc& loc,
00418 const CCdregion& cdregion)
00419 {
00420 size_t len = GetLength(loc, m_Scope);
00421 if ( cdregion.GetFrame() > CCdregion::eFrame_one ) {
00422 len -= cdregion.GetFrame() - 1;
00423 }
00424
00425 int ragged = len % 3;
00426 if ( ragged > 0 ) {
00427 len = GetLength(loc, m_Scope);
00428 size_t last_pos = 0;
00429
00430 CSeq_loc::TRange range = CSeq_loc::TRange::GetEmpty();
00431 FOR_EACH_CODEBREAK_ON_CDREGION (cbr, cdregion) {
00432 SRelLoc rl(loc, (*cbr)->GetLoc(), m_Scope);
00433 ITERATE (SRelLoc::TRanges, rit, rl.m_Ranges) {
00434 if ((*rit)->GetTo() > last_pos) {
00435 last_pos = (*rit)->GetTo();
00436 }
00437 }
00438 }
00439
00440
00441 TSeqPos codon_length = range.GetLength();
00442 if ( (codon_length == 0 || codon_length == 1) &&
00443 last_pos == len - 1 ) {
00444 ragged = 0;
00445 }
00446 }
00447 return ragged;
00448 }
00449
00450
00451 string CValidError_feat::MapToNTCoords
00452 (const CSeq_feat& feat,
00453 const CSeq_loc& product,
00454 TSeqPos pos)
00455 {
00456 string result;
00457
00458 CSeq_point pnt;
00459 pnt.SetPoint(pos);
00460 pnt.SetStrand( GetStrand(product, m_Scope) );
00461
00462 try {
00463 pnt.SetId().Assign(GetId(product, m_Scope));
00464 } catch (CObjmgrUtilException) {}
00465
00466 CSeq_loc tmp;
00467 tmp.SetPnt(pnt);
00468 CRef<CSeq_loc> loc = ProductToSource(feat, tmp, 0, m_Scope);
00469
00470 loc->GetLabel(&result);
00471
00472 return result;
00473 }
00474
00475
00476 void CValidError_feat::ValidateFeatPartialness(const CSeq_feat& feat)
00477 {
00478 static const string parterr[2] = { "PartialProduct", "PartialLocation" };
00479 static const string parterrs[4] = {
00480 "Start does not include first/last residue of sequence",
00481 "Stop does not include first/last residue of sequence",
00482 "Internal partial intervals do not include first/last residue of sequence",
00483 "Improper use of partial (greater than or less than)"
00484 };
00485
00486 unsigned int partial_prod = eSeqlocPartial_Complete,
00487 partial_loc = eSeqlocPartial_Complete;
00488
00489 bool is_partial = feat.CanGetPartial() && feat.GetPartial();
00490 partial_loc = SeqLocPartialCheck(feat.GetLocation(), m_Scope );
00491 if (feat.CanGetProduct ()) {
00492 partial_prod = SeqLocPartialCheck(feat.GetProduct (), m_Scope );
00493 }
00494
00495 if ( (partial_loc != eSeqlocPartial_Complete) ||
00496 (partial_prod != eSeqlocPartial_Complete) ||
00497 is_partial ) {
00498
00499
00500 if ( !is_partial &&
00501 partial_loc != eSeqlocPartial_Complete &&
00502 feat.GetLocation ().IsWhole() ) {
00503 PostErr(eDiag_Info, eErr_SEQ_FEAT_PartialProblem,
00504 "On partial Bioseq, SeqFeat.partial should be TRUE", feat);
00505 }
00506
00507 else if ( is_partial &&
00508 partial_loc == eSeqlocPartial_Complete &&
00509 feat.CanGetProduct () &&
00510 feat.GetProduct ().IsWhole() &&
00511 partial_prod != eSeqlocPartial_Complete ) {
00512 PostErr(eDiag_Warning, eErr_SEQ_FEAT_PartialProblem,
00513 "When SeqFeat.product is a partial Bioseq, SeqFeat.location "
00514 "should also be partial", feat);
00515 }
00516
00517 else if ( feat.GetData ().IsGene () &&
00518 !is_partial &&
00519 partial_loc == eSeqlocPartial_Internal ) {
00520 PostErr(eDiag_Warning, eErr_SEQ_FEAT_PartialProblem,
00521 "Gene of 'order' with otherwise complete location should "
00522 "have partial flag set", feat);
00523 }
00524
00525 else if (partial_prod == eSeqlocPartial_Complete && feat.CanGetProduct()) {
00526
00527 EDiagSev sev = eDiag_Warning;
00528 if ( IsOneBioseq(feat.GetProduct(), m_Scope) ) {
00529 const CSeq_id& prod_id = GetId(feat.GetProduct(), m_Scope);
00530 CBioseq_Handle prod =
00531 m_Scope->GetBioseqHandleFromTSE(prod_id, m_Imp.GetTSE());
00532 if ( !prod ) {
00533 sev = eDiag_Info;
00534 }
00535 }
00536
00537 string str("Inconsistent: Product= complete, Location= ");
00538 str += (partial_loc != eSeqlocPartial_Complete) ? "partial, " : "complete, ";
00539 str += "Feature.partial= ";
00540 str += is_partial ? "TRUE" : "FALSE";
00541 PostErr(sev, eErr_SEQ_FEAT_PartialsInconsistent, str, feat);
00542 }
00543
00544 else if ( partial_loc == eSeqlocPartial_Complete || !is_partial ) {
00545 string str("Inconsistent: ");
00546 if ( feat.CanGetProduct() ) {
00547 str += "Product= ";
00548 str += (partial_prod != eSeqlocPartial_Complete) ? "partial, " : "complete, ";
00549 }
00550 str += "Location= ";
00551 str += (partial_loc != eSeqlocPartial_Complete) ? "partial, " : "complete, ";
00552 str += "Feature.partial= ";
00553 str += is_partial ? "TRUE" : "FALSE";
00554 PostErr(eDiag_Warning, eErr_SEQ_FEAT_PartialsInconsistent, str, feat);
00555 }
00556
00557 else if ( (((partial_loc & eSeqlocPartial_Start) != 0) ||
00558 ((partial_loc & eSeqlocPartial_Stop) != 0)) &&
00559 ((partial_prod & eSeqlocPartial_Other) != 0) &&
00560 is_partial ) {
00561 PostErr(eDiag_Warning, eErr_SEQ_FEAT_PartialProblem,
00562 "5' or 3' partial location should not have unclassified"
00563 " partial in product molinfo descriptor", feat);
00564 }
00565
00566
00567 unsigned int partials[2] = { partial_prod, partial_loc };
00568 for ( int i = 0; i < 2; ++i ) {
00569 unsigned int errtype = eSeqlocPartial_Nostart;
00570
00571 for ( int j = 0; j < 4; ++j ) {
00572 if (partials[i] & errtype) {
00573 if ( i == 1 && j < 2 && IsCDDFeat(feat) ) {
00574
00575 } else if ( i == 1 && j < 2 &&
00576 IsPartialAtSpliceSite(feat.GetLocation(), errtype) ) {
00577 PostErr(eDiag_Info, eErr_SEQ_FEAT_PartialProblem,
00578 parterr[i] + ": " + parterrs[j] +
00579 " (but is at consensus splice site)", feat);
00580 } else if ( i == 1 && j < 2 &&
00581 (feat.GetData().Which() == CSeqFeatData::e_Gene ||
00582 feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_mRNA) &&
00583 IsSameAsCDS(feat) ) {
00584 PostErr(eDiag_Info, eErr_SEQ_FEAT_PartialProblem,
00585 parterr[i] + ": " + parterrs[j], feat);
00586 } else if (feat.GetData().Which() == CSeqFeatData::e_Cdregion && j == 0) {
00587 PostErr (eDiag_Warning, eErr_SEQ_FEAT_PartialProblem,
00588 parterr[i] +
00589 ": 5' partial is not at start AND is not at consensus splice site",
00590 feat);
00591 } else if (feat.GetData().Which() == CSeqFeatData::e_Cdregion && j == 1) {
00592 PostErr (eDiag_Warning, eErr_SEQ_FEAT_PartialProblem,
00593 parterr[i] +
00594 ": 3' partial is not at stop AND is not at consensus splice site",
00595 feat);
00596 } else {
00597 PostErr (eDiag_Warning, eErr_SEQ_FEAT_PartialProblem,
00598 parterr[i] + ": " + parterrs[j], feat);
00599 }
00600 }
00601 errtype <<= 1;
00602 }
00603 }
00604 }
00605 }
00606
00607
00608 static int s_GetStrictGenCode(const CBioSource& src)
00609 {
00610 int gencode = 0;
00611
00612 try {
00613 CBioSource::TGenome genome = src.IsSetGenome() ? src.GetGenome() : CBioSource::eGenome_unknown;
00614
00615 if ( src.IsSetOrg() && src.GetOrg().IsSetOrgname() ) {
00616 const COrgName& orn = src.GetOrg().GetOrgname();
00617
00618 switch ( genome ) {
00619 case CBioSource::eGenome_kinetoplast:
00620 case CBioSource::eGenome_mitochondrion:
00621
00622 gencode = orn.GetMgcode();
00623 break;
00624 case CBioSource::eGenome_chloroplast:
00625 case CBioSource::eGenome_chromoplast:
00626 case CBioSource::eGenome_plastid:
00627 case CBioSource::eGenome_cyanelle:
00628 case CBioSource::eGenome_apicoplast:
00629 case CBioSource::eGenome_leucoplast:
00630 case CBioSource::eGenome_proplastid:
00631
00632 gencode = 11;
00633 break;
00634 default:
00635 gencode = orn.GetGcode();
00636 break;
00637 }
00638 }
00639 } catch (...) {
00640 }
00641 return gencode;
00642 }
00643
00644
00645 void CValidError_feat::ValidateGene(const CGene_ref& gene, const CSeq_feat& feat)
00646 {
00647 ++m_NumGenes;
00648
00649 if ( (! gene.IsSetLocus() || gene.GetLocus().empty()) &&
00650 (! gene.IsSetAllele() || gene.GetAllele().empty()) &&
00651 (! gene.IsSetDesc() || gene.GetDesc().empty()) &&
00652 (! gene.IsSetMaploc() || gene.GetMaploc().empty()) &&
00653 (! gene.IsSetDb() || gene.GetDb().empty()) &&
00654 (! gene.IsSetSyn() || gene.GetSyn().empty()) &&
00655 (! gene.IsSetLocus_tag() || gene.GetLocus_tag().empty()) ) {
00656 PostErr(eDiag_Warning, eErr_SEQ_FEAT_GeneRefHasNoData,
00657 "There is a gene feature where all fields are empty", feat);
00658 }
00659 if ( gene.CanGetLocus() && !gene.GetLocus().empty() ) {
00660 ITERATE (string, it, gene.GetLocus() ) {
00661 if ( isspace((unsigned char)(*it)) != 0 ) {
00662 PostErr(eDiag_Warning, eErr_SEQ_FEAT_LocusTagProblem,
00663 "Gene locus_tag '" + gene.GetLocus() +
00664 "' should be a single word without any spaces", feat);
00665 break;
00666 }
00667 }
00668
00669 }
00670 if ( gene.CanGetDb () ) {
00671 m_Imp.ValidateDbxref(gene.GetDb(), feat);
00672 }
00673 }
00674
00675
00676 void CValidError_feat::ValidateCdregion (
00677 const CCdregion& cdregion,
00678 const CSeq_feat& feat
00679 )
00680 {
00681 bool pseudo = (feat.CanGetPseudo() && feat.GetPseudo()) ||
00682 IsOverlappingGenePseudo(feat);
00683
00684 bool transl_except = false;
00685
00686 FOR_EACH_GBQUAL_ON_FEATURE (it, feat) {
00687 const CGb_qual& qual = **it;
00688 if ( qual.CanGetQual() ) {
00689 const string& key = qual.GetQual();
00690 if ( NStr::EqualNocase(key, "pseudo") ) {
00691 pseudo = true;
00692 } else if ( NStr::EqualNocase(key, "exception") ) {
00693 if ( !feat.IsSetExcept() ) {
00694 PostErr(eDiag_Warning, eErr_SEQ_FEAT_ExceptInconsistent,
00695 "Exception flag should be set in coding region", feat);
00696 }
00697 } else if ( NStr::EqualNocase(key, "transl_except") ) {
00698 transl_except = true;
00699 } else if ( NStr::EqualNocase(key, "codon") ) {
00700 PostErr(eDiag_Warning, eErr_SEQ_FEAT_WrongQualOnImpFeat,
00701 "Use the proper genetic code, if available, "
00702 "or set transl_excepts on specific codons", feat);
00703 } else if ( NStr::EqualNocase(key, "protein_id") ) {
00704 PostErr(eDiag_Warning, eErr_SEQ_FEAT_WrongQualOnImpFeat,
00705 "protein_id should not be a gbqual on a CDS feature", feat);
00706 } else if ( NStr::EqualNocase(key, "transcript_id") ) {
00707 PostErr(eDiag_Warning, eErr_SEQ_FEAT_WrongQualOnImpFeat,
00708 "transcript_id should not be a gbqual on a CDS feature", feat);
00709 }
00710 }
00711 }
00712 if ( transl_except && feat.CanGetExcept_text() &&
00713 NStr::FindNoCase(feat.GetExcept_text(), "RNA editing") != NPOS ) {
00714 PostErr(eDiag_Warning, eErr_SEQ_FEAT_TranslExceptAndRnaEditing,
00715 "CDS has both RNA editing /exception and /transl_except qualifiers", feat);
00716 }
00717
00718 bool conflict = cdregion.CanGetConflict() && cdregion.GetConflict();
00719 if ( !pseudo && !conflict ) {
00720 ValidateCdTrans(feat);
00721 ValidateSplice(feat, false);
00722 } else if ( conflict ) {
00723 ValidateCdConflict(cdregion, feat);
00724 }
00725 ValidateCdsProductId(feat);
00726
00727 x_ValidateCdregionCodebreak(cdregion, feat);
00728
00729 if ( cdregion.IsSetOrf() && cdregion.GetOrf () &&
00730 feat.IsSetProduct () ) {
00731 PostErr (eDiag_Warning, eErr_SEQ_FEAT_OrfCdsHasProduct,
00732 "An ORF coding region should not have a product", feat);
00733 }
00734
00735 if ( pseudo && feat.CanGetProduct () ) {
00736 PostErr(eDiag_Warning, eErr_SEQ_FEAT_PseudoCdsHasProduct,
00737 "A pseudo coding region should not have a product", feat);
00738 }
00739
00740 CBioseq_Handle bsh = m_Scope->GetBioseqHandle(feat.GetLocation());
00741 if ( bsh ) {
00742
00743 string label;
00744 (*(bsh.GetCompleteBioseq())).GetLabel(&label, CBioseq::eBoth);
00745
00746 CSeqdesc_CI diter (bsh, CSeqdesc::e_Source);
00747 if ( diter ) {
00748 const CBioSource& src = diter->GetSource();
00749 int biopgencode = s_GetStrictGenCode(src);
00750
00751 if ( cdregion.CanGetCode() ) {
00752 int cdsgencode = cdregion.GetCode().GetId();
00753 if ( biopgencode != cdsgencode ) {
00754 int genome = 0;
00755
00756 if ( src.CanGetGenome() ) {
00757 genome = src.GetGenome();
00758 }
00759
00760 if ( IsPlastid(genome) ) {
00761 PostErr(eDiag_Warning, eErr_SEQ_FEAT_GenCodeMismatch,
00762 "Genetic code conflict between CDS (code " +
00763 NStr::IntToString (cdsgencode) +
00764 ") and BioSource.genome biological context (" +
00765 s_PlastidTxt[genome] + ") (uses code 11)", feat);
00766 } else {
00767 PostErr (eDiag_Warning, eErr_SEQ_FEAT_GenCodeMismatch,
00768 "Genetic code conflict between CDS (code " +
00769 NStr::IntToString(cdsgencode) +
00770 ") and BioSource (code " +
00771 NStr::IntToString(biopgencode) + ")", feat);
00772 }
00773 }
00774 }
00775 }
00776 }
00777
00778 ValidateBadGeneOverlap(feat);
00779 ValidateBadMRNAOverlap(feat);
00780 ValidateCommonCDSProduct(feat);
00781 ValidateCDSPartial(feat);
00782 }
00783
00784
00785 void CValidError_feat::x_ValidateCdregionCodebreak
00786 (const CCdregion& cds,
00787 const CSeq_feat& feat)
00788 {
00789 const CSeq_loc& feat_loc = feat.GetLocation();
00790 const CCode_break* prev_cbr = 0;
00791
00792 FOR_EACH_CODEBREAK_ON_CDREGION (it, cds) {
00793 const CCode_break& cbr = **it;
00794 const CSeq_loc& cbr_loc = cbr.GetLoc();
00795 ECompare comp = Compare(cbr_loc, feat_loc, m_Scope);
00796 if ( (comp != eContained) && (comp != eSame)) {
00797 PostErr (eDiag_Error, eErr_SEQ_FEAT_Range,
00798 "Code-break location not in coding region", feat);
00799 }
00800 if ( prev_cbr != 0 ) {
00801 if ( Compare(cbr_loc, prev_cbr->GetLoc(), m_Scope) == eSame ) {
00802 string msg = "Multiple code-breaks at same location";
00803 string str;
00804 cbr_loc.GetLabel(&str);
00805 if ( !str.empty() ) {
00806 msg += "[" + str + "]";
00807 }
00808 PostErr(eDiag_Error, eErr_SEQ_FEAT_DuplicateTranslExcept,
00809 msg, feat);
00810 }
00811 }
00812 prev_cbr = &cbr;
00813 }
00814 }
00815
00816
00817
00818 void CValidError_feat::ValidateCdsProductId(const CSeq_feat& feat)
00819 {
00820
00821 if ( feat.CanGetProduct() ) {
00822 return;
00823 }
00824
00825 bool pseudo = (feat.CanGetPseudo() && feat.GetPseudo()) ||
00826 IsOverlappingGenePseudo(feat);
00827 if ( pseudo ) {
00828 return;
00829 }
00830
00831 if ( feat.CanGetLocation() ) {
00832 const CSeq_loc& loc = feat.GetLocation();
00833 if ( loc.IsPartialStart(eExtreme_Biological) && !loc.IsPartialStop(eExtreme_Biological) ) {
00834 if ( GetLength(loc, m_Scope) <= 5 ) {
00835 return;
00836 }
00837 }
00838 }
00839
00840 if ( feat.CanGetExcept() && feat.CanGetExcept_text() &&
00841 !IsBlankString(feat.GetExcept_text()) ) {
00842 if ( NStr::Find(feat.GetExcept_text(),
00843 "rearrangement required for product") != NPOS ) {
00844 return;
00845 }
00846 }
00847
00848
00849 PostErr(eDiag_Warning, eErr_SEQ_FEAT_MissingCDSproduct,
00850 "Expected CDS product absent", feat);
00851 }
00852
00853
00854 void CValidError_feat::ValidateCdConflict
00855 (const CCdregion& cdregion,
00856 const CSeq_feat& feat)
00857 {
00858 CBioseq_Handle nuc = m_Scope->GetBioseqHandle(feat.GetLocation());
00859 CBioseq_Handle prot = m_Scope->GetBioseqHandle(feat.GetProduct());
00860
00861
00862 string transl_prot;
00863 try {
00864 CCdregion_translate::TranslateCdregion(
00865 transl_prot,
00866 nuc,
00867 feat.GetLocation(),
00868 cdregion,
00869 false,
00870 false);
00871 } catch ( const runtime_error& ) {
00872 }
00873
00874 CSeqVector prot_vec = prot.GetSeqVector(CBioseq_Handle::eCoding_Iupac);
00875 prot_vec.SetCoding(CSeq_data::e_Ncbieaa);
00876
00877 string prot_seq;
00878 prot_vec.GetSeqData(0, prot_vec.size(), prot_seq);
00879
00880 if ( transl_prot.empty() || prot_seq.empty() || transl_prot == prot_seq ) {
00881 PostErr(eDiag_Error, eErr_SEQ_FEAT_BadConflictFlag,
00882 "Coding region conflict flag should not be set", feat);
00883 } else {
00884 PostErr(eDiag_Warning, eErr_SEQ_FEAT_ConflictFlagSet,
00885 "Coding region conflict flag is set", feat);
00886 }
00887 }
00888
00889
00890 void CValidError_feat::ValidateSplice(const CSeq_feat& feat, bool check_all)
00891 {
00892
00893
00894
00895
00896 bool report_errors = true, has_errors = false;
00897
00898
00899 if ( feat.CanGetExcept() && feat.GetExcept() &&
00900 feat.CanGetExcept_text() ) {
00901 if ( SuppressCheck(feat.GetExcept_text()) ) {
00902 report_errors = false;
00903 }
00904 }
00905
00906 size_t num_of_parts = 0;
00907 ENa_strand strand = eNa_strand_unknown;
00908
00909
00910
00911 const CSeq_loc& location = feat.GetLocation ();
00912
00913 for (CSeq_loc_CI citer(location); citer; ++citer) {
00914 ++num_of_parts;
00915 if ( num_of_parts == 1 ) {
00916 strand = citer.GetStrand();
00917 } else {
00918 if ( strand != citer.GetStrand() ) {
00919 return;
00920 }
00921 }
00922 }
00923
00924 if ( num_of_parts == 0 ) {
00925 return;
00926 }
00927 if ( !check_all && num_of_parts == 1 ) {
00928 return;
00929 }
00930
00931 bool partial_first = location.IsPartialStart(eExtreme_Biological);
00932 bool partial_last = location.IsPartialStop(eExtreme_Biological);
00933
00934
00935 size_t counter = 0;
00936 const CSeq_id* last_id = 0;
00937 CBioseq_Handle bsh;
00938 size_t seq_len = 0;
00939 CSeqVector seq_vec;
00940 string label;
00941
00942 for (CSeq_loc_CI citer(location); citer; ++citer) {
00943 ++counter;
00944
00945 const CSeq_id& seq_id = citer.GetSeq_id();
00946 if ( last_id == 0 || !last_id->Match(seq_id) ) {
00947 bsh = m_Scope->GetBioseqHandle(seq_id);
00948 if ( !bsh ) {
00949 break;
00950 }
00951 seq_vec =
00952 bsh.GetSeqVector(CBioseq_Handle::eCoding_Iupac,
00953 citer.GetStrand());
00954 seq_len = seq_vec.size();
00955
00956
00957
00958 const CBioseq& bsq = *bsh.GetCompleteBioseq();
00959 bsq.GetLabel(&label, CBioseq::eContent, m_Imp.IsSuppressContext());
00960
00961 last_id = &seq_id;
00962 }
00963
00964 TSeqPos acceptor = citer.GetRange().GetFrom();
00965 TSeqPos donor = citer.GetRange().GetTo();
00966
00967 TSeqPos start = acceptor;
00968 TSeqPos stop = donor;
00969
00970 if ( citer.GetStrand() == eNa_strand_minus ) {
00971 swap(acceptor, donor);
00972 stop = seq_len - donor - 1;
00973 start = seq_len - acceptor - 1;
00974 }
00975
00976
00977
00978 EDiagSev sev = eDiag_Warning;
00979 if ( m_Imp.IsSpliceErr() &&
00980 !(m_Imp.IsGPS() || m_Imp.IsRefSeq()) &&
00981 !check_all ) {
00982 sev = eDiag_Error;
00983 }
00984
00985
00986 if ( ((check_all && !partial_last) || counter < num_of_parts) &&
00987 (stop < seq_len - 2) ) {
00988 try {
00989 CSeqVector::TResidue res1 = seq_vec[stop + 1];
00990 CSeqVector::TResidue res2 = seq_vec[stop + 2];
00991
00992 if ( IsResidue(res1) && IsResidue(res2) ) {
00993 if ( (res1 != 'G') || (res2 != 'T' ) ) {
00994 has_errors = true;
00995 string msg;
00996 if ( (res1 == 'G') && (res2 == 'C' ) ) {
00997 sev = eDiag_Info;
00998 msg = "Rare splice donor consensus (GC) found instead of "
00999 "(GT) after exon ending at position " +
01000 NStr::IntToString(donor + 1) + " of " + label;
01001 } else {
01002 msg = "Splice donor consensus (GT) not found after exon"
01003 " ending at position " +
01004 NStr::IntToString(donor + 1) + " of " + label;
01005 }
01006 if (report_errors) {
01007 PostErr(sev, eErr_SEQ_FEAT_NotSpliceConsensus, msg, feat);
01008 }
01009 }
01010 }
01011 } catch ( exception& ) {
01012 break;
01013 }
01014 }
01015
01016 if ( ((check_all && !partial_first) || counter != 1) &&
01017 (start > 1) ) {
01018 try {
01019 CSeqVector::TResidue res1 = seq_vec[start - 2];
01020 CSeqVector::TResidue res2 = seq_vec[start - 1];
01021
01022 if ( IsResidue(res1) && IsResidue(res2) ) {
01023 if ( (res1 != 'A') || (res2 != 'G') ) {
01024 has_errors = true;
01025 if (report_errors) {
01026 PostErr(sev, eErr_SEQ_FEAT_NotSpliceConsensus,
01027 "Splice acceptor consensus (AG) not found before "
01028 "exon starting at position " +
01029 NStr::IntToString(acceptor + 1) + " of " + label, feat);
01030 }
01031 }
01032 }
01033 } catch ( exception& ) {
01034 }
01035 }
01036 }
01037
01038 if (!report_errors && !has_errors) {
01039 PostErr(eDiag_Warning, eErr_SEQ_FEAT_UnnecessaryException,
01040 "feature has exception but passes splice site test", feat);
01041 }
01042 }
01043
01044
01045 void CValidError_feat::ValidateProt(const CProt_ref& prot, const CSerialObject& obj)
01046 {
01047 CProt_ref::EProcessed processed = CProt_ref::eProcessed_not_set;
01048
01049 if ( prot.CanGetProcessed() ) {
01050 processed = prot.GetProcessed();
01051 }
01052
01053 bool empty = true;
01054 if ( processed != CProt_ref::eProcessed_signal_peptide &&
01055 processed != CProt_ref::eProcessed_transit_peptide ) {
01056 if ( prot.IsSetName() &&
01057 (!prot.GetName().empty() || !prot.GetName().front().empty()) ) {
01058 empty = false;
01059 }
01060 if ( prot.CanGetDesc() && !prot.GetDesc().empty() ) {
01061 empty = false;
01062 }
01063 if ( prot.CanGetEc() && !prot.GetEc().empty() ) {
01064 empty = false;
01065 }
01066 if ( prot.CanGetActivity() && !prot.GetActivity().empty() ) {
01067 empty = false;
01068 }
01069 if ( prot.CanGetDb() && !prot.GetDb().empty() ) {
01070 empty = false;
01071 }
01072
01073 if ( empty ) {
01074 PostErr(eDiag_Warning, eErr_SEQ_FEAT_ProtRefHasNoData,
01075 "There is a protein feature where all fields are empty", obj);
01076 }
01077 }
01078 if ( prot.CanGetDb () ) {
01079 m_Imp.ValidateDbxref(prot.GetDb(), obj);
01080 }
01081 if ( prot.CanGetDesc() && !prot.GetDesc().empty() &&
01082 prot.GetName().empty() ) {
01083 PostErr(eDiag_Warning, eErr_SEQ_FEAT_NoNameForProtein,
01084 "Protein feature has description but no name", obj);
01085 }
01086 }
01087
01088
01089 void CValidError_feat::ValidateRna(const CRNA_ref& rna, const CSeq_feat& feat)
01090 {
01091 const CRNA_ref::EType& rna_type = rna.GetType ();
01092
01093 if ( rna_type == CRNA_ref::eType_mRNA ) {
01094 bool pseudo = (feat.CanGetPseudo() && feat.GetPseudo()) || IsOverlappingGenePseudo(feat);
01095
01096 if ( !pseudo ) {
01097 ValidateMrnaTrans(feat);
01098 ValidateSplice(feat, false);
01099 }
01100 ValidateBadGeneOverlap(feat);
01101 ValidateCommonMRNAProduct(feat);
01102 }
01103
01104 if ( rna.CanGetExt() &&
01105 rna.GetExt().Which() == CRNA_ref::C_Ext::e_TRNA ) {
01106 const CTrna_ext& trna = rna.GetExt ().GetTRNA ();
01107 if ( trna.CanGetAnticodon () ) {
01108 const CSeq_loc& anticodon = trna.GetAnticodon();
01109 size_t anticodon_len = 0;
01110 bool bad_anticodon = false;
01111 for ( CSeq_loc_CI it(anticodon); it; ++it ) {
01112 anticodon_len += GetLength(anticodon, m_Scope);
01113 ECompare comp = sequence::Compare(anticodon,
01114 feat.GetLocation(),
01115 m_Scope);
01116 if ( comp != eContained && comp != eSame ) {
01117 bad_anticodon = true;
01118 }
01119 }
01120 if ( bad_anticodon ) {
01121 PostErr (eDiag_Error, eErr_SEQ_FEAT_Range,
01122 "Anticodon location not in tRNA", feat);
01123 }
01124 if ( anticodon_len != 3 ) {
01125 PostErr (eDiag_Warning, eErr_SEQ_FEAT_Range,
01126 "Anticodon is not 3 bases in length", feat);
01127 }
01128 ValidateAnticodon(anticodon, feat);
01129 }
01130 ValidateTrnaCodons(trna, feat);
01131 }
01132
01133 if ( rna_type == CRNA_ref::eType_tRNA ) {
01134 FOR_EACH_GBQUAL_ON_FEATURE (gbqual, feat) {
01135 if ( NStr::CompareNocase((**gbqual).GetVal (), "anticodon") == 0 ) {
01136 PostErr (eDiag_Error, eErr_SEQ_FEAT_InvalidQualifierValue,
01137 "Unparsed anticodon qualifier in tRNA", feat);
01138 break;
01139 }
01140 }
01141
01142 if ( rna.CanGetExt() &&
01143 rna.GetExt().Which () == CRNA_ref::C_Ext::e_Name ) {
01144 PostErr (eDiag_Error, eErr_SEQ_FEAT_InvalidQualifierValue,
01145 "Unparsed product qualifier in tRNA", feat);
01146 }
01147 }
01148
01149 if ( rna_type == CRNA_ref::eType_unknown ) {
01150 PostErr(eDiag_Warning, eErr_SEQ_FEAT_RNAtype0,
01151 "RNA type 0 (unknown) not supported", feat);
01152 }
01153
01154 if ( feat.CanGetProduct() ) {
01155 ValidateRnaProductType(rna, feat);
01156 }
01157 }
01158
01159
01160 void CValidError_feat::ValidateAnticodon(const CSeq_loc& anticodon, const CSeq_feat& feat)
01161 {
01162 bool ordered = true;
01163 bool adjacent = false;
01164 bool unmarked_strand = false;
01165 bool mixed_strand = false;
01166
01167 CSeq_loc_CI prev;
01168 for (CSeq_loc_CI curr(anticodon); curr; ++curr) {
01169 if ( !curr.GetSeq_loc().IsInt() && !curr.GetSeq_loc().IsPnt() ) {
01170 continue;
01171 }
01172
01173 if ( prev && curr &&
01174 IsSameBioseq(curr.GetSeq_id(), prev.GetSeq_id(), m_Scope) ) {
01175 CSeq_loc_CI::TRange prev_range = prev.GetRange();
01176 CSeq_loc_CI::TRange curr_range = curr.GetRange();
01177 if ( ordered ) {
01178 if ( curr.GetStrand() == eNa_strand_minus ) {
01179 if (prev_range.GetTo() < curr_range.GetTo()) {
01180 ordered = false;
01181 }
01182 if (curr_range.GetTo() + 1 == prev_range.GetFrom()) {
01183 adjacent = true;
01184 }
01185 } else {
01186 if (prev_range.GetTo() > curr_range.GetTo()) {
01187 ordered = false;
01188 }
01189 if (prev_range.GetTo() + 1 == curr_range.GetFrom()) {
01190 adjacent = true;
01191 }
01192 }
01193 }
01194 ENa_strand curr_strand = curr.GetStrand();
01195 ENa_strand prev_strand = prev.GetStrand();
01196 if ( curr_range == prev_range && curr_strand == prev_strand ) {
01197 PostErr(eDiag_Warning, eErr_SEQ_FEAT_DuplicateInterval,
01198 "Duplicate anticodon exons in location", feat);
01199 }
01200 if ( curr_strand != prev_strand ) {
01201 if (curr_strand == eNa_strand_plus && prev_strand == eNa_strand_unknown) {
01202 unmarked_strand = true;
01203 } else if (curr_strand == eNa_strand_unknown && prev_strand == eNa_strand_plus) {
01204 unmarked_strand = true;
01205 } else {
01206 mixed_strand = true;
01207 }
01208 }
01209 }
01210 prev = curr;
01211 }
01212 if (adjacent) {
01213 PostErr(eDiag_Warning, eErr_SEQ_FEAT_AbuttingIntervals,
01214 "Adjacent intervals in Anticodon", feat);
01215 }
01216
01217
01218 bool trans_splice = false;
01219 if (feat.CanGetExcept() && feat.GetExcept() && feat.CanGetExcept_text()) {
01220 if (NStr::FindNoCase(feat.GetExcept_text(), "trans-splicing") != NPOS) {
01221 trans_splice = true;
01222 }
01223 }
01224 if (!trans_splice) {
01225 if (mixed_strand) {
01226 PostErr(eDiag_Warning, eErr_SEQ_FEAT_MixedStrand,
01227 "Mixed strands in Anticodon", feat);
01228 }
01229 if (unmarked_strand) {
01230 PostErr(eDiag_Warning, eErr_SEQ_FEAT_MixedStrand,
01231 "Mixed plus and unknown strands in Anticodon", feat);
01232 }
01233 if (!ordered) {
01234 PostErr(eDiag_Warning, eErr_SEQ_FEAT_SeqLocOrder,
01235 "Intervals out of order in Anticodon", feat);
01236 }
01237 }
01238 }
01239
01240
01241 void CValidError_feat::ValidateRnaProductType
01242 (const CRNA_ref& rna,
01243 const CSeq_feat& feat)
01244 {
01245 CBioseq_Handle bsh = m_Scope->GetBioseqHandle(feat.GetProduct());
01246 if ( !bsh ) {
01247 return;
01248 }
01249 CSeqdesc_CI di(bsh, CSeqdesc::e_Molinfo);
01250 if ( !di ) {
01251 return;
01252 }
01253 const CMolInfo& mol_info = di->GetMolinfo();
01254 if ( !mol_info.CanGetBiomol() ) {
01255 return;
01256 }
01257 int biomol = mol_info.GetBiomol();
01258
01259 switch ( rna.GetType() ) {
01260
01261 case CRNA_ref::eType_mRNA:
01262 if ( biomol == CMolInfo::eBiomol_mRNA ) {
01263 return;
01264 }
01265 break;
01266
01267 case CRNA_ref::eType_tRNA:
01268 if ( biomol == CMolInfo::eBiomol_tRNA ) {
01269 return;
01270 }
01271 break;
01272
01273 case CRNA_ref::eType_rRNA:
01274 if ( biomol == CMolInfo::eBiomol_rRNA ) {
01275 return;
01276 }
01277 break;
01278
01279 default:
01280 return;
01281 }
01282
01283 PostErr(eDiag_Error, eErr_SEQ_FEAT_RnaProductMismatch,
01284 "Type of RNA does not match MolInfo of product Bioseq", feat);
01285 }
01286
01287
01288 int s_LegalNcbieaaValues[] = { 42, 65, 66, 67, 68, 69, 70, 71, 72, 73,
01289 75, 76, 77, 78, 80, 81, 82, 83, 84, 85,
01290 86, 87, 88, 89, 90 };
01291
01292
01293 static const char* kAANames[] = {
01294 "---", "Ala", "Asx", "Cys", "Asp", "Glu", "Phe", "Gly", "His", "Ile",
01295 "Lys", "Leu", "Met", "Asn", "Pro", "Gln", "Arg", "Ser", "Thr", "Val",
01296 "Trp", "OTHER", "Tyr", "Glx", "Sec", "TERM"
01297 };
01298
01299
01300 const char* GetAAName(unsigned char aa, bool is_ascii)
01301 {
01302 try {
01303 if (is_ascii) {
01304 aa = CSeqportUtil::GetMapToIndex
01305 (CSeq_data::e_Ncbieaa, CSeq_data::e_Ncbistdaa, aa);
01306 }
01307 return (aa < sizeof(kAANames)/sizeof(*kAANames)) ? kAANames[aa] : "OTHER";
01308 } catch (...) {
01309 return "OTHER";
01310 }
01311 }
01312
01313
01314 static string GetGeneticCodeName (int gcode)
01315 {
01316 const CGenetic_code_table& code_table = CGen_code_table::GetCodeTable();
01317 const list<CRef<CGenetic_code> >& codes = code_table.Get();
01318
01319 for ( list<CRef<CGenetic_code> >::const_iterator code_it = codes.begin(), code_it_end = codes.end(); code_it != code_it_end; ++code_it ) {
01320 if ((*code_it)->GetId() == gcode) {
01321 return (*code_it)->GetName();
01322 }
01323 }
01324 return "unknown";
01325 }
01326
01327 void CValidError_feat::ValidateTrnaCodons(const CTrna_ext& trna, const CSeq_feat& feat)
01328 {
01329 if (!trna.IsSetAa()) {
01330 return;
01331 }
01332
01333 vector<char> seqData;
01334 string str = "";
01335
01336 switch (trna.GetAa().Which()) {
01337 case CTrna_ext::C_Aa::e_Iupacaa:
01338 str = trna.GetAa().GetIupacaa();
01339 CSeqConvert::Convert(str, CSeqUtil::e_Iupacaa, 0, str.size(), seqData, CSeqUtil::e_Ncbieaa);
01340 break;
01341 case CTrna_ext::C_Aa::e_Ncbi8aa:
01342 str = trna.GetAa().GetNcbi8aa();
01343 CSeqConvert::Convert(str, CSeqUtil::e_Ncbi8aa, 0, str.size(), seqData, CSeqUtil::e_Ncbieaa);
01344 break;
01345 case CTrna_ext::C_Aa::e_Ncbistdaa:
01346 str = trna.GetAa().GetNcbi8aa();
01347 CSeqConvert::Convert(str, CSeqUtil::e_Ncbistdaa, 0, str.size(), seqData, CSeqUtil::e_Ncbieaa);
01348 break;
01349 case CTrna_ext::C_Aa::e_Ncbieaa:
01350 seqData.push_back(trna.GetAa().GetNcbieaa());
01351 break;
01352 default:
01353 NCBI_THROW (CCoreException, eCore, "Unrecognized tRNA aa coding");
01354 break;
01355 }
01356 unsigned char aa = seqData[0];
01357
01358
01359 bool found = false;
01360 for ( int i = 0; i < 25; ++i ) {
01361 if ( aa == s_LegalNcbieaaValues[i] ) {
01362 found = true;
01363 break;
01364 }
01365 }
01366 if ( !found ) {
01367 PostErr(eDiag_Error, eErr_SEQ_FEAT_BadTrnaAA,
01368 "Invalid tRNA amino acid (" + NStr::IntToString(aa) + ")", feat);
01369 aa = ' ';
01370 }
01371
01372
01373 int gcode = 1;
01374 CBioseq_Handle bsh = m_Scope->GetBioseqHandle(feat.GetLocation());
01375 if ( bsh ) {
01376
01377 CSeqdesc_CI diter(bsh, CSeqdesc::e_Source);
01378 if ( diter ) {
01379 gcode = diter->GetSource().GetGenCode();
01380 }
01381 }
01382
01383 const string& ncbieaa = CGen_code_table::GetNcbieaa(gcode);
01384 if ( ncbieaa.length() != 64 ) {
01385 return;
01386 }
01387
01388 string codename = GetGeneticCodeName (gcode);
01389 char buf[2];
01390 buf[0] = aa;
01391 buf[1] = 0;
01392 string aaname = buf;
01393 aaname += "/";
01394 aaname += GetAAName (aa, true);
01395
01396 EDiagSev sev = (aa == 'U') ? eDiag_Warning : eDiag_Error;
01397
01398 bool modified_codon_recognition = false;
01399 if ( feat.CanGetExcept_text() &&
01400 NStr::FindNoCase(feat.GetExcept_text(), "modified codon recognition") != NPOS ) {
01401 modified_codon_recognition = true;
01402 }
01403
01404 ITERATE( CTrna_ext::TCodon, iter, trna.GetCodon() ) {
01405
01406 if ( *iter < 0 || *iter > 63 ) {
01407 PostErr(sev, eErr_SEQ_FEAT_TrnaCodonWrong,
01408 "tRNA codon value (" + NStr::IntToString(*iter) +
01409 ") out of range", feat);
01410 continue;
01411 }
01412
01413 if ( !modified_codon_recognition ) {
01414 unsigned char taa = ncbieaa[*iter];
01415 if ( taa != aa ) {
01416 if ( (aa == 'U') && (taa == '*') && (*iter == 14) ) {
01417
01418
01419
01420 } else {
01421 string codon = CGen_code_table::IndexToCodon(*iter);
01422 NStr::ReplaceInPlace (codon, "T", "U");
01423
01424 PostErr(sev, eErr_SEQ_FEAT_TrnaCodonWrong,
01425 "tRNA codon (" + codon + ") does not match amino acid ("
01426 + aaname + ") specified by genetic code ("
01427 + NStr::IntToString (gcode) + "/" + codename + ")", feat);
01428 }
01429 }
01430 }
01431 }
01432 }
01433
01434
01435 void CValidError_feat::ValidateImp(const CImp_feat& imp, const CSeq_feat& feat, bool is_insd_in_sep)
01436 {
01437 CSeqFeatData::ESubtype subtype = feat.GetData().GetSubtype();
01438 const string& key = imp.GetKey();
01439
01440 switch ( subtype ) {
01441 case CSeqFeatData::eSubtype_exon:
01442 if ( m_Imp.IsValidateExons() ) {
01443 ValidateSplice(feat, true);
01444 }
01445 break;
01446
01447 case CSeqFeatData::eSubtype_bad:
01448 PostErr(eDiag_Warning, eErr_SEQ_FEAT_UnknownImpFeatKey,
01449 "Unknown feature key " + key, feat);
01450 break;
01451
01452 case CSeqFeatData::eSubtype_virion:
01453 case CSeqFeatData::eSubtype_mutation:
01454 case CSeqFeatData::eSubtype_allele:
01455 PostErr(eDiag_Error, eErr_SEQ_FEAT_UnknownImpFeatKey,
01456 "Feature key " + key + " is no longer legal", feat);
01457 break;
01458
01459 case CSeqFeatData::eSubtype_polyA_site:
01460 {
01461 CSeq_loc::TRange range = feat.GetLocation().GetTotalRange();
01462 if ( range.GetFrom() != range.GetTo() ) {
01463 PostErr(eDiag_Warning, eErr_SEQ_FEAT_PolyAsiteNotPoint,
01464 "PolyA_site should be a single point", feat);
01465 }
01466 }
01467 break;
01468
01469 case CSeqFeatData::eSubtype_mat_peptide:
01470 case CSeqFeatData::eSubtype_sig_peptide:
01471 case CSeqFeatData::eSubtype_transit_peptide:
01472 PostErr(eDiag_Warning, eErr_SEQ_FEAT_InvalidForType,
01473 "Peptide processing feature should be converted to the "
01474 "appropriate protein feature subtype", feat);
01475 ValidatePeptideOnCodonBoundry(feat, key);
01476 break;
01477
01478 case CSeqFeatData::eSubtype_mRNA:
01479 case CSeqFeatData::eSubtype_tRNA:
01480 case CSeqFeatData::eSubtype_rRNA:
01481 case CSeqFeatData::eSubtype_snRNA:
01482 case CSeqFeatData::eSubtype_scRNA:
01483 case CSeqFeatData::eSubtype_snoRNA:
01484 case CSeqFeatData::eSubtype_misc_RNA:
01485 case CSeqFeatData::eSubtype_precursor_RNA:
01486
01487 PostErr(eDiag_Error, eErr_SEQ_FEAT_InvalidForType,
01488 "RNA feature should be converted to the appropriate RNA feature "
01489 "subtype, location should be converted manually", feat);
01490 break;
01491
01492 case CSeqFeatData::eSubtype_Imp_CDS:
01493 {
01494
01495 bool pseudo = (feat.CanGetPseudo() && feat.GetPseudo()) ||
01496 IsOverlappingGenePseudo(feat);
01497 if ( !pseudo ) {
01498 PostErr(eDiag_Info, eErr_SEQ_FEAT_ImpCDSnotPseudo,
01499 "ImpFeat CDS should be pseudo", feat);
01500 }
01501
01502 FOR_EACH_GBQUAL_ON_FEATURE (gbqual, feat) {
01503 if ( NStr::CompareNocase( (*gbqual)->GetQual(), "translation") == 0 ) {
01504 PostErr(eDiag_Error, eErr_SEQ_FEAT_ImpCDShasTranslation,
01505 "ImpFeat CDS with /translation found", feat);
01506 }
01507 }
01508 }
01509 break;
01510 case CSeqFeatData::eSubtype_imp:
01511 PostErr(eDiag_Error, eErr_SEQ_FEAT_UnknownImpFeatKey,
01512 "Unknown feature key " + key, feat);
01513 break;
01514 default:
01515 break;
01516 }
01517
01518
01519 if ( imp.CanGetLoc() ) {
01520 const string& imp_loc = imp.GetLoc();
01521 if ( imp_loc.find("one-of") != string::npos ) {
01522 PostErr(eDiag_Error, eErr_SEQ_FEAT_ImpFeatBadLoc,
01523 "ImpFeat loc " + imp_loc +
01524 " has obsolete 'one-of' text for feature " + key, feat);
01525 } else if ( feat.GetLocation().IsInt() ) {
01526 const CSeq_interval& seq_int = feat.GetLocation().GetInt();
01527 string temp_loc = NStr::IntToString(seq_int.GetFrom() + 1) +
01528 ".." + NStr::IntToString(seq_int.GetTo() + 1);
01529 if ( imp_loc != temp_loc ) {
01530 PostErr(eDiag_Error, eErr_SEQ_FEAT_ImpFeatBadLoc,
01531 "ImpFeat loc " + imp_loc + " does not equal feature location " +
01532 temp_loc + "for feature " + key, feat);
01533 }
01534 }
01535 }
01536
01537 if ( feat.CanGetQual() ) {
01538 ValidateImpGbquals(imp, feat, is_insd_in_sep);
01539 }
01540
01541
01542 bool found = false;
01543 switch (subtype) {
01544 case CSeqFeatData::eSubtype_conflict:
01545 case CSeqFeatData::eSubtype_old_sequence:
01546 if (!feat.IsSetCit() && NStr::IsBlank(feat.GetNamedQual("compare"))) {
01547 PostErr(eDiag_Warning, eErr_SEQ_FEAT_MissingQualOnImpFeat,
01548 "Feature " + key + " requires either /compare or /citation (or both)",
01549 feat);
01550 }
01551 break;
01552 default:
01553 ITERATE (CFeatQualAssoc::TGBQualTypeVec, required, CFeatQualAssoc::GetMandatoryGbquals(subtype)) {
01554 found = false;
01555 FOR_EACH_GBQUAL_ON_FEATURE (qual, feat) {
01556 if (CGbqualType::GetType(**qual) == *required) {
01557 found = true;
01558 break;
01559 }
01560 }
01561 if (!found) {
01562 PostErr(eDiag_Warning, eErr_SEQ_FEAT_MissingQualOnImpFeat,
01563 "Missing qualifier " + CGbqualType::GetString(*required) +
01564 " for feature " + key, feat);
01565 }
01566 }
01567 break;
01568 }
01569 }
01570
01571
01572 static bool s_RptUnitIsBaseRange (string str, int& from, int& to)
01573
01574 {
01575 if (str.length() > 25) {
01576 return false;
01577 }
01578 SIZE_TYPE pos = NStr::Find (str, "..");
01579 if (pos == string::npos) {
01580 return false;
01581 }
01582 try {
01583 from = NStr::StringToInt (str.substr(0, pos));
01584 to = NStr::StringToInt (str.substr (pos + 2));
01585 } catch (...) {
01586 return false;
01587 }
01588 if (from < 0 || to < 0) {
01589 return false;
01590 }
01591 return true;
01592 }
01593
01594 typedef enum {
01595 eAccessionFormat_valid = 0,
01596 eAccessionFormat_no_start_letters,
01597 eAccessionFormat_wrong_number_of_digits,
01598 eAccessionFormat_null,
01599 eAccessionFormat_too_long,
01600 eAccessionFormat_missing_version,
01601 eAccessionFormat_bad_version } EAccessionFormatError;
01602
01603 static EAccessionFormatError s_ValidateAccessionString (string accession, bool require_version)
01604 {
01605 if (NStr::IsBlank (accession)) {
01606 return eAccessionFormat_null;
01607 } else if (accession.length() >= 16) {
01608 return eAccessionFormat_too_long;
01609 } else if (accession.length() < 3
01610 || ! isalpha (accession.c_str()[0])
01611 || ! isupper (accession.c_str()[0])
01612 || ! isalpha (accession.c_str()[1])
01613 || ! isupper (accession.c_str()[1])) {
01614 return eAccessionFormat_no_start_letters;
01615 }
01616
01617 string str = accession;
01618 if (NStr::StartsWith (str, "NZ_")) {
01619 str = str.substr(3);
01620 }
01621
01622 const char *cp = str.c_str();
01623 int numAlpha = 0;
01624
01625 while (isalpha (*cp)) {
01626 numAlpha++;
01627 cp++;
01628 }
01629
01630 int numUndersc = 0;
01631
01632 while (*cp == '_') {
01633 numUndersc++;
01634 cp++;
01635 }
01636
01637 int numDigits = 0;
01638 while (isdigit (*cp)) {
01639 numDigits++;
01640 cp++;
01641 }
01642
01643 if ((*cp != '\0' && *cp != ' ' && *cp != '.') || numUndersc > 1) {
01644 return eAccessionFormat_wrong_number_of_digits;
01645 }
01646
01647 EAccessionFormatError rval = eAccessionFormat_valid;
01648
01649 if (require_version) {
01650 if (*cp != '.') {
01651 rval = eAccessionFormat_missing_version;
01652 }
01653 cp++;
01654 int numVersion = 0;
01655 while (isdigit (*cp)) {
01656 numVersion++;
01657 cp++;
01658 }
01659 if (numVersion < 1) {
01660 rval = eAccessionFormat_missing_version;
01661 } else if (*cp != '\0' && *cp != ' ') {
01662 rval = eAccessionFormat_bad_version;
01663 }
01664 }
01665
01666
01667 if (numUndersc == 0) {
01668 if ((numAlpha == 1 && numDigits == 5)
01669 || (numAlpha == 2 && numDigits == 6)
01670 || (numAlpha == 3 && numDigits == 5)
01671 || (numAlpha == 4 && numDigits == 8)
01672 || (numAlpha == 5 && numDigits == 7)) {
01673 return rval;
01674 }
01675 } else if (numUndersc == 1) {
01676 if (numAlpha != 2 || (numDigits != 6 && numDigits != 8 && numDigits != 9)) {
01677 return eAccessionFormat_wrong_number_of_digits;
01678 }
01679 char first_letter = accession.c_str()[0];
01680 char second_letter = accession.c_str()[1];
01681 if (first_letter == 'N' || first_letter == 'X' || first_letter == 'Z') {
01682 if (second_letter == 'M' || second_letter == 'C'
01683 || second_letter == 'T' || second_letter == 'P'
01684 || second_letter == 'G' || second_letter == 'R'
01685 || second_letter == 'S' || second_letter == 'W'
01686 || second_letter == 'W' || second_letter == 'Z') {
01687 return rval;
01688 }
01689 }
01690 if ((first_letter == 'A' || first_letter == 'Y')
01691 && second_letter == 'P') {
01692 return rval;
01693 }
01694 }
01695
01696 return eAccessionFormat_wrong_number_of_digits;
01697 }
01698
01699
01700 static bool s_StringConsistsOf (string str, string consist)
01701 {
01702 const char *src = str.c_str();
01703 const char *find = consist.c_str();
01704 bool rval = true;
01705
01706 while (*src != 0 && rval) {
01707 if (strchr (find, *src) == NULL) {
01708 rval = false;
01709 }
01710 src++;
01711 }
01712 return rval;
01713 }
01714
01715
01716 void CValidError_feat::ValidateImpGbquals
01717 (const CImp_feat& imp,
01718 const CSeq_feat& feat,
01719 bool is_insd_in_sep)
01720 {
01721 CSeqFeatData::ESubtype ftype = feat.GetData().GetSubtype();
01722 const string& key = imp.GetKey();
01723
01724 FOR_EACH_GBQUAL_ON_FEATURE (qual, feat) {
01725 const string& qual_str = (*qual)->GetQual();
01726
01727 if ( qual_str == "gsdb_id" ) {
01728 continue;
01729 }
01730
01731 CGbqualType::EType gbqual = CGbqualType::GetType(qual_str);
01732
01733 if ( gbqual == CGbqualType::e_Bad ) {
01734 if ( !qual_str.empty() ) {
01735 PostErr(eDiag_Warning, eErr_SEQ_FEAT_UnknownImpFeatQual,
01736 "Unknown qualifier " + qual_str, feat);
01737 } else {
01738 PostErr(eDiag_Warning, eErr_SEQ_FEAT_UnknownImpFeatQual,
01739 "Empty qualifier", feat);
01740 }
01741 } else {
01742 if ( !CFeatQualAssoc::IsLegalGbqual(ftype, gbqual) ) {
01743 PostErr(eDiag_Warning, eErr_SEQ_FEAT_WrongQualOnImpFeat,
01744 "Wrong qualifier " + qual_str + " for feature " +
01745 key, feat);
01746 }
01747
01748 string val = (*qual)->GetVal();
01749
01750 bool error = false;
01751 switch ( gbqual ) {
01752 case CGbqualType::e_Rpt_type:
01753 {{
01754 for ( size_t i = 0;
01755 i < sizeof(s_LegalRepeatTypes) / sizeof(string);
01756 ++i ) {
01757 if ( val.find(s_LegalRepeatTypes[i]) != string::npos ) {
01758 bool left = false, right = false;
01759 if ( i > 0 ) {
01760 left = val[i-1] == ',' || val[i-1] == '(';
01761 }
01762 if ( i < val.length() - 1 ) {
01763 right = val[i+1] == ',' || val[i+1] == ')';
01764 }
01765 if ( left && right ) {
01766 error = true;
01767 }
01768 break;
01769 }
01770 }
01771 }}
01772 break;
01773
01774 case CGbqualType::e_Rpt_unit:
01775 case CGbqualType::e_Rpt_unit_seq:
01776 {{
01777 bool found = false, multiple_rpt_unit = true;
01778 ITERATE(string, it, val) {
01779 if ( *it <= ' ' ) {
01780 found = true;
01781 } else if ( *it == '(' || *it == ')' ||
01782 *it == ',' || *it == '.' ||
01783 isdigit((unsigned char)(*it)) ) {
01784 } else {
01785 multiple_rpt_unit = false;
01786 }
01787 }
01788
01789
01790
01791
01792
01793
01794 if ( NStr::CompareNocase(key, "repeat_region") == 0 &&
01795 !multiple_rpt_unit ) {
01796 if (val.length() <= GetLength(feat.GetLocation(), m_Scope) ) {
01797 bool just_nuc_letters = true;
01798 static const string nuc_letters = "ACGTNacgtn";
01799
01800 ITERATE(string, it, val) {
01801 if ( nuc_letters.find(*it) == NPOS ) {
01802 just_nuc_letters = false;
01803 break;
01804 }
01805 }
01806
01807 if ( just_nuc_letters ) {
01808 CSeqVector vec = GetSequenceFromFeature(feat, *m_Scope);
01809 if ( !vec.empty() ) {
01810 string vec_data;
01811 vec.GetSeqData(0, vec.size(), vec_data);
01812 if (NStr::FindNoCase (vec_data, val) == string::npos) {
01813 PostErr(eDiag_Warning, eErr_SEQ_FEAT_InvalidQualifierValue,
01814 "repeat_region /rpt_unit and underlying "
01815 "sequence do not match", feat);
01816 }
01817 }
01818
01819
01820 } else {
01821 PostErr(eDiag_Info, eErr_SEQ_FEAT_InvalidQualifierValue,
01822 "rpt_unit_seq qualifier contains invalid characters", feat);
01823 }
01824 } else {
01825 PostErr(eDiag_Info, eErr_SEQ_FEAT_InvalidQualifierValue,
01826 "Length of rpt_unit_seq is greater than feature length", feat);
01827 }
01828 }
01829 }}
01830 break;
01831 case CGbqualType::e_Rpt_unit_range:
01832 {{
01833 int from, to;
01834 if (!s_RptUnitIsBaseRange(val, from, to)) {
01835 PostErr (eDiag_Warning, eErr_SEQ_FEAT_InvalidQualifierValue,
01836 "/rpt_unit_range is not a base range", feat);
01837 } else {
01838 CSeq_loc::TRange range = feat.GetLocation().GetTotalRange();
01839 if (from < range.GetFrom() || from > range.GetTo() || to < range.GetFrom() || to > range.GetTo()) {
01840 PostErr (eDiag_Warning, eErr_SEQ_FEAT_InvalidQualifierValue,
01841 "/rpt_unit_range is not within sequence length", feat);
01842 }
01843 }
01844 }}
01845 break;
01846 case CGbqualType::e_Label:
01847 {{
01848 bool only_digits = true,
01849 has_spaces = false;
01850
01851 ITERATE(string, it, val) {
01852 if ( isspace((unsigned char)(*it)) ) {
01853 has_spaces = true;
01854 }
01855 if ( !isdigit((unsigned char)(*it)) ) {
01856 only_digits = false;
01857 }
01858 }
01859 if (only_digits || has_spaces) {
01860 PostErr (eDiag_Error, eErr_SEQ_FEAT_InvalidQualifierValue, "Illegal value for qualifier " + qual_str, feat);
01861 }
01862 }}
01863 break;
01864 case CGbqualType::e_Replace:
01865 {{
01866 CBioseq_Handle bh = m_Scope->GetBioseqHandle(feat.GetLocation());
01867 if (bh) {
01868 if (bh.IsNa()) {
01869 if (NStr::Equal(key, "variation")) {
01870 if (!s_StringConsistsOf (val, "acgt")) {
01871 PostErr (eDiag_Error, eErr_SEQ_FEAT_InvalidQualifierValue,
01872 val + " is not a legal value for qualifier " + qual_str
01873 + " - should only be composed of acgt unambiguous nucleotide bases",
01874 feat);
01875 }
01876 } else if (!s_StringConsistsOf (val, "acgtmrwsykvhdbn")) {
01877 PostErr (eDiag_Error, eErr_SEQ_FEAT_InvalidQualifierValue,
01878 val + " is not a legal value for qualifier " + qual_str
01879 + " - should only be composed of acgtmrwsykvhdbn nucleotide bases",
01880 feat);
01881 }
01882 } else if (bh.IsAa()) {
01883 if (!s_StringConsistsOf (val, "acdefghiklmnpqrstuvwy*")) {
01884 PostErr (eDiag_Error, eErr_SEQ_FEAT_InvalidQualifierValue,
01885 val + " is not a legal value for qualifier " + qual_str
01886 + " - should only be composed of acdefghiklmnpqrstuvwy* amino acids",
01887 feat);
01888 }
01889 }
01890
01891
01892 bool has_fuzz = false;
01893 for( objects::CSeq_loc_CI it(feat.GetLocation()); it && !has_fuzz; ++it) {
01894 if (it.IsPoint() && it.GetSeq_loc().GetPnt().IsSetFuzz()) {
01895 has_fuzz = true;
01896 }
01897 }
01898 if (!has_fuzz && val.length() == GetLength (feat.GetLocation(), m_Scope)) {
01899 string bases = "";
01900 bool match = false;
01901 try {
01902 CCdregion_translate::ReadSequenceByLocation (bases, bh, feat.GetLocation(), CCdregion_translate::eTruncate);
01903 if (NStr::EqualNocase(val, bases)) {
01904 PostErr (eDiag_Info, eErr_SEQ_FEAT_SuspiciousQualifierValue,
01905 "/replace already matches underlying sequence (" + val + ")",
01906 feat);
01907 }
01908 } catch ( ... ) {
01909 }
01910 }
01911 }
01912 }}
01913 break;
01914
01915 case CGbqualType::e_Cons_splice:
01916 {{
01917 error = true;
01918 for (size_t i = 0;
01919 i < sizeof(s_LegalConsSpliceStrings) / sizeof(string);
01920 ++i) {
01921 if ( NStr::CompareNocase(val, s_LegalConsSpliceStrings[i]) == 0 ) {
01922 error = false;
01923 break;
01924 }
01925 }
01926 }}
01927 break;
01928
01929 case CGbqualType::e_Mobile_element:
01930 {{
01931 }}
01932 break;
01933
01934 case CGbqualType::e_Compare:
01935 {{
01936 if (!NStr::StartsWith (val, "(")) {
01937 EAccessionFormatError valid_accession = s_ValidateAccessionString (val, true);
01938 if (valid_accession == eAccessionFormat_missing_version) {
01939 PostErr (eDiag_Error, eErr_SEQ_FEAT_InvalidQualifierValue,
01940 val + " accession missing version for qualifier " + qual_str, feat);
01941 } else if (valid_accession == eAccessionFormat_bad_version) {
01942 PostErr (eDiag_Error, eErr_SEQ_FEAT_InvalidQualifierValue,
01943 val + " accession has bad version for qualifier " + qual_str, feat);
01944 } else if (valid_accession != eAccessionFormat_valid) {
01945 PostErr (eDiag_Error, eErr_SEQ_FEAT_InvalidQualifierValue,
01946 val + " is not a legal accession for qualifier " + qual_str, feat);
01947 } else if (is_insd_in_sep && NStr::Find (val, "_") == string::npos) {
01948 PostErr (eDiag_Error, eErr_SEQ_FEAT_InvalidQualifierValue,
01949 "RefSeq accession " + val + " cannot be used for qualifier " + qual_str, feat);
01950 }
01951 }
01952 }}
01953 break;
01954
01955 default:
01956 break;
01957 }
01958 if ( error ) {
01959 PostErr(eDiag_Error, eErr_SEQ_FEAT_InvalidQualifierValue,
01960 val + " is not a legal value for qualifier " + qual_str, feat);
01961 }
01962 }
01963 }
01964 }
01965
01966
01967 void CValidError_feat::ValidatePeptideOnCodonBoundry
01968 (const CSeq_feat& feat,
01969 const string& key)
01970 {
01971 const CSeq_loc& loc = feat.GetLocation();
01972
01973
01974 if( m_Imp.AvoidPerfBottlenecks() ) {
01975 return;
01976 }
01977
01978
01979 CConstRef<CSeq_feat> cds = GetOverlappingCDS(loc, *m_Scope);
01980 if ( !cds ) {
01981 return;
01982 }
01983 const CCdregion& cdr = cds->GetData().GetCdregion();
01984
01985 TSeqPos pos1 = LocationOffset(cds->GetLocation(), loc, eOffset_FromStart);
01986 TSeqPos pos2 = pos1 + GetLength(loc, m_Scope);
01987 unsigned int frame = 0;
01988 switch (cdr.GetFrame()) {
01989 case CCdregion::eFrame_not_set:
01990 case CCdregion::eFrame_one:
01991 frame = 0;
01992 break;
01993 case CCdregion::eFrame_two:
01994 frame = 1;
01995 break;
01996 case CCdregion::eFrame_three:
01997 frame = 2;
01998 break;
01999 }
02000
02001 TSeqPos mod1 = (pos1 + 3 - frame) %3;
02002 TSeqPos mod2 = (pos2 + 3 - frame) %3;
02003
02004 if ( mod1 != 0 && loc.IsPartialStart(eExtreme_Biological)
02005 && cds->GetLocation().IsPartialStart(eExtreme_Biological)
02006 && pos1 == 0) {
02007 mod1 = 0;
02008 }
02009 if ( mod2 != 0 && loc.IsPartialStop(eExtreme_Biological)
02010 && cds->GetLocation().IsPartialStop(eExtreme_Biological)
02011 && pos2 == GetLength (cds->GetLocation(), m_Scope)) {
02012 mod2 = 0;
02013 }
02014
02015 if ( (mod1 != 0) && (mod2 != 0) ) {
02016 PostErr(eDiag_Warning, eErr_SEQ_FEAT_PeptideFeatOutOfFrame,
02017 "Start and stop of " + key + " are out of frame with CDS codons",
02018 feat);
02019 } else if (mod1 != 0) {
02020 PostErr(eDiag_Warning, eErr_SEQ_FEAT_PeptideFeatOutOfFrame,
02021 "Start of " + key + " is out of frame with CDS codons", feat);
02022 } else if (mod2 != 0) {
02023 PostErr(eDiag_Warning, eErr_SEQ_FEAT_PeptideFeatOutOfFrame,
02024 "Stop of " + key + " is out of frame with CDS codons", feat);
02025 }
02026 }
02027
02028
02029 static const string sc_BypassMrnaTransCheckText[] = {
02030 "RNA editing",
02031 "artificial frameshift",
02032 "mismatches in transcription"
02033 "reasons given in citation",
02034 "unclassified transcription discrepancy",
02035 };
02036 DEFINE_STATIC_ARRAY_MAP(CStaticArraySet<string>, sc_BypassMrnaTransCheck, sc_BypassMrnaTransCheckText);
02037
02038
02039 void CValidError_feat::ValidateMrnaTrans(const CSeq_feat& feat)
02040 {
02041 bool has_errors = false, unclassified_except = false,
02042 mismatch_except = false, report_errors = true;
02043 string farstr;
02044
02045 if (feat.CanGetPseudo() && feat.GetPseudo()) {
02046 return;
02047 }
02048 if (!feat.CanGetProduct()) {
02049 return;
02050 }
02051
02052 if (feat.CanGetExcept() && feat.GetExcept() &&
02053 feat.CanGetExcept_text()) {
02054 const string& except_text = feat.GetExcept_text();
02055 ITERATE (CStaticArraySet<string>, it, sc_BypassMrnaTransCheck) {
02056 if (NStr::FindNoCase(except_text, *it) != NPOS) {
02057 report_errors = false;
02058 }
02059 if (NStr::FindNoCase(except_text, "unclassified transcription discrepancy") != NPOS) {
02060 unclassified_except = true;
02061 }
02062 if (NStr::FindNoCase(except_text, "mismatches in transcription") != NPOS) {
02063 mismatch_except = true;
02064 report_errors = true;
02065 }
02066 }
02067 }
02068
02069 CConstRef<CSeq_id> product_id;
02070 try {
02071 product_id.Reset(&GetId(feat.GetProduct(), m_Scope));
02072 } catch (CException&) {
02073 }
02074 if (!product_id) {
02075 return;
02076 }
02077
02078 CBioseq_Handle nuc = m_Scope->GetBioseqHandle(feat.GetLocation());
02079 if (!nuc) {
02080 has_errors = true;
02081 if (report_errors || unclassified_except) {
02082 PostErr(eDiag_Error, eErr_SEQ_FEAT_MrnaTransFail,
02083 "Unable to transcribe mRNA", feat);
02084 };
02085 }
02086 if (nuc) {
02087 EDiagSev sev = eDiag_Error;
02088 CBioseq_Handle rna = m_Scope->GetBioseqHandleFromTSE(*product_id, nuc);
02089 if (!rna) {
02090
02091 sev = m_Imp.IsRefSeq() ? eDiag_Error : eDiag_Warning;
02092 if (m_Imp.IsFarFetchMRNAproducts()) {
02093 rna = m_Scope->GetBioseqHandle(feat.GetProduct());
02094 }
02095 if (!rna) {
02096 return;
02097 }
02098 farstr = "(far)";
02099 }
02100 _ASSERT(nuc && rna);
02101
02102 CSeqVector nuc_vec(feat.GetLocation(), *m_Scope,
02103 CBioseq_Handle::eCoding_Iupac);
02104 CSeqVector rna_vec(feat.GetProduct(), *m_Scope,
02105 CBioseq_Handle::eCoding_Iupac);
02106
02107 size_t nuc_len = nuc_vec.size();
02108 size_t rna_len = rna_vec.size();
02109
02110 if (nuc_len != rna_len) {
02111 has_errors = true;
02112 if (nuc_len < rna_len) {
02113 size_t count_a = 0, count_no_a = 0;
02114
02115 for (CSeqVector_CI iter(rna_vec, nuc_len); iter; ++iter) {
02116 if ((*iter == 'A') || (*iter == 'a')) {
02117 ++count_a;
02118 } else {
02119 ++count_no_a;
02120 }
02121 }
02122 if (count_a < (19 * count_no_a)) {
02123 if (report_errors) {
02124 PostErr(sev, eErr_SEQ_FEAT_TranscriptLen,
02125 "Transcript length [" + NStr::IntToString(nuc_len) +
02126 "] less than " + farstr + "product length [" +
02127 NStr::IntToString(rna_len) + "], and tail < 95% polyA",
02128 feat);
02129 }
02130 } else {
02131 if (report_errors) {
02132 PostErr(eDiag_Info, eErr_SEQ_FEAT_TranscriptLen,
02133 "Transcript length [" + NStr::IntToString(nuc_len) +
02134 "] less than " + farstr + "product length [" +
02135 NStr::IntToString(rna_len) + "], but tail >= 95% polyA",
02136 feat);
02137 }
02138 }
02139 } else {
02140 if (report_errors) {
02141 PostErr(sev, eErr_SEQ_FEAT_TranscriptLen,
02142 "Transcript length [" + NStr::IntToString(nuc_vec.size()) + "] " +
02143 "greater than " + farstr +"product length [" +
02144 NStr::IntToString(rna_vec.size()) + "]", feat);
02145 }
02146 }
02147
02148 rna_len = nuc_len = min(nuc_len, rna_len);
02149 }
02150 _ASSERT(nuc_len == rna_len);
02151
02152 if (nuc_len > 0) {
02153 CSeqVector_CI nuc_ci(nuc_vec);
02154 CSeqVector_CI rna_ci(rna_vec);
02155 size_t mismatches = 0;
02156
02157
02158 while ((nuc_ci && rna_ci) && (nuc_ci.GetPos() < nuc_len)) {
02159 if (*nuc_ci != *rna_ci) {
02160 ++mismatches;
02161 }
02162 ++nuc_ci;
02163 ++rna_ci;
02164 }
02165 if (mismatches > 0) {
02166 has_errors = true;
02167 if (report_errors && !mismatch_except) {
02168 PostErr(eDiag_Error, eErr_SEQ_FEAT_TranscriptMismatches,
02169 "There are " + NStr::IntToString(mismatches) +
02170 " mismatches out of " + NStr::IntToString(nuc_len) +
02171 " bases between the transcript and " + farstr + "product sequence",
02172 feat);
02173 }
02174 }
02175 }
02176 }
02177
02178 if (!report_errors && !has_errors) {
02179 PostErr(eDiag_Warning, eErr_SEQ_FEAT_UnnecessaryException,
02180 "mRNA has exception but passes transcription test", feat);
02181 }
02182 }
02183
02184
02185 void CValidError_feat::ValidateCommonMRNAProduct(const CSeq_feat& feat)
02186 {
02187 if ( (feat.CanGetPseudo() && feat.GetPseudo()) ||
02188 IsOverlappingGenePseudo(feat) ) {
02189 return;
02190 }
02191
02192 if ( !feat.CanGetProduct() ) {
02193 return;
02194 }
02195
02196 CBioseq_Handle bsh = m_Scope->GetBioseqHandle(feat.GetProduct());
02197 if ( !bsh ) {
02198 const CSeq_id& sid = GetId(feat.GetProduct(), m_Scope);
02199 if ( sid.IsLocal() ) {
02200 if ( m_Imp.IsGPS() || m_Imp.IsRefSeq() ) {
02201 PostErr(eDiag_Error, eErr_SEQ_FEAT_MissingMRNAproduct,
02202 "Product Bioseq of mRNA feature is not "
02203 "packaged in the record", feat);
02204 }
02205 }
02206 } else {
02207
02208
02209 if( m_Imp.AvoidPerfBottlenecks() ) {
02210 return;
02211 }
02212
02213
02214 CFeat_CI mrna(
02215 bsh, SAnnotSelector(CSeqFeatData::e_Rna)
02216 .SetOverlapTotalRange()
02217 .SetResolveNone()
02218 .SetByProduct());
02219 while ( mrna ) {
02220 if ( &(mrna->GetOriginalFeature()) != &feat ) {
02221 PostErr(eDiag_Critical, eErr_SEQ_FEAT_MultipleMRNAproducts,
02222 "Same product Bioseq from multiple mRNA features", feat);
02223 break;
02224 }
02225 ++mrna;
02226 }
02227 }
02228 }
02229
02230
02231 void CValidError_feat::ValidateBothStrands(const CSeq_feat& feat)
02232 {
02233 const CSeq_loc& location = feat.GetLocation ();
02234 bool bothstrands = false, bothreverse = false;
02235
02236 for ( CSeq_loc_CI citer (location); citer; ++citer ) {
02237 ENa_strand strand = citer.GetStrand();
02238 if ( strand == eNa_strand_both ) {
02239 bothstrands = true;
02240 } else if (strand == eNa_strand_both_rev) {
02241 bothreverse = true;
02242 }
02243 }
02244 if (bothstrands || bothreverse) {
02245 EDiagSev sev = eDiag_Warning;
02246 string prefix = "Feature";
02247 if (feat.IsSetData()) {
02248 if (feat.GetData().IsCdregion()) {
02249 prefix = "CDS";
02250 sev = eDiag_Error;
02251 } else if (feat.GetData().IsRna() && feat.GetData().GetRna().IsSetType()
02252 && feat.GetData().GetRna().GetType() == CRNA_ref::eType_mRNA) {
02253 prefix = "mRNA";
02254 sev = eDiag_Error;
02255 }
02256 }
02257 string suffix = "";
02258 if (bothstrands && bothreverse) {
02259 suffix = "(forward and reverse)";
02260 } else if (bothstrands) {
02261 suffix = "(forward)";
02262 } else if (bothreverse) {
02263 suffix = "(reverse)";
02264 }
02265
02266 PostErr (sev, eErr_SEQ_FEAT_BothStrands,
02267 prefix + " may not be on both " + suffix + " strands", feat);
02268 }
02269 }
02270
02271
02272
02273 void CValidError_feat::ValidateCommonCDSProduct
02274 (const CSeq_feat& feat)
02275 {
02276 if ( (feat.CanGetPseudo() && feat.GetPseudo()) ||
02277 IsOverlappingGenePseudo(feat) ) {
02278 return;
02279 }
02280
02281 if ( !feat.CanGetProduct() ) {
02282 return;
02283 }
02284
02285 const CCdregion& cdr = feat.GetData().GetCdregion();
02286 if ( cdr.CanGetOrf() ) {
02287 return;
02288 }
02289
02290 CBioseq_Handle prod = m_Scope->GetBioseqHandle(feat.GetProduct());
02291 if ( !prod ) {
02292 const CSeq_id* sid = 0;
02293 try {
02294 sid = &(GetId(feat.GetProduct(), m_Scope));
02295 } catch (const CObjmgrUtilException&) {}
02296
02297
02298 if ( sid == 0 || !sid->IsOther() ) {
02299 if ( m_Imp.IsGPS() ) {
02300 return;
02301 }
02302 }
02303
02304 if ( m_Imp.GetTSE().IsSeq() ) {
02305 return;
02306 }
02307
02308
02309 if ( m_Imp.IsStandaloneAnnot() ) {
02310 return;
02311 }
02312
02313 PostErr(eDiag_Warning, eErr_SEQ_FEAT_MultipleCDSproducts,
02314 "Unable to find product Bioseq from CDS feature", feat);
02315 return;
02316 }
02317 CBioseq_Handle nuc = m_Scope->GetBioseqHandle(feat.GetLocation());
02318 if ( nuc ) {
02319 CSeq_entry_Handle prod_nps =
02320 prod.GetExactComplexityLevel(CBioseq_set::eClass_nuc_prot);
02321 CSeq_entry_Handle nuc_nps =
02322 nuc.GetExactComplexityLevel(CBioseq_set::eClass_nuc_prot);
02323
02324 if ( (prod_nps != nuc_nps) && (!m_Imp.IsNT()) && (!m_Imp.IsGPS()) ) {
02325 PostErr(eDiag_Error, eErr_SEQ_FEAT_CDSproductPackagingProblem,
02326 "Protein product not packaged in nuc-prot set with nucleotide",
02327 feat);
02328 }
02329 }
02330 const CSeq_feat* sfp = GetCDSForProduct(prod);
02331 if ( sfp == 0 ) {
02332 return;
02333 }
02334
02335 if ( &feat != sfp ) {
02336
02337
02338 if ( m_Imp.IsGPS() ) {
02339
02340 CBioseq_Handle sfh = m_Scope->GetBioseqHandle(sfp->GetLocation());
02341 if ( nuc != sfh ) {
02342 return;
02343 }
02344 }
02345 PostErr(eDiag_Critical, eErr_SEQ_FEAT_MultipleCDSproducts,
02346 "Same product Bioseq from multiple CDS features", feat);
02347 }
02348 }
02349
02350
02351 void CValidError_feat::ValidateCDSPartial(const CSeq_feat& feat)
02352 {
02353 if ( !feat.CanGetProduct() || !feat.CanGetLocation() ) {
02354 return;
02355 }
02356
02357 CBioseq_Handle bsh = m_Scope->GetBioseqHandle(feat.GetProduct());
02358 if ( !bsh ) {
02359 return;
02360 }
02361
02362 CSeqdesc_CI sd(bsh, CSeqdesc::e_Molinfo);
02363 if ( !sd ) {
02364 return;
02365 }
02366 const CMolInfo& molinfo = sd->GetMolinfo();
02367
02368 const CSeq_loc& loc = feat.GetLocation();
02369 bool partial5 = loc.IsPartialStart(eExtreme_Biological);
02370 bool partial3 = loc.IsPartialStop(eExtreme_Biological);
02371
02372 if ( molinfo.CanGetCompleteness() ) {
02373 switch ( molinfo.GetCompleteness() ) {
02374 case CMolInfo::eCompleteness_unknown:
02375 break;
02376
02377 case CMolInfo::eCompleteness_complete:
02378 if ( partial5 || partial3 ) {
02379 PostErr(eDiag_Error, eErr_SEQ_FEAT_PartialProblem,
02380 "CDS is partial but protein is complete", feat);
02381 }
02382 break;
02383
02384 case CMolInfo::eCompleteness_partial:
02385 break;
02386
02387 case CMolInfo::eCompleteness_no_left:
02388 if ( !partial5 ) {
02389 PostErr(eDiag_Error, eErr_SEQ_FEAT_PartialProblem,
02390 "CDS is 5' complete but protein is NH2 partial", feat);
02391 }
02392 if ( partial3 ) {
02393 PostErr(eDiag_Error, eErr_SEQ_FEAT_PartialProblem,
02394 "CDS is 3' partial but protein is NH2 partial", feat);
02395 }
02396 break;
02397
02398 case CMolInfo::eCompleteness_no_right:
02399 if (! partial3) {
02400 PostErr(eDiag_Error, eErr_SEQ_FEAT_PartialProblem,
02401 "CDS is 3' complete but protein is CO2 partial", feat);
02402 }
02403 if (partial5) {
02404 PostErr(eDiag_Error, eErr_SEQ_FEAT_PartialProblem,
02405 "CDS is 5' partial but protein is CO2 partial", feat);
02406 }
02407 break;
02408
02409 case CMolInfo::eCompleteness_no_ends:
02410 if ( partial5 && partial3 ) {
02411 } else if ( partial5 ) {
02412 PostErr(eDiag_Error, eErr_SEQ_FEAT_PartialProblem,
02413 "CDS is 5' partial but protein has neither end", feat);
02414 } else if ( partial3 ) {
02415 PostErr(eDiag_Error, eErr_SEQ_FEAT_PartialProblem,
02416 "CDS is 3' partial but protein has neither end", feat);
02417 } else {
02418 PostErr(eDiag_Error, eErr_SEQ_FEAT_PartialProblem,
02419 "CDS is complete but protein has neither end", feat);
02420 }
02421 break;
02422
02423 case CMolInfo::eCompleteness_has_left:
02424 break;
02425
02426 case CMolInfo::eCompleteness_has_right:
02427 break;
02428
02429 case CMolInfo::eCompleteness_other:
02430 break;
02431
02432 default:
02433 break;
02434 }
02435 }
02436 }
02437
02438
02439 void CValidError_feat::ValidateBadMRNAOverlap(const CSeq_feat& feat)
02440 {
02441 const CSeq_loc& loc = feat.GetLocation();
02442
02443
02444 if( m_Imp.AvoidPerfBottlenecks() ) {
02445 return;
02446 }
02447
02448
02449 CConstRef<CSeq_feat> mrna = GetBestOverlappingFeat(
02450 loc,
02451 CSeqFeatData::eSubtype_mRNA,
02452 eOverlap_Simple,
02453 *m_Scope);
02454 if ( !mrna ) {
02455 return;
02456 }
02457
02458 mrna = GetBestOverlappingFeat(
02459 loc,
02460 CSeqFeatData::eSubtype_mRNA,
02461 eOverlap_CheckIntRev,
02462 *m_Scope);
02463 if ( mrna ) {
02464 return;
02465 }
02466
02467 mrna = GetBestOverlappingFeat(
02468 loc,
02469 CSeqFeatData::eSubtype_mRNA,
02470 eOverlap_Interval,
02471 *m_Scope);
02472 if ( !mrna ) {
02473 return;
02474 }
02475
02476 mrna = GetBestOverlappingFeat(
02477 loc,
02478 CSeqFeatData::eSubtype_mRNA,
02479 eOverlap_Subset,
02480 *m_Scope);
02481
02482 EDiagSev sev = eDiag_Error;
02483 if ( m_Imp.IsNC() || m_Imp.IsNT() ||
02484 (feat.CanGetExcept() && feat.GetExcept()) ) {
02485 sev = eDiag_Warning;
02486 }
02487 if ( mrna ) {
02488
02489 bool supress = false;
02490
02491 if ( feat.CanGetExcept_text() ) {
02492 const CSeq_feat::TExcept_text& text = feat.GetExcept_text();
02493 if ( NStr::FindNoCase(text, "ribosomal slippage") != NPOS ) {
02494 supress = true;
02495 }
02496 }
02497 if ( !supress ) {
02498 PostErr(sev, eErr_SEQ_FEAT_CDSmRNArange,
02499 "mRNA contains CDS but internal intron-exon boundaries "
02500 "do not match", feat);
02501 }
02502 } else {
02503 PostErr(sev, eErr_SEQ_FEAT_CDSmRNArange,
02504 "mRNA overlaps or contains CDS but does not completely "
02505 "contain intervals", feat);
02506 }
02507 }
02508
02509
02510 void CValidError_feat::ValidateBadGeneOverlap(const CSeq_feat& feat)
02511 {
02512 const CGene_ref* grp = feat.GetGeneXref();
02513 if ( grp != 0 ) {
02514 return;
02515 }
02516
02517
02518 if( m_Imp.AvoidPerfBottlenecks() ) {
02519 return;
02520 }
02521
02522
02523
02524 if (GetOverlappingGene(feat.GetLocation(), *m_Scope)) {
02525 return;
02526 }
02527
02528
02529 if (!GetBestOverlappingFeat(feat.GetLocation(),
02530 CSeqFeatData::e_Gene,
02531 eOverlap_Simple,
02532 *m_Scope)) {
02533 return;
02534 }
02535
02536
02537
02538 EDiagSev sev = (m_Imp.IsNC() || m_Imp.IsNT()) ? eDiag_Warning : eDiag_Error;
02539
02540
02541 if (feat.GetData().IsCdregion()) {
02542 PostErr(sev, eErr_SEQ_FEAT_CDSgeneRange,
02543 "gene overlaps CDS but does not completely contain it", feat);
02544 } else if (feat.GetData().IsRna()) {
02545 if (GetOverlappingOperon(feat.GetLocation(), *m_Scope)) {
02546 return;
02547 }
02548 PostErr(sev, eErr_SEQ_FEAT_mRNAgeneRange,
02549 "gene overlaps mRNA but does not completely contain it", feat);
02550 }
02551 }
02552
02553
02554 static const string s_LegalExceptionStrings[] = {
02555 "RNA editing",
02556 "reasons given in citation",
02557 "rearrangement required for product",
02558 "ribosomal slippage",
02559 "trans-splicing",
02560 "alternative processing",
02561 "artificial frameshift",
02562 "nonconsensus splice site",
02563 "modified codon recognition",
02564 "alternative start codon",
02565 "dicistronic gene",
02566 "transcribed product replaced",
02567 "translated product replaced",
02568 "transcribed pseudogene",
02569 };
02570
02571
02572 static const string s_RefseqExceptionStrings [] = {
02573 "unclassified transcription discrepancy",
02574 "unclassified translation discrepancy",
02575 "mismatches in transcription",
02576 "mismatches in translation",
02577 "adjusted for low-quality genome",
02578 };
02579
02580
02581 void CValidError_feat::ValidateExcept(const CSeq_feat& feat)
02582 {
02583 if (feat.IsSetExcept_text() && !NStr::IsBlank (feat.GetExcept_text()) && !feat.IsSetExcept()) {
02584 PostErr(eDiag_Warning, eErr_SEQ_FEAT_ExceptInconsistent,
02585 "Exception text is set, but exception flag is not set", feat);
02586 } else if (feat.IsSetExcept() && (!feat.IsSetExcept_text() || NStr::IsBlank (feat.GetExcept_text()))) {
02587 PostErr(eDiag_Warning, eErr_SEQ_FEAT_ExceptInconsistent,
02588 "Exception flag is set, but exception text is empty", feat);
02589 }
02590 if ( feat.CanGetExcept_text() && !feat.GetExcept_text ().empty() ) {
02591 ValidateExceptText(feat.GetExcept_text(), feat);
02592 }
02593 }
02594
02595
02596 void CValidError_feat::ValidateExceptText(const string& text, const CSeq_feat& feat)
02597 {
02598 if ( text.empty() ) return;
02599
02600 EDiagSev sev = eDiag_Error;
02601 bool found = false;
02602
02603 string str;
02604 string::size_type begin = 0, end, textlen = text.length();
02605
02606 const string* except_begin = s_LegalExceptionStrings;
02607 const string* except_end =
02608 &(s_LegalExceptionStrings[sizeof(s_LegalExceptionStrings) / sizeof(string)]);
02609 const string* refseq_begin = s_RefseqExceptionStrings;
02610 const string* refseq_end =
02611 &(s_RefseqExceptionStrings[sizeof(s_RefseqExceptionStrings) / sizeof(string)]);
02612
02613 while ( begin < textlen ) {
02614 found = false;
02615 end = min( text.find_first_of(',', begin), textlen );
02616 str = NStr::TruncateSpaces( text.substr(begin, end) );
02617 if ( find(except_begin, except_end, str) != except_end ) {
02618 found = true;
02619 }
02620 if ( !found && (m_Imp.IsGPS() || m_Imp.IsRefSeq()) ) {
02621 if ( find(refseq_begin, refseq_end, str) != refseq_end ) {
02622 found = true;
02623 }
02624 }
02625 if ( !found ) {
02626 if ( m_Imp.IsNC() || m_Imp.IsNT() ) {
02627 sev = eDiag_Warning;
02628 }
02629 PostErr(sev, eErr_SEQ_FEAT_InvalidQualifierValue,
02630 str + " is not legal exception explanation", feat);
02631 }
02632 begin = end + 1;
02633 }
02634 }
02635
02636
02637
02638
02639 void CValidError_feat::ReportCdTransErrors
02640 (const CSeq_feat& feat,
02641 bool show_stop,
02642 bool got_stop,
02643 bool no_end,
02644 int ragged,
02645 bool report_errors,
02646 bool& has_errors)
02647 {
02648 if (show_stop) {
02649 if (!got_stop && !no_end) {
02650 has_errors = true;
02651 if (report_errors) {
02652 PostErr(eDiag_Error, eErr_SEQ_FEAT_NoStop,
02653 "Missing stop codon", feat);
02654 }
02655 } else if (got_stop && no_end) {
02656 has_errors = true;
02657 if (report_errors) {
02658 PostErr (eDiag_Error, eErr_SEQ_FEAT_PartialProblem,
02659 "Got stop codon, but 3'end is labeled partial", feat);
02660 }
02661 } else if (got_stop && !no_end && ragged) {
02662 has_errors = true;
02663 if (report_errors) {
02664 PostErr (eDiag_Error, eErr_SEQ_FEAT_TransLen,
02665 "Coding region extends " + NStr::IntToString(ragged) +
02666 " base(s) past stop codon", feat);
02667 }
02668 }
02669 }
02670 }
02671
02672
02673 static const string sc_BypassCdsTransCheckText[] = {
02674 "RNA editing",
02675 "artificial frameshift",
02676 "mismatches in translation",
02677 "rearrangement required for product",
02678 "reasons given in citation",
02679 "unclassified translation discrepancy"
02680 };
02681 DEFINE_STATIC_ARRAY_MAP(CStaticArraySet<string>, sc_BypassCdsTransCheck, sc_BypassCdsTransCheckText);
02682
02683
02684 static void s_LocIdType(const CSeq_loc& loc, CScope& scope, const CSeq_entry& tse,
02685 bool& is_nt, bool& is_ng, bool& is_nw, bool& is_nc)
02686 {
02687 is_nt = is_ng = is_nw = is_nc = false;
02688 if (!IsOneBioseq(loc, &scope)) {
02689 return;
02690 }
02691 const CSeq_id& id = GetId(loc, &scope);
02692 CBioseq_Handle bsh = scope.GetBioseqHandleFromTSE(id, tse);
02693 if (bsh) {
02694 FOR_EACH_SEQID_ON_BIOSEQ (it, *(bsh.GetBioseqCore())) {
02695 CSeq_id::EAccessionInfo info = (*it)->IdentifyAccession();
02696 is_nt |= (info == CSeq_id::eAcc_refseq_contig);
02697 is_ng |= (info == CSeq_id::eAcc_refseq_genomic);
02698 is_nw |= (info == CSeq_id::eAcc_refseq_wgs_intermed);
02699 is_nc |= (info == CSeq_id::eAcc_refseq_chromosome);
02700 }
02701 }
02702 }
02703
02704
02705 void CValidError_feat::ValidateCdTrans(const CSeq_feat& feat)
02706 {
02707
02708 if (!feat.GetData().IsCdregion()) {
02709 return;
02710 }
02711
02712 bool prot_ok = true;
02713 int ragged = 0;
02714 bool has_errors = false, unclassified_except = false,
02715 mismatch_except = false, report_errors = true;
02716 string farstr;
02717
02718 if (feat.CanGetExcept() && feat.GetExcept() &&
02719 feat.CanGetExcept_text()) {
02720 const string& except_text = feat.GetExcept_text();
02721 ITERATE (CStaticArraySet<string>, it, sc_BypassCdsTransCheck) {
02722 if (NStr::FindNoCase(except_text, *it) != NPOS) {
02723 report_errors = false;
02724 }
02725 if (NStr::FindNoCase(except_text, "unclassified transcription discrepancy") != NPOS) {
02726 unclassified_except = true;
02727 }
02728 if (NStr::FindNoCase(except_text, "mismatches in transcription") != NPOS) {
02729 mismatch_except = true;
02730 report_errors = true;
02731 }
02732 }
02733 }
02734
02735
02736 FOR_EACH_GBQUAL_ON_FEATURE (it, feat) {
02737 if ((*it)->IsSetQual() && NStr::EqualNocase((*it)->GetQual(), "pseudo")) {
02738 return;
02739 }
02740 }
02741
02742 const CCdregion& cdregion = feat.GetData().GetCdregion();
02743
02744
02745 bool transl_except = false;
02746 if (!cdregion.IsSetCode_break()) {
02747 FOR_EACH_GBQUAL_ON_FEATURE (it, feat) {
02748 if ((*it)->IsSetQual() && NStr::Equal((*it)->GetQual(), "transl_except")) {
02749 transl_except = true;
02750 }
02751 }
02752 }
02753
02754 const CSeq_loc& location = feat.GetLocation();
02755
02756 int gc = 0;
02757 if ( cdregion.CanGetCode() ) {
02758
02759 gc = cdregion.GetCode().GetId();
02760 }
02761 string gccode = NStr::IntToString(gc);
02762
02763 string transl_prot;
02764 bool alt_start = false;
02765 try {
02766 CCdregion_translate::TranslateCdregion(
02767 transl_prot,
02768 feat,
02769 *m_Scope,
02770 true,
02771 false,
02772 &alt_start,
02773 CCdregion_translate::eTruncate);
02774 } catch (CException&) {
02775 }
02776 if (transl_prot.empty()) {
02777 if (report_errors) {
02778 PostErr (eDiag_Error, eErr_SEQ_FEAT_CdTransFail,
02779 "Unable to translate", feat);
02780 }
02781 }
02782
02783
02784 if (alt_start && gc == 1) {
02785 EDiagSev sev = eDiag_Warning;
02786 if (s_IsLocRefSeqMrna(feat.GetLocation(), *m_Scope)) {
02787 sev = eDiag_Error;
02788 } else if (s_IsLocGEDL(feat.GetLocation(), *m_Scope)) {
02789 sev = eDiag_Info;
02790 }
02791 if (sev != eDiag_Info &&
02792 feat.CanGetExcept() && feat.GetExcept() &&
02793 feat.CanGetExcept_text() && !feat.GetExcept_text().empty()) {
02794 if (feat.GetExcept_text().find("alternative start codon") != NPOS) {
02795 sev = eDiag_Info;
02796 }
02797 }
02798 if (sev != eDiag_Info) {
02799 has_errors = true;
02800 if (report_errors) {
02801 PostErr(sev, eErr_SEQ_FEAT_AltStartCodon,
02802 "Alternative start codon used", feat);
02803 }
02804 }
02805 }
02806
02807 bool no_end = false;
02808 unsigned int part_loc = SeqLocPartialCheck(location, m_Scope);
02809 unsigned int part_prod = eSeqlocPartial_Complete;
02810 if (feat.IsSetProduct()) {
02811 part_prod = SeqLocPartialCheck(feat.GetProduct(), m_Scope);
02812 if ( (part_loc & eSeqlocPartial_Stop) ||
02813 (part_prod & eSeqlocPartial_Stop) ) {
02814 no_end = true;
02815 } else {
02816
02817 ragged = CheckForRaggedEnd(location, cdregion);
02818 }
02819 }
02820
02821
02822
02823 has_errors = x_ValidateCodeBreakNotOnCodon(feat, location, cdregion, report_errors);
02824
02825 if (cdregion.GetFrame() > CCdregion::eFrame_one) {
02826 has_errors = true;
02827 EDiagSev sev = s_IsLocRefSeqMrna(feat.GetLocation(), *m_Scope) ?
02828 eDiag_Error : eDiag_Warning;
02829 if (!(part_loc & eSeqlocPartial_Start)) {
02830 if (report_errors) {
02831 PostErr(sev, eErr_SEQ_FEAT_PartialProblem,
02832 "Suspicious CDS location - frame > 1 but not 5' partial",
02833 feat);
02834 }
02835 } else if ((part_loc & eSeqlocPartial_Nostart) &&
02836 !IsPartialAtSpliceSite(location, eSeqlocPartial_Nostart)) {
02837 if (report_errors) {
02838 PostErr(sev, eErr_SEQ_FEAT_PartialProblem,
02839 "Suspicious CDS location - frame > 1 and not at consensus splice site",
02840 feat);
02841 }
02842 }
02843 }
02844
02845 bool no_beg =
02846 (part_loc & eSeqlocPartial_Start) || (part_prod & eSeqlocPartial_Start);
02847
02848 bool reported_bad_start_codon = false;
02849
02850
02851 size_t internal_stop_count = 0;
02852 bool got_stop = false;
02853 if (!transl_prot.empty()) {
02854 bool got_dash = (transl_prot[0] == '-');
02855 got_stop = (transl_prot[transl_prot.length() - 1] == '*');
02856
02857 ITERATE(string, it, transl_prot) {
02858 if ( *it == '*' ) {
02859 ++internal_stop_count;
02860 }
02861 }
02862 if (got_stop) {
02863 --internal_stop_count;
02864 }
02865
02866 if (internal_stop_count > 0) {
02867 has_errors = true;
02868 if (got_dash) {
02869 if (report_errors || unclassified_except) {
02870 PostErr(eDiag_Error, eErr_SEQ_FEAT_StartCodon,
02871 "Illegal start codon and " +
02872 NStr::IntToString(internal_stop_count) +
02873 " internal stops. Probably wrong genetic code [" +
02874 gccode + "]", feat);
02875 }
02876 }
02877 if (report_errors || unclassified_except) {
02878 PostErr(eDiag_Error, eErr_SEQ_FEAT_InternalStop,
02879 NStr::IntToString(internal_stop_count) +
02880 " internal stops. Genetic code [" + gccode + "]", feat);
02881 }
02882 prot_ok = false;
02883 if (internal_stop_count > 5) {
02884 return;
02885 }
02886 } else if (got_dash) {
02887 has_errors = true;
02888 if (report_errors) {
02889 PostErr(eDiag_Error, eErr_SEQ_FEAT_StartCodon,
02890 "Illegal start codon used. Wrong genetic code [" +
02891 gccode + "] or protein should be partial", feat);
02892 reported_bad_start_codon = true;
02893 }
02894 }
02895 }
02896
02897 bool show_stop = true;
02898
02899 const CSeq_id* protid = 0;
02900 CBioseq_Handle prot_handle;
02901 if (feat.IsSetProduct()) {
02902 try {
02903 protid = &GetId(feat.GetProduct(), m_Scope);
02904 } catch (CException&) {}
02905 if (protid != NULL) {
02906 prot_handle = m_Scope->GetBioseqHandleFromTSE(*protid, m_Imp.GetTSE());
02907 if (!prot_handle && m_Imp.IsFarFetchCDSproducts()) {
02908 prot_handle = m_Scope->GetBioseqHandle(*protid);
02909 }
02910 }
02911 }
02912
02913 if (!prot_handle) {
02914 if (!m_Imp.IsStandaloneAnnot()) {
02915 if (transl_prot.length() > 6) {
02916 bool is_nt, is_ng, is_nw, is_nc;
02917 s_LocIdType(location, *m_Scope, m_Imp.GetTSE(), is_nt, is_ng, is_nw, is_nc);
02918 if (!(is_nt || is_ng || is_nw)) {
02919 EDiagSev sev = eDiag_Error;
02920 if (IsDeltaOrFarSeg(location, m_Scope)) {
02921 sev = eDiag_Warning;
02922 }
02923 if (is_nc) {
02924 sev = eDiag_Warning;
02925 if ( m_Imp.GetTSE().IsSeq()) {
02926 sev = eDiag_Info;
02927 }
02928 }
02929 if (sev != eDiag_Info) {
02930 has_errors = true;
02931 if (report_errors) {
02932 PostErr(sev, eErr_SEQ_FEAT_NoProtein,
02933 "No protein Bioseq given", feat);
02934 }
02935 }
02936 }
02937 }
02938 ReportCdTransErrors(feat, show_stop, got_stop, no_end, ragged,
02939 report_errors, has_errors);
02940 }
02941 }
02942
02943
02944 if (feat.IsSetProduct()) {
02945 CSeqVector prot_vec = prot_handle.GetSeqVector();
02946 prot_vec.SetCoding(CSeq_data::e_Ncbieaa);
02947 size_t prot_len = prot_vec.size();
02948 size_t len = transl_prot.length();
02949
02950 if (got_stop && (len == prot_len + 1)) {
02951 --len;
02952 }
02953
02954
02955 while (prot_len > 0) {
02956 if (prot_vec[prot_len - 1] == 'X') {
02957 --prot_len;
02958 } else {
02959 break;
02960 }
02961 }
02962
02963 while (len > 0) {
02964 if (transl_prot[len - 1] == 'X') {
02965 --len;
02966 } else {
02967 break;
02968 }
02969 }
02970
02971 vector<TSeqPos> mismatches;
02972 if (len == prot_len) {
02973 for (TSeqPos i = 0; i < len; ++i) {
02974 if (transl_prot[i] != prot_vec[i]) {
02975 has_errors = true;
02976 mismatches.push_back(i);
02977 prot_ok = false;
02978 }
02979 }
02980 } else {
02981 has_errors = true;
02982 if (report_errors) {
02983 PostErr(eDiag_Error, eErr_SEQ_FEAT_TransLen,
02984 "Given protein length [" + NStr::IntToString(prot_len) +
02985 "] does not match " + farstr + "translation length [" +
02986 NStr::IntToString(len) + "]", feat);
02987 }
02988 }
02989
02990
02991 string msg;
02992 if (!mismatches.empty() && mismatches.front() == 0) {
02993 if (feat.IsSetPartial() && feat.GetPartial() && (!no_beg) && (!no_end)) {
02994 if (report_errors) {
02995 PostErr(eDiag_Error, eErr_SEQ_FEAT_PartialProblem,
02996 "Start of location should probably be partial",
02997 feat);
02998 }
02999 } else if (transl_prot[mismatches.front()] == '-') {
03000 if (report_errors && !reported_bad_start_codon) {
03001 PostErr(eDiag_Error, eErr_SEQ_FEAT_StartCodon,
03002 "Illegal start codon used. Wrong genetic code [" +
03003 gccode + "] or protein should be partial", feat);
03004 }
03005 }
03006 }
03007
03008 char prot_res, transl_res;
03009 string nuclocstr;
03010 if (mismatches.size() > 10) {
03011
03012
03013 nuclocstr = MapToNTCoords(feat, feat.GetProduct(), mismatches.front());
03014 prot_res = prot_vec[mismatches.front()];
03015 transl_res = Residue(transl_prot[mismatches.front()]);
03016 msg =
03017 NStr::IntToString(mismatches.size()) + " mismatches found. " +
03018 "First mismatch at " + NStr::IntToString(mismatches.front() + 1) +
03019 ", residue in protein [" + prot_res + "]" +
03020 " != translation [" + transl_res + "]";
03021 if (!nuclocstr.empty()) {
03022 msg += " at " + nuclocstr;
03023 }
03024 nuclocstr = MapToNTCoords(feat, feat.GetProduct(), mismatches.back());
03025 prot_res = prot_vec[mismatches.back()];
03026 transl_res = Residue(transl_prot[mismatches.back()]);
03027 msg +=
03028 ". Last mismatch at " + NStr::IntToString(mismatches.back() + 1) +
03029 ", residue in protein [" + prot_res + "]" +
03030 " != translation [" + transl_res + "]";
03031 if (!nuclocstr.empty()) {
03032 msg += " at " + nuclocstr;
03033 }
03034 msg += ". Genetic code [" + gccode + "]";
03035 if (report_errors && !mismatch_except) {
03036 PostErr(eDiag_Error, eErr_SEQ_FEAT_MisMatchAA, msg, feat);
03037 }
03038 } else {
03039
03040 for (size_t i = 0; i < mismatches.size(); ++i) {
03041 nuclocstr = MapToNTCoords(feat, feat.GetProduct(), mismatches[i]);
03042 prot_res = prot_vec[mismatches[i]];
03043 transl_res = Residue(transl_prot[mismatches[i]]);
03044 msg = farstr + "Residue " + NStr::IntToString(mismatches[i] + 1) +
03045 " in protein [" + prot_res + "]" +
03046 " != translation [" + transl_res + "]";
03047 if (!nuclocstr.empty()) {
03048 msg += " at " + nuclocstr;
03049 }
03050 if (report_errors && !mismatch_except) {
03051 PostErr(eDiag_Error, eErr_SEQ_FEAT_MisMatchAA, msg, feat);
03052 }
03053 }
03054 }
03055
03056 if (feat.CanGetPartial() && feat.GetPartial() &&
03057 mismatches.empty()) {
03058 if (!no_beg && !no_end) {
03059 if (report_errors) {
03060 if (!got_stop) {
03061 PostErr(eDiag_Error, eErr_SEQ_FEAT_PartialProblem,
03062 "End of location should probably be partial", feat);
03063 } else {
03064 PostErr(eDiag_Error, eErr_SEQ_FEAT_PartialProblem,
03065 "This SeqFeat should not be partial", feat);
03066 }
03067 }
03068 show_stop = false;
03069 }
03070 }
03071 }
03072
03073 ReportCdTransErrors(feat, show_stop, got_stop, no_end, ragged,
03074 report_errors, has_errors);
03075
03076 if (report_errors && transl_except) {
03077 if (prot_ok) {
03078 PostErr(eDiag_Warning, eErr_SEQ_FEAT_TranslExcept,
03079 "Unparsed transl_except qual (but protein is okay). Skipped", feat);
03080 } else {
03081 PostErr(eDiag_Warning, eErr_SEQ_FEAT_TranslExcept,
03082 "Unparsed transl_except qual. Skipped", feat);
03083 }
03084 }
03085
03086 if (!report_errors && has_errors) {
03087 PostErr(eDiag_Warning, eErr_SEQ_FEAT_UnnecessaryException,
03088 "CDS has exception but passes translation test", feat);
03089 }
03090 }
03091
03092
03093 bool CValidError_feat::x_ValidateCodeBreakNotOnCodon
03094 (const CSeq_feat& feat,
03095 const CSeq_loc& loc,
03096 const CCdregion& cdregion,
03097 bool report_errors)
03098 {
03099 TSeqPos len = GetLength(loc, m_Scope);
03100 bool has_errors = false;
03101
03102 FOR_EACH_CODEBREAK_ON_CDREGION (cbr, cdregion) {
03103 size_t codon_length = GetLength((*cbr)->GetLoc(), m_Scope);
03104 TSeqPos from = LocationOffset(loc, (*cbr)->GetLoc(),
03105 eOffset_FromStart, m_Scope);
03106 TSeqPos to = from + codon_length - 1;
03107
03108
03109 if (codon_length == 3 ||
03110 ((codon_length == 1 || codon_length == 2) && to == len - 1)) {
03111 size_t start_pos;
03112 switch (cdregion.GetFrame()) {
03113 case CCdregion::eFrame_two:
03114 start_pos = 1;
03115 break;
03116 case CCdregion::eFrame_three:
03117 start_pos = 2;
03118 break;
03119 default:
03120 start_pos = 0;
03121 break;
03122 }
03123 if ((from % 3) != start_pos) {
03124 if (report_errors) {
03125 PostErr(eDiag_Warning, eErr_SEQ_FEAT_TranslExceptPhase,
03126 "transl_except qual out of frame.", feat);
03127 }
03128 has_errors = true;
03129 }
03130 }
03131 }
03132 return has_errors;
03133 }
03134
03135
03136
03137 void CValidError_feat::ValidateGeneXRef(const CSeq_feat& feat)
03138 {
03139 const CGene_ref* grp = feat.GetGeneXref();
03140 if ( !grp || grp->IsSuppressed() ) {
03141 return;
03142 }
03143
03144
03145 if( m_Imp.AvoidPerfBottlenecks() ) {
03146 return;
03147 }
03148
03149
03150 CConstRef<CSeq_feat> overlap =
03151 GetOverlappingGene(feat.GetLocation(), *m_Scope);
03152 if ( !overlap ) {
03153 return;
03154 }
03155
03156 const CGene_ref& gene = overlap->GetData().GetGene();
03157 if ( !gene.IsSuppressed() ) {
03158 if ( gene.CanGetAllele() && !gene.GetAllele().empty() ) {
03159 const string& allele = gene.GetAllele();
03160
03161 FOR_EACH_GBQUAL_ON_FEATURE (qual_iter, feat) {
03162 const CGb_qual& qual = **qual_iter;
03163 if ( qual.CanGetQual() &&
03164 NStr::Compare(qual.GetQual(), "allele") == 0 ) {
03165 if ( qual.CanGetVal() &&
03166 NStr::CompareNocase(qual.GetVal(), allele) == 0 ) {
03167 PostErr(eDiag_Warning, eErr_SEQ_FEAT_InvalidQualifierValue,
03168 "Redundant allele qualifier (" + allele +
03169 ") on gene and feature", feat);
03170 } else if (feat.GetData().GetSubtype() != CSeqFeatData::eSubtype_variation) {
03171 PostErr(eDiag_Warning, eErr_SEQ_FEAT_InvalidQualifierValue,
03172 "Mismatched allele qualifier on gene (" + allele +
03173 ") and feature (" + qual.GetVal() +")", feat);
03174 }
03175 }
03176 }
03177 }
03178 }
03179
03180 string label, gene_label;
03181 grp->GetLabel(&label);
03182 gene.GetLabel(&gene_label);
03183
03184 if ( NStr::CompareNocase(label, gene_label) == 0 ) {
03185 PostErr(eDiag_Warning, eErr_SEQ_FEAT_UnnecessaryGeneXref,
03186 "Unnecessary gene cross-reference " + label, feat);
03187 }
03188 }
03189
03190
03191 void CValidError_feat::ValidateOperon(const CSeq_feat& gene)
03192 {
03193 CConstRef<CSeq_feat> operon =
03194 GetOverlappingOperon(gene.GetLocation(), *m_Scope);
03195 if ( !operon || !operon->CanGetQual() ) {
03196 return;
03197 }
03198
03199 string label;
03200 feature::GetLabel(gene, &label, feature::eContent, m_Scope);
03201 if ( label.empty() ) {
03202 return;
03203 }
03204
03205 FOR_EACH_GBQUAL_ON_FEATURE (qual_iter, gene) {
03206 const CGb_qual& qual = **qual_iter;
03207 if( qual.CanGetQual() && qual.CanGetVal() ) {
03208 if ( NStr::Compare(qual.GetQual(), "operon") == 0 &&
03209 NStr::CompareNocase(qual.GetVal(), label) ) {
03210 PostErr(eDiag_Warning, eErr_SEQ_FEAT_InvalidQualifierValue,
03211 "Operon is same as gene - " + label, gene);
03212 }
03213 }
03214 }
03215 }
03216
03217
03218 bool CValidError_feat::IsPartialAtSpliceSite
03219 (const CSeq_loc& loc,
03220 unsigned int tag)
03221 {
03222 if ( tag != eSeqlocPartial_Nostart && tag != eSeqlocPartial_Nostop ) {
03223 return false;
03224 }
03225
03226 CSeq_loc_CI first, last;
03227 for ( CSeq_loc_CI sl_iter(loc); sl_iter; ++sl_iter ) {
03228 if ( !first ) {
03229 first = sl_iter;
03230 }
03231 last = sl_iter;
03232 }
03233
03234 if ( first.GetStrand() != last.GetStrand() ) {
03235 return false;
03236 }
03237 CSeq_loc_CI temp = (tag == eSeqlocPartial_Nostart) ? first : last;
03238
03239 CBioseq_Handle bsh = m_Scope->GetBioseqHandle(temp.GetSeq_id());
03240 if ( !bsh ) {
03241 return false;
03242 }
03243
03244 TSeqPos acceptor = temp.GetRange().GetFrom();
03245 TSeqPos donor = temp.GetRange().GetTo();
03246 TSeqPos start = acceptor;
03247 TSeqPos stop = donor;
03248
03249 CSeqVector vec = bsh.GetSeqVector(CBioseq_Handle::eCoding_Iupac,
03250 temp.GetStrand());
03251 TSeqPos len = vec.size();
03252
03253 if ( temp.GetStrand() == eNa_strand_minus ) {
03254 swap(acceptor, donor);
03255 stop = len - donor - 1;
03256 start = len - acceptor - 1;
03257 }
03258
03259 bool result = false;
03260
03261
03262 if ( (tag == eSeqlocPartial_Nostop) && (stop < len - 2) ) {
03263 try {
03264 CSeqVector::TResidue res1 = vec[stop + 1];
03265 CSeqVector::TResidue res2 = vec[stop + 2];
03266
03267 if ( IsResidue(res1) && IsResidue(res2) ) {
03268 if ( (res1 == 'G' && res2 == 'T') ||
03269 (res1 == 'G' && res2 == 'C') ) {
03270 result = true;
03271 }
03272 }
03273 } catch ( exception& ) {
03274 return false;
03275 }
03276 } else if ( (tag == eSeqlocPartial_Nostart) && (start > 1) ) {
03277 try {
03278 CSeqVector::TResidue res1 = vec[start - 2];
03279 CSeqVector::TResidue res2 = vec[start - 1];
03280
03281 if ( IsResidue(res1) && IsResidue(res2) ) {
03282 if ( (res1 == 'A') && (res2 == 'G') ) {
03283 result = true;
03284 }
03285 }
03286 } catch ( exception& ) {
03287 return false;
03288 }
03289 }
03290
03291 return result;
03292 }
03293
03294
03295 void CValidError_feat::ValidateFeatCit
03296 (const CPub_set& cit,
03297 const CSeq_feat& feat)
03298 {
03299 if ( !feat.CanGetCit() ) {
03300 return;
03301 }
03302
03303 if ( cit.IsPub() ) {
03304 ITERATE( CPub_set::TPub, pi, cit.GetPub() ) {
03305 if ( (*pi)->IsEquiv() ) {
03306 PostErr(eDiag_Warning, eErr_SEQ_FEAT_UnnecessaryCitPubEquiv,
03307 "Citation on feature has unexpected internal Pub-equiv",
03308 feat);
03309 return;
03310 }
03311 }
03312 }
03313 }
03314
03315 void CValidError_feat::ValidateFeatComment
03316 (const string& comment,
03317 const CSeq_feat& feat)
03318 {
03319 if ( m_Imp.IsSerialNumberInComment(comment) ) {
03320 PostErr(eDiag_Info, eErr_SEQ_FEAT_SerialInComment,
03321 "Feature comment may refer to reference by serial number - "
03322 "attach reference specific comments to the reference "
03323 "REMARK instead.", feat);
03324 }
03325 }
03326
03327
03328 void CValidError_feat::ValidateFeatBioSource
03329 (const CBioSource& bsrc,
03330 const CSeq_feat& feat)
03331 {
03332 if ( bsrc.IsSetIs_focus() ) {
03333 PostErr(eDiag_Error, eErr_SEQ_FEAT_FocusOnBioSourceFeature,
03334 "Focus must be on BioSource descriptor, not BioSource feature.",
03335 feat);
03336 }
03337
03338 CBioseq_Handle bsh = m_Scope->GetBioseqHandle(feat.GetLocation());
03339 if ( !bsh ) {
03340 return;
03341 }
03342 CSeqdesc_CI dbsrc_i(bsh, CSeqdesc::e_Source);
03343 if ( !dbsrc_i ) {
03344 return;
03345 }
03346
03347 const COrg_ref& org = bsrc.GetOrg();
03348 const CBioSource& dbsrc = dbsrc_i->GetSource();
03349 const COrg_ref& dorg = dbsrc.GetOrg();
03350
03351 if ( org.CanGetTaxname() && !org.GetTaxname().empty() &&
03352 dorg.CanGetTaxname() ) {
03353 string taxname = org.GetTaxname();
03354 string dtaxname = dorg.GetTaxname();
03355 if ( NStr::CompareNocase(taxname, dtaxname) != 0 ) {
03356 if ( !dbsrc.CanGetIs_focus() && !m_Imp.IsTransgenic(dbsrc) ) {
03357 PostErr(eDiag_Error, eErr_SEQ_DESCR_BioSourceNeedsFocus,
03358 "BioSource descriptor must have focus or transgenic "
03359 "when BioSource feature with different taxname is "
03360 "present.", feat);
03361 }
03362 }
03363 }
03364
03365 FOR_EACH_DBXREF_ON_FEATURE (it, feat) {
03366 if ((*it)->GetType() == CDbtag::eDbtagType_taxon) {
03367 PostErr(eDiag_Warning, eErr_SEQ_FEAT_TaxonDbxrefOnFeature,
03368 "BioSource feature has taxon xref in common feature db_xref list",
03369 feat);
03370 }
03371 }
03372
03373 m_Imp.ValidateBioSource(bsrc, feat);
03374 }
03375
03376
03377 void CValidError_feat::ValidateBondLocs(const CSeq_feat& feat)
03378 {
03379 _ASSERT( ! feat.GetData().IsBond() );
03380
03381
03382 CSeq_loc_CI loc_ci(feat.GetLocation());
03383 for (; loc_ci; ++loc_ci) {
03384 if (loc_ci.GetSeq_loc().IsBond()) {
03385 PostErr(eDiag_Warning, eErr_SEQ_FEAT_ImproperBondLocation,
03386 "Bond location should only be on bond features", feat);
03387 break;
03388 }
03389 }
03390 }
03391
03392
03393 bool CValidError_feat::IsSameAsCDS(const CSeq_feat& feat)
03394 {
03395 EOverlapType overlap_type;
03396 if ( feat.GetData().IsGene() ) {
03397 overlap_type = eOverlap_Simple;
03398 } else if ( feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_mRNA ) {
03399 overlap_type = eOverlap_CheckIntervals;
03400 } else {
03401 return false;
03402 }
03403
03404 CConstRef<CSeq_feat> cds = GetBestOverlappingFeat(
03405 feat.GetLocation(),
03406 CSeqFeatData::e_Cdregion,
03407 overlap_type,
03408 *m_Scope);
03409 if ( cds ) {
03410 if ( TestForOverlap(
03411 cds->GetLocation(),
03412 feat.GetLocation(),
03413 eOverlap_Simple) == 0 ) {
03414 return true;
03415 }
03416 }
03417 return false;
03418 }
03419
03420
03421 bool CValidError_feat::IsCDDFeat(const CSeq_feat& feat) const
03422 {
03423 if ( feat.GetData().IsRegion() ) {
03424 if ( feat.CanGetDbxref() ) {
03425 FOR_EACH_DBXREF_ON_FEATURE (db, feat) {
03426 if ( (*db)->CanGetDb() &&
03427 NStr::Compare((*db)->GetDb(), "CDD") == 0 ) {
03428 return true;
03429 }
03430 }
03431 }
03432 }
03433 return false;
03434 }
03435
03436 void CValidError_feat::x_ValidateSeqFeatLoc(const CSeq_feat& feat)
03437 {
03438
03439
03440 CSeq_entry_Handle tse = m_Imp.GetTSEH();
03441 const CSeq_loc& loc = feat.GetLocation();
03442 if (IsOneBioseq(loc, m_Scope)) {
03443 return;
03444 }
03445
03446 CBioseq_Handle bsh;
03447 for (CSeq_loc_CI it(loc) ;it; ++it) {
03448 CBioseq_Handle seq = m_Scope->GetBioseqHandleFromTSE(it.GetSeq_id(), tse);
03449 if (seq && !seq.GetExactComplexityLevel(CBioseq_set::eClass_parts)) {
03450 if (bsh && seq != bsh) {
03451 PostErr(eDiag_Error, eErr_SEQ_FEAT_MultipleBioseqs,
03452 "Feature location refers to multiple near bioseqs.", feat);
03453 } else {
03454 bsh = seq;
03455 }
03456 }
03457 }
03458 }
03459
03460
03461 END_SCOPE(validator)
03462 END_SCOPE(objects)
03463 END_NCBI_SCOPE
03464
03465