NCBI Home IEB Home C Toolkit docs C++ Toolkit source browser C Toolkit source browser (2) |
NCBI C Toolkit Cross Reference: C/demo/tbl2asn.c |
source navigation diff markup identifier search freetext search file search |
1 /* tbl2asn.c 2 * =========================================================================== 3 * 4 * PUBLIC DOMAIN NOTICE 5 * National Center for Biotechnology Information (NCBI) 6 * 7 * This software/database is a "United States Government Work" under the 8 * terms of the United States Copyright Act. It was written as part of 9 * the author's official duties as a United States Government employee and 10 * thus cannot be copyrighted. This software/database is freely available 11 * to the public for use. The National Library of Medicine and the U.S. 12 * Government do not place any restriction on its use or reproduction. 13 * We would, however, appreciate having the NCBI and the author cited in 14 * any work or product based on this material 15 * 16 * Although all reasonable efforts have been taken to ensure the accuracy 17 * and reliability of the software and data, the NLM and the U.S. 18 * Government do not and cannot warrant the performance or results that 19 * may be obtained by using this software or data. The NLM and the U.S. 20 * Government disclaim all warranties, express or implied, including 21 * warranties of performance, merchantability or fitness for any particular 22 * purpose. 
23 * 24 * =========================================================================== 25 * 26 * File Name: tbl2asn.c 27 * 28 * Author: Jonathan Kans 29 * 30 * Version Creation Date: 5/5/00 31 * 32 * $Revision: 6.277 $ 33 * 34 * File Description: 35 * 36 * Modifications: 37 * -------------------------------------------------------------------------- 38 * Date Name Description of modification 39 * ------- ---------- ----------------------------------------------------- 40 * 41 * 42 * ========================================================================== 43 */ 44 45 #include <ncbi.h> 46 #include <objall.h> 47 #include <objsset.h> 48 #include <objsub.h> 49 #include <objfdef.h> 50 #include <sequtil.h> 51 #include <edutil.h> 52 #include <seqport.h> 53 #include <gather.h> 54 #include <sqnutils.h> 55 #include <subutil.h> 56 #include <toasn3.h> 57 #include <valid.h> 58 #include <asn2gnbk.h> 59 #include <explore.h> 60 #include <tofasta.h> 61 #include <simple.h> 62 #include <suggslp.h> 63 #include <aliparse.h> 64 #include <util/creaders/alnread.h> 65 #include <pmfapi.h> 66 #include <tax3api.h> 67 #ifdef INTERNAL_NCBI_TBL2ASN 68 #include <accpubseq.h> 69 #endif 70 #define NLM_GENERATED_CODE_PROTO 71 #include <asnmacro.h> 72 #include <objmacro.h> 73 #include <macroapi.h> 74 75 #define TBL2ASN_APP_VER "13.2" 76 77 CharPtr TBL2ASN_APPLICATION = TBL2ASN_APP_VER; 78 79 typedef struct cleanupargs { 80 Boolean collection_dates; 81 Boolean collection_dates_month_first; 82 Boolean add_notes_to_overlapping_cds_without_abc; 83 } CleanupArgsData, PNTR CleanupArgsPtr; 84 85 typedef struct tblargs { 86 Boolean raw2delt; 87 Int2 r2dmin; 88 Boolean r2dunk100; 89 Boolean fastaset; 90 Int2 whichclass; 91 Boolean deltaset; 92 Boolean alignset; 93 Boolean gapped; 94 Boolean phrapace; 95 Boolean genprodset; 96 Boolean linkbyoverlap; 97 Boolean linkbyproduct; 98 Boolean implicitgaps; 99 Boolean forcelocalid; 100 Boolean gpstonps; 101 Boolean gnltonote; 102 Boolean removeunnecxref; 103 Boolean 
dotaxlookup; 104 Boolean dopublookup; 105 CharPtr accn; 106 CharPtr center; 107 CharPtr organism; 108 CharPtr srcquals; 109 CharPtr comment; 110 CharPtr commentFile; 111 CharPtr tableFile; 112 Boolean findorf; 113 Boolean runonorf; 114 Boolean altstart; 115 Boolean conflict; 116 Boolean validate; 117 Boolean relaxed; 118 Boolean validate_barcode; 119 Boolean flatfile; 120 Boolean genereport; 121 Boolean seqidfromfile; 122 Boolean smartfeats; 123 Boolean smarttitle; 124 Boolean logtoterminal; 125 CharPtr aln_beginning_gap; 126 CharPtr aln_end_gap; 127 CharPtr aln_middle_gap; 128 CharPtr aln_missing; 129 CharPtr aln_match; 130 Boolean aln_is_protein; 131 Boolean save_bioseq_set; 132 133 GlobalDiscrepReportPtr global_report; 134 135 CleanupArgsData cleanup_args; 136 } TblArgs, PNTR TblArgsPtr; 137 138 static FILE* OpenOneFile ( 139 CharPtr directory, 140 CharPtr base, 141 CharPtr suffix 142 ) 143 144 { 145 Char file [FILENAME_MAX], path [PATH_MAX]; 146 147 if (base == NULL) { 148 base = ""; 149 } 150 if (suffix == NULL) { 151 suffix = ""; 152 } 153 154 StringNCpy_0 (path, directory, sizeof (path)); 155 sprintf (file, "%s%s", base, suffix); 156 FileBuildPath (path, NULL, file); 157 158 return FileOpen (path, "r"); 159 } 160 161 static void WriteOneFile ( 162 CharPtr results, 163 CharPtr base, 164 CharPtr suffix, 165 CharPtr outfile, 166 SeqEntryPtr sep, 167 SubmitBlockPtr sbp, 168 Boolean save_bioseq_set 169 ) 170 171 { 172 AsnIoPtr aip; 173 BioseqSetPtr bssp; 174 Char file [FILENAME_MAX], path [PATH_MAX]; 175 SeqSubmit ssb; 176 177 if (sep == NULL || sep->data.ptrvalue == NULL) return; 178 179 MemSet ((Pointer) &ssb, 0, sizeof (SeqSubmit)); 180 ssb.sub = sbp; 181 ssb.datatype = 1; 182 ssb.data = (Pointer) sep; 183 184 if (StringDoesHaveText (outfile)) { 185 StringNCpy_0 (path, outfile, sizeof (path)); 186 } else { 187 StringNCpy_0 (path, results, sizeof (path)); 188 sprintf (file, "%s%s", base, suffix); 189 FileBuildPath (path, NULL, file); 190 } 191 192 aip = 
AsnIoOpen (path, "w"); 193 if (aip == NULL) return; 194 195 if (sbp != NULL) { 196 SeqSubmitAsnWrite (&ssb, aip, NULL); 197 } else if (save_bioseq_set && IS_Bioseq_set (sep)) { 198 bssp = (BioseqSetPtr) sep->data.ptrvalue; 199 BioseqSetAsnWrite (bssp, aip, NULL); 200 } else { 201 SeqEntryAsnWrite (sep, aip, NULL); 202 } 203 204 AsnIoFlush (aip); 205 AsnIoClose (aip); 206 } 207 208 static CharPtr compatSeverityLabel [] = { 209 "NONE", "NOTE: valid", "WARNING: valid", "ERROR: valid", "REJECT: valid", "FATAL: valid", "MAX", NULL 210 }; 211 212 static void LIBCALLBACK ValidCallback ( 213 ErrSev severity, 214 int errcode, 215 int subcode, 216 Uint2 entityID, 217 Uint2 itemtype, 218 Uint4 itemID, 219 CharPtr accession, 220 CharPtr message, 221 CharPtr objtype, 222 CharPtr label, 223 CharPtr context, 224 CharPtr location, 225 CharPtr product, 226 Pointer userdata 227 ) 228 229 { 230 CharPtr catname, errname; 231 FILE *fp; 232 233 fp = (FILE *) userdata; 234 if (fp == NULL) return; 235 236 if (severity < SEV_NONE || severity > SEV_MAX) { 237 severity = SEV_MAX; 238 } 239 240 catname = GetValidCategoryName (errcode); 241 errname = GetValidErrorName (errcode, subcode); 242 243 if (catname == NULL) { 244 catname = "?"; 245 } 246 if (errname == NULL) { 247 errname = "?"; 248 } 249 250 if (accession == NULL) { 251 accession = ""; 252 } 253 if (message == NULL) { 254 message = ""; 255 } 256 if (objtype == NULL) { 257 objtype = ""; 258 } 259 if (label == NULL) { 260 label = ""; 261 } 262 263 fprintf (fp, "%s [%s.%s] %s %s: %s", 264 compatSeverityLabel [severity], 265 catname, errname, message, objtype, label); 266 if (location != NULL) { 267 fprintf (fp, " %s", location); 268 } 269 if (context != NULL) { 270 fprintf (fp, " %s", context); 271 } 272 if (product != NULL) { 273 fprintf (fp, " -> %s", product); 274 } 275 fprintf (fp, "\n"); 276 } 277 278 279 static void ValidateOneFile ( 280 CharPtr results, 281 CharPtr base, 282 CharPtr suffix, 283 SeqEntryPtr sep, 284 Boolean 
standard, 285 Boolean relaxed, 286 Boolean barcode 287 ) 288 289 { 290 Char file [FILENAME_MAX], path [PATH_MAX]; 291 FILE *ofp; 292 ErrSev oldErrSev; 293 ValidStructPtr vsp; 294 295 StringNCpy_0 (path, results, sizeof (path)); 296 sprintf (file, "%s%s", base, suffix); 297 FileBuildPath (path, NULL, file); 298 299 ofp = FileOpen (path, "w"); 300 301 if (standard) { 302 vsp = ValidStructNew (); 303 if (vsp != NULL) { 304 vsp->useSeqMgrIndexes = TRUE; 305 vsp->suppressContext = TRUE; 306 vsp->seqSubmitParent = TRUE; 307 if (! relaxed) { 308 vsp->testLatLonSubregion = TRUE; 309 } 310 oldErrSev = ErrSetMessageLevel (SEV_NONE); 311 vsp->errfunc = ValidCallback; 312 vsp->userdata = (Pointer) ofp; 313 /* vsp->convertGiToAccn = FALSE; */ 314 ValidateSeqEntry (sep, vsp); 315 ValidStructFree (vsp); 316 ErrSetMessageLevel (oldErrSev); 317 } 318 } 319 /* Barcode results if requested */ 320 if (barcode) { 321 BarcodeValidateOneSeqEntry (ofp, sep, TRUE, FALSE, TRUE, NULL); 322 } 323 324 FileClose (ofp); 325 } 326 327 static void FlatfileOneFile ( 328 CharPtr results, 329 CharPtr base, 330 CharPtr suffix, 331 SeqEntryPtr sep 332 ) 333 334 { 335 Char file [FILENAME_MAX], path [PATH_MAX]; 336 FILE *fp; 337 ErrSev oldErrSev; 338 339 StringNCpy_0 (path, results, sizeof (path)); 340 sprintf (file, "%s%s", base, suffix); 341 FileBuildPath (path, NULL, file); 342 343 fp = FileOpen (path, "w"); 344 if (fp == NULL) return; 345 346 oldErrSev = ErrSetMessageLevel (SEV_MAX); 347 SeqEntryToGnbk (sep, NULL, GENBANK_FMT, ENTREZ_MODE, NORMAL_STYLE, 0, 0, 0, NULL, fp); 348 ErrSetMessageLevel (oldErrSev); 349 350 FileClose (fp); 351 } 352 353 /* for full-length cDNAs, allow automatic annotation of largest internal ORF */ 354 355 typedef struct orfdata { 356 Int4 curlen [6], bestlen [6], currstart [6], beststart [6], sublen [6]; 357 Boolean inorf [6], altstart, runonorf; 358 Int4 bioseq_len; 359 } OrfData, PNTR OrfDataPtr; 360 361 static Boolean TreatLikeStop (Int2 frame, Int4 pos, Uint1 strand, 
Int4 len) 362 { 363 Int4 remainder = len % 3; 364 Boolean like_stop = FALSE; 365 366 if (strand == Seq_strand_minus) { 367 if (pos < 3) { 368 like_stop = TRUE; 369 } 370 } else { 371 if (pos >= len - remainder - 3) { 372 like_stop = TRUE; 373 } 374 } 375 return like_stop; 376 } 377 378 static void LIBCALLBACK LookForOrfs ( 379 Int4 position, 380 Char residue, 381 Boolean atgStart, 382 Boolean altStart, 383 Boolean orfStop, 384 Int2 frame, 385 Uint1 strand, 386 Pointer userdata 387 ) 388 389 { 390 Int2 idx; 391 OrfDataPtr odp; 392 Boolean start_of_seq = FALSE; 393 394 odp = (OrfDataPtr) userdata; 395 if (strand == Seq_strand_plus) { 396 397 /* top strand */ 398 399 idx = frame; 400 if (odp->inorf [idx]) { 401 if (!orfStop && odp->runonorf) { 402 /* treat the end of the sequence like a stop codon */ 403 if (TreatLikeStop(frame, position, strand, odp->bioseq_len)) { 404 (odp->curlen[idx])++; 405 orfStop = TRUE; 406 } 407 } 408 409 if (orfStop) { 410 odp->inorf [idx] = FALSE; 411 if (odp->curlen [idx] > odp->bestlen [idx]) { 412 odp->bestlen [idx] = odp->curlen [idx]; 413 odp->beststart [idx] = odp->currstart [idx]; 414 } 415 } else { 416 (odp->curlen [idx])++; 417 } 418 } else if (atgStart || (altStart && odp->altstart)) { 419 odp->inorf [idx] = TRUE; 420 odp->curlen [idx] = 1; 421 odp->currstart [idx] = position - frame; 422 } 423 } else { 424 425 /* bottom strand */ 426 427 idx = frame + 3; 428 429 if (!orfStop && odp->runonorf) { 430 start_of_seq = TreatLikeStop (frame, position, strand, odp->bioseq_len); 431 } 432 433 if (orfStop) { 434 odp->curlen [idx] = 0; 435 odp->sublen [idx] = 0; 436 odp->currstart [idx] = position - frame; 437 } else if (start_of_seq) { 438 odp->curlen [idx] = 1; 439 odp->sublen [idx] = 1; 440 odp->currstart [idx] = position - frame - 3; 441 if (odp->curlen [idx] > odp->bestlen [idx]) { 442 odp->bestlen [idx] = odp->curlen [idx]; 443 odp->beststart [idx] = odp->currstart [idx]; 444 } 445 } else if (atgStart || (altStart && odp->altstart)) { 
446 (odp->sublen [idx])++; 447 odp->curlen [idx] = odp->sublen [idx]; 448 if (odp->curlen [idx] > odp->bestlen [idx]) { 449 odp->bestlen [idx] = odp->curlen [idx]; 450 odp->beststart [idx] = odp->currstart [idx]; 451 } 452 } else { 453 (odp->sublen [idx])++; 454 } 455 } 456 } 457 458 static SeqFeatPtr AnnotateBestOrf ( 459 BioseqPtr bsp, 460 Int2 genCode, 461 Boolean altstart, 462 Boolean runonorf, 463 SqnTagPtr stp 464 ) 465 466 { 467 SeqFeatPtr cds = NULL; 468 CdRegionPtr crp; 469 GeneRefPtr grp; 470 Int2 i, best, idx; 471 OrfData od; 472 ProtRefPtr prp; 473 SeqFeatPtr sfp; 474 SeqInt sint; 475 CharPtr str; 476 TransTablePtr ttp; 477 ValNode vn; 478 SeqFeatXrefPtr xref; 479 Boolean partial5 = FALSE, partial3 = FALSE; 480 481 if (bsp == NULL) return NULL; 482 for (i = 0; i < 6; i++) { 483 od.curlen [i] = INT4_MIN; 484 od.bestlen [i] = 0; 485 od.currstart [i] = 0; 486 od.beststart [i] = 0; 487 od.sublen [i] = INT4_MIN; 488 od.inorf [i] = FALSE; 489 } 490 od.altstart = altstart; 491 od.runonorf = runonorf; 492 od.bioseq_len = bsp->length; 493 494 /* use simultaneous 6-frame translation finite state machine */ 495 496 ttp = PersistentTransTableByGenCode (genCode); 497 if (ttp != NULL) { 498 TransTableProcessBioseq (ttp, LookForOrfs, (Pointer) &od, bsp); 499 } 500 /* TransTableFree (tbl); - now using persistent tables, free at end */ 501 best = -1; 502 idx = -1; 503 for (i = 0; i < 6; i++) { 504 if (od.bestlen [i] > best) { 505 best = od.bestlen [i]; 506 idx = i; 507 } 508 } 509 if (idx == -1) return NULL; 510 511 /* make feature location on largest ORF */ 512 513 if (idx < 3) { 514 MemSet ((Pointer) &sint, 0, sizeof (SeqInt)); 515 sint.from = od.beststart [idx] + idx; 516 sint.to = sint.from + (od.bestlen [idx]) * 3 + 2; 517 if (sint.to > od.bioseq_len - 1) { 518 sint.to = od.bioseq_len - 1; 519 partial3 = TRUE; 520 } 521 sint.id = SeqIdFindBest (bsp->id, 0); 522 sint.strand = Seq_strand_plus; 523 vn.choice = SEQLOC_INT; 524 vn.extended = 0; 525 vn.data.ptrvalue = 
(Pointer) &sint; 526 vn.next = NULL; 527 } else { 528 MemSet ((Pointer) &sint, 0, sizeof (SeqInt)); 529 sint.from = od.beststart [idx] + idx - 3; 530 sint.to = sint.from + (od.bestlen [idx]) * 3 + 2; 531 if (sint.from < 0) { 532 sint.from = 0; 533 partial3 = TRUE; 534 } 535 sint.id = SeqIdFindBest (bsp->id, 0); 536 sint.strand = Seq_strand_minus; 537 vn.choice = SEQLOC_INT; 538 vn.extended = 0; 539 vn.data.ptrvalue = (Pointer) &sint; 540 vn.next = NULL; 541 } 542 543 SetSeqLocPartial (&vn, partial5, partial3); 544 545 /* make CDS feature with unknown product - now check [protein=...] */ 546 547 cds = CreateNewFeatureOnBioseq (bsp, SEQFEAT_CDREGION, &vn); 548 if (cds == NULL) return NULL; 549 if (partial5 || partial3) { 550 cds->partial = TRUE; 551 } 552 crp = CreateNewCdRgn (1, FALSE, genCode); 553 if (crp == NULL) return NULL; 554 crp->frame = 1; 555 cds->data.value.ptrvalue = (Pointer) crp; 556 557 prp = ProtRefNew (); 558 if (prp == NULL) return cds; 559 xref = SeqFeatXrefNew (); 560 if (xref == NULL) return cds; 561 xref->data.choice = SEQFEAT_PROT; 562 xref->data.value.ptrvalue = (Pointer) prp; 563 xref->next = cds->xref; 564 cds->xref = xref; 565 prp = ParseTitleIntoProtRef (stp, prp); 566 if (prp->name == NULL && prp->desc == NULL) { 567 prp->name = ValNodeCopyStr (NULL, 0, "unknown"); 568 } 569 570 /* parse CDS comment ("note" goes to biosource) and experimental evidence */ 571 572 str = SqnTagFind (stp, "comment"); 573 if (StringDoesHaveText (str)) { 574 cds->comment = StringSave (str); 575 } 576 577 str = SqnTagFind (stp, "evidence"); 578 if (StringICmp (str, "experimental") == 0) { 579 cds->exp_ev = 1; 580 } 581 582 /* now check [gene=...], make gene feature if locus or synonym present */ 583 584 grp = GeneRefNew (); 585 if (grp == NULL) return cds; 586 grp = ParseTitleIntoGeneRef (stp, grp); 587 if (grp->locus == NULL && grp->syn == NULL) { 588 GeneRefFree (grp); 589 return cds; 590 } 591 sfp = CreateNewFeatureOnBioseq (bsp, SEQFEAT_GENE, NULL); 592 if 
(sfp == NULL) return cds; 593 sfp->data.value.ptrvalue = (Pointer) grp; 594 595 return cds; 596 } 597 598 /* change all feature IDs to entered accession */ 599 600 static void PromoteSeqId ( 601 SeqIdPtr sip, 602 Pointer userdata 603 ) 604 605 { 606 SeqIdPtr bestid, newid, oldid; 607 608 bestid = (SeqIdPtr) userdata; 609 610 newid = SeqIdDup (bestid); 611 if (newid == NULL) return; 612 613 oldid = ValNodeNew (NULL); 614 if (oldid == NULL) return; 615 616 MemCopy (oldid, sip, sizeof (ValNode)); 617 oldid->next = NULL; 618 619 sip->choice = newid->choice; 620 sip->data.ptrvalue = newid->data.ptrvalue; 621 622 SeqIdFree (oldid); 623 ValNodeFree (newid); 624 625 SeqIdStripLocus (sip); 626 } 627 628 static void CorrectFeatureSeqIds ( 629 SeqFeatPtr sfp, 630 Pointer userdata 631 ) 632 633 { 634 VisitSeqIdsInSeqLoc (sfp->location, userdata, PromoteSeqId); 635 } 636 637 static void CorrectGraphSeqIds ( 638 SeqGraphPtr sgp, 639 Pointer userdata 640 ) 641 642 { 643 VisitSeqIdsInSeqGraph (sgp, userdata, PromoteSeqId); 644 } 645 646 /* source information for several common organisms sequenced by genome centers */ 647 648 typedef struct orgstuff { 649 CharPtr taxname; 650 CharPtr common; 651 CharPtr lineage; 652 CharPtr division; 653 Uint1 gcode; 654 Uint1 mgcode; 655 Int4 taxID; 656 } OrgStuff, PNTR OrfStuffPtr; 657 658 static OrgStuff commonOrgStuff [] = { 659 { 660 "Saccharomyces cerevisiae", "baker's yeast", 661 "Eukaryota; Fungi; Ascomycota; Saccharomycetes; Saccharomycetales; Saccharomycetaceae; Saccharomyces", 662 "PLN", 1, 3, 4932 663 }, 664 { 665 "Drosophila melanogaster", "fruit fly", 666 "Eukaryota; Metazoa; Arthropoda; Tracheata; Hexapoda; Insecta; Pterygota; Neoptera; Endopterygota; Diptera; Brachycera; Muscomorpha; Ephydroidea; Drosophilidae; Drosophila", 667 "INV", 1, 5, 7227 668 }, 669 { 670 "Homo sapiens", "human", 671 "Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; Mammalia; Eutheria; Primates; Catarrhini; Hominidae; Homo", 672 "PRI", 1, 2, 
9606 673 }, 674 { 675 "Escherichia coli", "", 676 "Bacteria; Proteobacteria; gamma subdivision; Enterobacteriaceae; Escherichia", 677 "BCT", 11, 0, 562 678 }, 679 { 680 "Helicobacter pylori", "", 681 "Bacteria; Proteobacteria; epsilon subdivision; Helicobacter group; Helicobacter", 682 "BCT", 11, 0, 210 683 }, 684 { 685 "Arabidopsis thaliana", "thale cress", 686 "Eukaryota; Viridiplantae; Embryophyta; Tracheophyta; Spermatophyta; Magnoliophyta; eudicotyledons; core eudicots; Rosidae; eurosids II; Brassicales; Brassicaceae; Arabidopsis", 687 "PLN", 1, 1, 3702 688 }, 689 { 690 "Mus musculus", "house mouse", 691 "Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; Mammalia; Eutheria; Rodentia; Sciurognathi; Muridae; Murinae; Mus", 692 "ROD", 1, 2, 10090 693 }, 694 { 695 "Rattus norvegicus", "Norway rat", 696 "Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; Mammalia; Eutheria; Rodentia; Sciurognathi; Muridae; Murinae; Rattus", 697 "ROD", 1, 2, 10116 698 }, 699 { 700 "Danio rerio", "zebrafish", 701 "Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; Actinopterygii; Neopterygii; Teleostei; Euteleostei; Ostariophysi; Cypriniformes; Cyprinidae; Rasborinae; Danio", 702 "VRT", 1, 2, 7955 703 }, 704 { 705 "Zea mays", "", 706 "Eukaryota; Viridiplantae; Embryophyta; Tracheophyta; Spermatophyta; Magnoliophyta; Liliopsida; Poales; Poaceae; Zea", 707 "PLN", 1, 1, 4577 708 }, 709 { 710 "Caenorhabditis elegans", "", 711 "Eukaryota; Metazoa; Nematoda; Chromadorea; Rhabditida; Rhabditoidea; Rhabditidae; Peloderinae; Caenorhabditis", 712 "INV", 1, 5, 6239 713 }, 714 { 715 "Caenorhabditis briggsae", "", 716 "Eukaryota; Metazoa; Nematoda; Chromadorea; Rhabditida; Rhabditoidea; Rhabditidae; Peloderinae; Caenorhabditis", 717 "INV", 1, 5, 6238 718 }, 719 { 720 "Anopheles gambiae", "African malaria mosquito", 721 "Eukaryota; Metazoa; Arthropoda; Tracheata; Hexapoda; Insecta; Pterygota; Neoptera; Endopterygota; Diptera; Nematocera; Culicoidea; 
Anopheles", 722 "INV", 1, 5, 7165 723 }, 724 { 725 "Anopheles gambiae str. PEST", "African malaria mosquito", 726 "Eukaryota; Metazoa; Arthropoda; Tracheata; Hexapoda; Insecta; Pterygota; Neoptera; Endopterygota; Diptera; Nematocera; Culicoidea; Anopheles", 727 "INV", 1, 5, 180454 728 }, 729 { 730 "Tetrahymena thermophila", "", 731 "Eukaryota; Alveolata; Ciliophora; Oligohymenophorea; Hymenostomatida; Tetrahymenina; Tetrahymena", 732 "INV", 6, 4, 5911 733 }, 734 { 735 "Pan troglodytes", "chimpanzee", 736 "Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; Mammalia; Eutheria; Primates; Catarrhini; Hominidae; Pan", 737 "PRI", 1, 2, 9598 738 }, 739 { 740 "Candida albicans", "", 741 "Eukaryota; Fungi; Ascomycota; Saccharomycotina; Saccharomycetes; Saccharomycetales; mitosporic Saccharomycetales; Candida", 742 "PLN", 12, 4, 5476 743 }, 744 { 745 "Candida albicans SC5314", "", 746 "Eukaryota; Fungi; Ascomycota; Saccharomycotina; Saccharomycetes; Saccharomycetales; mitosporic Saccharomycetales; Candida", 747 "PLN", 12, 4, 237561 748 }, 749 { 750 "Trypanosoma brucei", "", 751 "Eukaryota; Euglenozoa; Kinetoplastida; Trypanosomatidae; Trypanosoma", 752 "INV", 1, 4, 5691 753 }, 754 { 755 "Trypanosoma cruzi", "", 756 "Eukaryota; Euglenozoa; Kinetoplastida; Trypanosomatidae; Trypanosoma; Schizotrypanum", 757 "INV", 1, 4, 5693 758 }, 759 { 760 "Oryza sativa", "", 761 "Eukaryota; Viridiplantae; Streptophyta; Embryophyta; Tracheophyta; Spermatophyta; Magnoliophyta; Liliopsida; Poales; Poaceae; Ehrhartoideae; Oryzeae; Oryza", 762 "PLN", 1, 1, 4530 763 }, 764 { 765 "Oryza sativa (indica cultivar-group)", "", 766 "Eukaryota; Viridiplantae; Streptophyta; Embryophyta; Tracheophyta; Spermatophyta; Magnoliophyta; Liliopsida; Poales; Poaceae; Ehrhartoideae; Oryzeae; Oryza", 767 "PLN", 1, 1, 39946 768 }, 769 { 770 "Oryza sativa (japonica cultivar-group)", "", 771 "Eukaryota; Viridiplantae; Streptophyta; Embryophyta; Tracheophyta; Spermatophyta; Magnoliophyta; Liliopsida; 
Poales; Poaceae; Ehrhartoideae; Oryzeae; Oryza", 772 "PLN", 1, 1, 39947 773 }, 774 { 775 "Aspergillus nidulans FGSC A4", "", 776 "Eukaryota; Fungi; Ascomycota; Pezizomycotina; Eurotiomycetes; Eurotiales; Trichocomaceae; Emericella", 777 "PLN", 1, 4, 227321 778 }, 779 { 780 "environmental sequence", "", 781 "unclassified; environmental samples", 782 "UNA", 1, 2, 256318 783 }, 784 { 785 NULL, NULL, NULL, NULL, 0, 0, 0 786 } 787 }; 788 789 static Boolean HasTaxon ( 790 OrgRefPtr orp 791 ) 792 793 { 794 ValNodePtr db; 795 DbtagPtr dbt; 796 797 if (orp == FALSE) return FALSE; 798 for (db = orp->db; db != NULL; db = db->next) { 799 dbt = (DbtagPtr) db->data.ptrvalue; 800 if (dbt != NULL && dbt->db != NULL && 801 StringICmp (dbt->db, "taxon") == 0) return TRUE; 802 } 803 return FALSE; 804 } 805 806 static void AddMissingSourceInfo ( 807 BioSourcePtr biop 808 ) 809 810 { 811 ValNodePtr db; 812 DbtagPtr dbt; 813 Int2 idx; 814 ObjectIdPtr oip; 815 OrgNamePtr onp; 816 OrgRefPtr orp; 817 OrfStuffPtr osp; 818 819 if (biop == NULL) return; 820 orp = biop->org; 821 if (orp == NULL) return; 822 onp = orp->orgname; 823 if (onp == NULL) return; 824 825 /* look for entry of organisms in commonOrgStuff table */ 826 827 for (idx = 0; commonOrgStuff [idx].taxname != NULL; idx++) { 828 osp = &(commonOrgStuff [idx]); 829 if (StringICmp (orp->taxname, osp->taxname) == 0) { 830 if (StringCmp (orp->taxname, osp->taxname) != 0) { 831 /* fix capitalization of supplied name if in common organism list */ 832 StringCpy (orp->taxname, osp->taxname); 833 } 834 if (StringHasNoText (orp->common) && StringDoesHaveText (osp->common)) { 835 orp->common = StringSave (osp->common); 836 } 837 if (onp->gcode == 0) { 838 onp->gcode = osp->gcode; 839 } 840 if (onp->mgcode == 0) { 841 onp->mgcode = osp->mgcode; 842 } 843 if (StringHasNoText (onp->div)) { 844 onp->div = StringSave (osp->division); 845 } 846 if (StringHasNoText (onp->lineage)) { 847 onp->lineage = StringSave (osp->lineage); 848 } 849 if (! 
HasTaxon (orp)) { 850 db = ValNodeNew (NULL); 851 if (db != NULL) { 852 dbt = DbtagNew (); 853 if (dbt != NULL) { 854 oip = ObjectIdNew (); 855 if (oip != NULL) { 856 oip->id = osp->taxID; 857 dbt->db = StringSave ("taxon"); 858 dbt->tag = oip; 859 db->data.ptrvalue = (Pointer) dbt; 860 orp->db = db; 861 } 862 } 863 } 864 } 865 } 866 } 867 } 868 869 static BioseqPtr GetBioseqReferencedByAnnot ( 870 SeqAnnotPtr sap, 871 Uint2 entityID 872 ) 873 874 { 875 SeqAlignPtr align; 876 BioseqPtr bsp; 877 DenseDiagPtr ddp; 878 DenseSegPtr dsp; 879 SeqFeatPtr feat; 880 SeqGraphPtr graph; 881 SeqIdPtr sip; 882 SeqLocPtr slp; 883 StdSegPtr ssp; 884 SeqLocPtr tloc; 885 886 if (sap == NULL) return NULL; 887 switch (sap->type) { 888 case 1 : 889 feat = (SeqFeatPtr) sap->data; 890 while (feat != NULL) { 891 slp = feat->location; 892 if (slp != NULL) { 893 bsp = BioseqFindFromSeqLoc (slp); 894 if (bsp != NULL) return bsp; 895 } 896 feat = feat->next; 897 } 898 break; 899 case 2 : 900 align = (SeqAlignPtr) sap->data; 901 while (align != NULL) { 902 if (align->segtype == 1) { 903 ddp = (DenseDiagPtr) align->segs; 904 if (ddp != NULL) { 905 for (sip = ddp->id; sip != NULL; sip = sip->next) { 906 bsp = BioseqFind (sip); 907 if (bsp != NULL) return bsp; 908 } 909 } 910 } else if (align->segtype == 2) { 911 dsp = (DenseSegPtr) align->segs; 912 if (dsp != NULL) { 913 for (sip = dsp->ids; sip != NULL; sip = sip->next) { 914 bsp = BioseqFind (sip); 915 if (bsp != NULL) return bsp; 916 } 917 } 918 } else if (align->segtype == 3) { 919 ssp = (StdSegPtr) align->segs; 920 if (ssp != NULL && ssp->loc != NULL) { 921 for (tloc = ssp->loc; tloc != NULL; tloc = tloc->next) { 922 bsp = BioseqFindFromSeqLoc (tloc); 923 if (bsp != NULL) return bsp; 924 } 925 } 926 } 927 align = align->next; 928 } 929 break; 930 case 3 : 931 graph = (SeqGraphPtr) sap->data; 932 while (graph != NULL) { 933 slp = graph->loc; 934 if (slp != NULL) { 935 bsp = BioseqFindFromSeqLoc (slp); 936 if (bsp != NULL) return bsp; 937 } 
938 graph = graph->next; 939 } 940 break; 941 default : 942 break; 943 } 944 return NULL; 945 } 946 947 static Int2 GetGenCodeForBsp ( 948 BioseqPtr bsp 949 ) 950 951 { 952 BioSourcePtr biop; 953 Boolean mito; 954 OrgNamePtr onp; 955 OrgRefPtr orp; 956 SeqDescrPtr sdp; 957 958 sdp = GetNextDescriptorUnindexed (bsp, Seq_descr_source, NULL); 959 if (sdp == NULL) return 1; 960 biop = (BioSourcePtr) sdp->data.ptrvalue; 961 if (biop == NULL) return 1; 962 orp = biop->org; 963 if (orp == NULL) return 1; 964 onp = orp->orgname; 965 if (onp == NULL) return 1; 966 mito = (Boolean) (biop->genome == 4 || biop->genome == 5); 967 if (mito) { 968 if (onp->mgcode == 0) { 969 return 1; 970 } 971 return onp->mgcode; 972 } 973 if (onp->gcode == 0) { 974 return 1; 975 } 976 return onp->gcode; 977 } 978 979 typedef struct gcmdata { 980 SeqFeatPtr gene; 981 SeqFeatPtr feat; 982 CharPtr label; 983 } GmcData, PNTR GmcDataPtr; 984 985 static int LIBCALLBACK SortByGenePtr ( 986 VoidPtr vp1, 987 VoidPtr vp2 988 ) 989 990 { 991 GmcDataPtr gdp1, gdp2; 992 993 if (vp1 == NULL || vp2 == NULL) return 0; 994 gdp1 = (GmcDataPtr) vp1; 995 gdp2 = (GmcDataPtr) vp2; 996 if (gdp1 == NULL || gdp2 == NULL) return 0; 997 998 if (gdp1->gene > gdp2->gene) return -1; 999 if (gdp1->gene < gdp2->gene) return 1; 1000 1001 if (gdp1->feat > gdp2->feat) return -1; 1002 if (gdp1->feat < gdp2->feat) return 1; 1003 1004 return 0; 1005 } 1006 1007 static void PrintOneGeneLine ( 1008 SeqFeatPtr gene, 1009 SeqFeatPtr cds, 1010 SeqFeatPtr rna, 1011 CharPtr cdslabel, 1012 CharPtr rnalabel, 1013 FILE *fp 1014 ) 1015 1016 { 1017 BioseqPtr bsp; 1018 ValNodePtr db, old_locus_tag, vnp; 1019 DbtagPtr dbt; 1020 CharPtr desc, locus, locus_tag, cdslcl, cdsaccn, cdsgnl, 1021 rnaaccn, rnagnl, fbgn, gene_type, rna_type, prefix; 1022 GBQualPtr gbq; 1023 GeneRefPtr grp; 1024 ObjectIdPtr oip; 1025 SeqIdPtr sip; 1026 CharPtr str; 1027 TextSeqIdPtr tsip; 1028 1029 if (fp == NULL) return; 1030 1031 locus = NULL; 1032 desc = NULL; 1033 
locus_tag = NULL; 1034 old_locus_tag = NULL; 1035 1036 cdslcl = NULL; 1037 cdsaccn = NULL; 1038 cdsgnl = NULL; 1039 rnaaccn = NULL; 1040 rnagnl = NULL; 1041 1042 db = NULL; 1043 fbgn = NULL; 1044 1045 gene_type = NULL; 1046 rna_type = NULL; 1047 1048 if (gene != NULL) { 1049 gene_type = "gene"; 1050 if (gene->pseudo) { 1051 gene_type = "pseudogene"; 1052 } 1053 grp = (GeneRefPtr) gene->data.value.ptrvalue; 1054 if (grp != NULL) { 1055 if (grp->pseudo) { 1056 gene_type = "pseudogene"; 1057 } 1058 locus = grp->locus; 1059 desc = grp->desc; 1060 locus_tag = grp->locus_tag; 1061 db = grp->db; 1062 } 1063 if (db == NULL) { 1064 db = gene->dbxref; 1065 } 1066 for (gbq = gene->qual; gbq != NULL; gbq = gbq->next) { 1067 if (StringICmp (gbq->qual, "old_locus_tag") != 0) continue; 1068 if (StringHasNoText (gbq->val)) continue; 1069 ValNodeCopyStr(&old_locus_tag, 0, gbq->val); 1070 } 1071 for (vnp = db; vnp != NULL; vnp = vnp->next) { 1072 dbt = (DbtagPtr) vnp->data.ptrvalue; 1073 if (dbt == NULL) continue; 1074 if (StringICmp (dbt->db, "FLYBASE") != 0) continue; 1075 oip = dbt->tag; 1076 if (oip == NULL) continue; 1077 fbgn = oip->str; 1078 } 1079 } 1080 1081 if (cds != NULL) { 1082 if (cds->product != NULL) { 1083 bsp = BioseqFindFromSeqLoc (cds->product); 1084 if (bsp != NULL) { 1085 for (sip = bsp->id; sip != NULL; sip = sip->next) { 1086 switch (sip->choice) { 1087 case SEQID_LOCAL : 1088 oip = (ObjectIdPtr) sip->data.ptrvalue; 1089 if (oip == NULL) continue; 1090 cdslcl = oip->str; 1091 break; 1092 case SEQID_GENBANK : 1093 case SEQID_TPG : 1094 tsip = (TextSeqIdPtr) sip->data.ptrvalue; 1095 if (tsip == NULL) continue; 1096 cdsaccn = tsip->accession; 1097 break; 1098 case SEQID_GENERAL : 1099 dbt = (DbtagPtr) sip->data.ptrvalue; 1100 if (dbt == NULL) continue; 1101 if (IsSkippableDbtag (dbt)) continue; 1102 oip = dbt->tag; 1103 if (oip == NULL) continue; 1104 cdsgnl = oip->str; 1105 break; 1106 default : 1107 break; 1108 } 1109 } 1110 } 1111 } 1112 } 1113 1114 if (rna 
!= NULL) { 1115 switch (rna->idx.subtype) { 1116 case FEATDEF_preRNA : 1117 rna_type = "precursor RNA"; 1118 break; 1119 case FEATDEF_mRNA : 1120 rna_type = "mRNA"; 1121 break; 1122 case FEATDEF_tRNA : 1123 rna_type = "tRNA"; 1124 break; 1125 case FEATDEF_rRNA : 1126 rna_type = "rRNA"; 1127 break; 1128 case FEATDEF_otherRNA : 1129 rna_type = "misc RNA"; 1130 break; 1131 case FEATDEF_ncRNA : 1132 rna_type = "ncRNA"; 1133 for (gbq = rna->qual; gbq != NULL; gbq = gbq->next) { 1134 if (StringICmp (gbq->qual, "ncRNA_class") != 0) continue; 1135 if (StringDoesHaveText (gbq->val)) { 1136 rna_type = gbq->val; 1137 } 1138 } 1139 break; 1140 case FEATDEF_tmRNA : 1141 rna_type = "tmRNA"; 1142 break; 1143 default : 1144 break; 1145 } 1146 if (rna->pseudo) { 1147 rna_type = "pseudo RNA"; 1148 } 1149 if (rna->product != NULL) { 1150 bsp = BioseqFindFromSeqLoc (rna->product); 1151 if (bsp != NULL) { 1152 for (sip = bsp->id; sip != NULL; sip = sip->next) { 1153 switch (sip->choice) { 1154 case SEQID_GENBANK : 1155 case SEQID_TPG : 1156 tsip = (TextSeqIdPtr) sip->data.ptrvalue; 1157 if (tsip == NULL) continue; 1158 rnaaccn = tsip->accession; 1159 break; 1160 case SEQID_GENERAL : 1161 dbt = (DbtagPtr) sip->data.ptrvalue; 1162 if (dbt == NULL) continue; 1163 if (IsSkippableDbtag (dbt)) continue; 1164 oip = dbt->tag; 1165 if (oip == NULL) continue; 1166 rnagnl = oip->str; 1167 break; 1168 default : 1169 break; 1170 } 1171 } 1172 } 1173 } 1174 } 1175 1176 if (StringDoesHaveText (locus_tag)) { 1177 fprintf (fp, "%s", locus_tag); 1178 } else { 1179 fprintf (fp, "null_gene_ltag"); 1180 } 1181 1182 fprintf (fp, "\t"); 1183 if (StringDoesHaveText (locus)) { 1184 fprintf (fp, "%s", locus); 1185 } else { 1186 fprintf (fp, "null_gene_locus"); 1187 } 1188 1189 fprintf (fp, "\t"); 1190 if (StringDoesHaveText (desc)) { 1191 fprintf (fp, "%s", desc); 1192 } else { 1193 fprintf (fp, "null_gene_desc"); 1194 } 1195 1196 fprintf (fp, "\t"); 1197 if (StringDoesHaveText (fbgn)) { 1198 fprintf (fp, "%s", 
fbgn); 1199 } else { 1200 fprintf (fp, "null_fbgn"); 1201 } 1202 1203 fprintf (fp, "\t"); 1204 if (old_locus_tag != NULL) { 1205 prefix = ""; 1206 for (vnp = old_locus_tag; vnp != NULL; vnp = vnp->next) { 1207 str = (CharPtr) vnp->data.ptrvalue; 1208 if (StringHasNoText (str)) continue; 1209 fprintf (fp, "%s%s", prefix, str); 1210 prefix = ","; 1211 } 1212 } else { 1213 fprintf (fp, "null_old_ltag"); 1214 } 1215 1216 fprintf (fp, "\t"); 1217 if (StringDoesHaveText (cdslcl)) { 1218 fprintf (fp, "%s", cdslcl); 1219 } else { 1220 fprintf (fp, "null_cds_lcl"); 1221 } 1222 1223 fprintf (fp, "\t"); 1224 if (StringDoesHaveText (cdsaccn)) { 1225 fprintf (fp, "%s", cdsaccn); 1226 } else { 1227 fprintf (fp, "null_cds_accn"); 1228 } 1229 1230 fprintf (fp, "\t"); 1231 if (StringDoesHaveText (cdsgnl)) { 1232 fprintf (fp, "%s", cdsgnl); 1233 } else { 1234 fprintf (fp, "null_cds_gnl"); 1235 } 1236 1237 fprintf (fp, "\t"); 1238 if (StringDoesHaveText (rnaaccn)) { 1239 fprintf (fp, "%s", rnaaccn); 1240 } else { 1241 fprintf (fp, "null_rna_accn"); 1242 } 1243 1244 fprintf (fp, "\t"); 1245 if (StringDoesHaveText (rnagnl)) { 1246 fprintf (fp, "%s", rnagnl); 1247 } else { 1248 fprintf (fp, "null_rna_gnl"); 1249 } 1250 1251 fprintf (fp, "\t"); 1252 if (StringDoesHaveText (cdslabel)) { 1253 fprintf (fp, "%s", cdslabel); 1254 } else { 1255 fprintf (fp, "null_cds_product"); 1256 } 1257 1258 fprintf (fp, "\t"); 1259 if (StringDoesHaveText (rnalabel)) { 1260 fprintf (fp, "%s", rnalabel); 1261 } else { 1262 fprintf (fp, "null_rna_product"); 1263 } 1264 1265 fprintf (fp, "\t"); 1266 if (StringDoesHaveText (gene_type)) { 1267 fprintf (fp, "%s", gene_type); 1268 } else { 1269 fprintf (fp, "null_gene_type"); 1270 } 1271 1272 fprintf (fp, "\t"); 1273 if (StringDoesHaveText (rna_type)) { 1274 fprintf (fp, "%s", rna_type); 1275 } else { 1276 fprintf (fp, "null_rna_type"); 1277 } 1278 1279 fprintf (fp, "\n"); 1280 } 1281 1282 static void GeneReportOneBsp ( 1283 BioseqPtr bsp, 1284 FILE *fp 1285 ) 
1286 1287 { 1288 CharPtr cdslabel, rnalabel; 1289 SeqMgrFeatContext fcontext; 1290 GmcDataPtr gdp, head; 1291 GeneRefPtr grp; 1292 Int2 i, j, k, numgene, numcds, numrna, total; 1293 SeqFeatPtr matchsfp, sfp, tmp; 1294 SeqFeatXrefPtr xref; 1295 1296 if (bsp == NULL || fp == NULL) return; 1297 1298 numgene = 0; 1299 numcds = 0; 1300 numrna = 0; 1301 1302 sfp = SeqMgrGetNextFeature (bsp, NULL, 0, 0, &fcontext); 1303 while (sfp != NULL) { 1304 switch (sfp->data.choice) { 1305 case SEQFEAT_GENE : 1306 numgene++; 1307 break; 1308 case SEQFEAT_CDREGION : 1309 numcds++; 1310 break; 1311 case SEQFEAT_RNA : 1312 numrna++; 1313 break; 1314 default : 1315 break; 1316 } 1317 sfp = SeqMgrGetNextFeature (bsp, sfp, 0, 0, &fcontext); 1318 } 1319 1320 if (numgene == 0) return; 1321 total = numgene + numcds + numrna; 1322 if (total == 0) return; 1323 1324 head = (GmcDataPtr) MemNew (sizeof (GmcData) * (size_t) (total + 1)); 1325 if (head == NULL) return; 1326 1327 gdp = head; 1328 total = 0; 1329 sfp = SeqMgrGetNextFeature (bsp, NULL, 0, 0, &fcontext); 1330 while (sfp != NULL) { 1331 if (sfp->data.choice == SEQFEAT_CDREGION || sfp->data.choice == SEQFEAT_RNA) { 1332 gdp->feat = sfp; 1333 gdp->label = fcontext.label; 1334 grp = SeqMgrGetGeneXref (sfp); 1335 if (grp == NULL) { 1336 gdp->gene = SeqMgrGetOverlappingGene (sfp->location, NULL); 1337 } else if (! 
SeqMgrGeneIsSuppressed (grp)) { 1338 if (StringDoesHaveText (grp->locus_tag)) { 1339 gdp->gene = SeqMgrGetGeneByLocusTag (bsp, grp->locus_tag, NULL); 1340 } else if (StringDoesHaveText (grp->locus)) { 1341 gdp->gene = SeqMgrGetFeatureByLabel (bsp, grp->locus, SEQFEAT_GENE, 0, NULL); 1342 } 1343 } 1344 gdp++; 1345 total++; 1346 } else if (sfp->data.choice == SEQFEAT_GENE) { 1347 gdp->gene = sfp; 1348 gdp++; 1349 total++; 1350 } 1351 sfp = SeqMgrGetNextFeature (bsp, sfp, 0, 0, &fcontext); 1352 } 1353 1354 HeapSort (head, (size_t) total, sizeof (GmcData), SortByGenePtr); 1355 1356 for (i = 0; i < total; i += j) { 1357 sfp = head [i].gene; 1358 if (sfp == NULL) continue; 1359 numcds = 0; 1360 numrna = 0; 1361 for (j = 0; i + j < total && sfp == head [i + j].gene; j++) { 1362 tmp = head [i + j].feat; 1363 if (tmp == NULL) continue; 1364 if (tmp->data.choice == SEQFEAT_CDREGION) { 1365 numcds++; 1366 } else if (tmp->data.choice == SEQFEAT_RNA) { 1367 numrna++; 1368 } 1369 } 1370 cdslabel = NULL; 1371 rnalabel = NULL; 1372 if (numcds > 0) { 1373 for (k = 0; k < j; k++) { 1374 tmp = head [i + k].feat; 1375 if (tmp == NULL) continue; 1376 if (tmp->data.choice != SEQFEAT_CDREGION) continue; 1377 cdslabel = head [i + k].label; 1378 matchsfp = NULL; 1379 for (xref = tmp->xref; xref != NULL && matchsfp == NULL; xref = xref->next) { 1380 if (xref->id.choice != 0) { 1381 matchsfp = SeqMgrGetFeatureByFeatID (tmp->idx.entityID, NULL, NULL, xref, &fcontext); 1382 rnalabel = fcontext.label; 1383 } 1384 } 1385 PrintOneGeneLine (sfp, tmp, matchsfp, cdslabel, rnalabel, fp); 1386 } 1387 } else if (numrna > 0) { 1388 for (k = 0; k < j; k++) { 1389 tmp = head [i + k].feat; 1390 if (tmp == NULL) continue; 1391 if (tmp->data.choice != SEQFEAT_RNA) continue; 1392 rnalabel = head [i + k].label; 1393 PrintOneGeneLine (sfp, NULL, tmp, NULL, rnalabel, fp); 1394 } 1395 } else { 1396 PrintOneGeneLine (sfp, NULL, NULL, NULL, NULL, fp); 1397 } 1398 } 1399 1400 MemFree (head); 1401 } 1402 1403 static 
void GeneReportGenomicBsp ( 1404 BioseqPtr bsp, 1405 Pointer userdata 1406 ) 1407 1408 { 1409 SeqMgrDescContext dcontext; 1410 MolInfoPtr mip; 1411 SeqDescrPtr sdp; 1412 1413 if (bsp == NULL) return; 1414 1415 if (ISA_aa (bsp->mol)) return; 1416 sdp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_molinfo, &dcontext); 1417 if (sdp == NULL) return; 1418 mip = (MolInfoPtr) sdp->data.ptrvalue; 1419 if (mip == NULL) return; 1420 if (mip->biomol != MOLECULE_TYPE_GENOMIC) return; 1421 1422 GeneReportOneBsp (bsp, (FILE *) userdata); 1423 } 1424 1425 static void GeneReportOneFile ( 1426 CharPtr results, 1427 CharPtr base, 1428 CharPtr suffix, 1429 SeqEntryPtr sep 1430 ) 1431 1432 { 1433 Char file [FILENAME_MAX], path [PATH_MAX]; 1434 FILE *fp; 1435 ErrSev oldErrSev; 1436 1437 StringNCpy_0 (path, results, sizeof (path)); 1438 sprintf (file, "%s%s", base, suffix); 1439 FileBuildPath (path, NULL, file); 1440 1441 fp = FileOpen (path, "w"); 1442 if (fp == NULL) return; 1443 1444 oldErrSev = ErrSetMessageLevel (SEV_MAX); 1445 VisitBioseqsInSep (sep, (Pointer) fp, GeneReportGenomicBsp); 1446 ErrSetMessageLevel (oldErrSev); 1447 1448 FileClose (fp); 1449 } 1450 1451 static void EnhanceOneCDS ( 1452 SeqFeatPtr sfp, 1453 Boolean alt_splice 1454 ) 1455 1456 { 1457 DbtagPtr dbt; 1458 GBQualPtr gbq; 1459 Char id [64]; 1460 SeqIdPtr ids, sip; 1461 size_t len; 1462 CharPtr name, nwstr, ptr, str; 1463 ObjectIdPtr oip; 1464 ProtRefPtr prp; 1465 Char tmp [256]; 1466 ValNodePtr vnp; 1467 SeqFeatXrefPtr xref; 1468 1469 if (sfp == NULL || sfp->data.choice != SEQFEAT_CDREGION) return; 1470 1471 name = NULL; 1472 vnp = NULL; 1473 prp = NULL; 1474 1475 for (xref = sfp->xref; xref != NULL; xref = xref->next) { 1476 if (xref->data.choice == SEQFEAT_PROT) { 1477 prp = (ProtRefPtr) xref->data.value.ptrvalue; 1478 } 1479 } 1480 1481 id [0] = '\0'; 1482 for (gbq = sfp->qual; gbq != NULL; gbq = gbq->next) { 1483 if (StringICmp (gbq->qual, "protein_id") == 0) { 1484 StringNCpy_0 (id, gbq->val, sizeof 
(id)); 1485 } 1486 } 1487 if (StringDoesHaveText (id) && StringChr (id, '|') != NULL) { 1488 str = NULL; 1489 ids = SeqIdParse (id); 1490 for (sip = ids; sip != NULL; sip = sip->next) { 1491 if (sip->choice != SEQID_GENERAL) continue; 1492 dbt = (DbtagPtr) sip->data.ptrvalue; 1493 if (dbt == NULL) continue; 1494 if (IsSkippableDbtag (dbt)) continue; 1495 oip = dbt->tag; 1496 if (oip == NULL) continue; 1497 str = oip->str; 1498 } 1499 1500 if (StringDoesHaveText (str)) { 1501 if (prp != NULL && prp->name != NULL) { 1502 vnp = prp->name; 1503 name = (CharPtr) vnp->data.ptrvalue; 1504 } 1505 if (StringDoesHaveText (name) && vnp != NULL) { 1506 if (alt_splice) { 1507 ptr = StringChr (str, '-'); 1508 if (ptr != NULL && StringLen (ptr) == 3) { 1509 ptr++; 1510 ptr++; 1511 sprintf (tmp, "%s, isoform %s", str, ptr); 1512 len = StringLen (name) + StringLen (", ") + StringLen (tmp); 1513 nwstr = (CharPtr) MemNew (sizeof (Char) * (len + 2)); 1514 if (nwstr != NULL) { 1515 StringCpy (nwstr, name); 1516 /* 1517 StringCat (nwstr, ", "); 1518 */ 1519 StringCat (nwstr, " "); 1520 StringCat (nwstr, tmp); 1521 vnp->data.ptrvalue = (Pointer) nwstr; 1522 MemFree (name); 1523 } 1524 } else { 1525 AddQualifierToFeature (sfp, "product", str); 1526 } 1527 } else { 1528 len = StringLen (name) + StringLen (", ") + StringLen (str); 1529 nwstr = (CharPtr) MemNew (sizeof (Char) * (len + 2)); 1530 if (nwstr != NULL) { 1531 StringCpy (nwstr, name); 1532 /* 1533 StringCat (nwstr, ", "); 1534 */ 1535 StringCat (nwstr, " "); 1536 StringCat (nwstr, str); 1537 vnp->data.ptrvalue = (Pointer) nwstr; 1538 MemFree (name); 1539 } 1540 } 1541 } else { 1542 if (alt_splice) { 1543 ptr = StringChr (str, '-'); 1544 if (ptr != NULL && StringLen (ptr) == 3) { 1545 ptr++; 1546 ptr++; 1547 sprintf (tmp, "%s, isoform %s", str, ptr); 1548 AddQualifierToFeature (sfp, "product", tmp); 1549 } else { 1550 AddQualifierToFeature (sfp, "product", str); 1551 } 1552 } else { 1553 AddQualifierToFeature (sfp, "product", str); 
1554 } 1555 } 1556 } 1557 1558 SeqIdSetFree (ids); 1559 } 1560 } 1561 1562 static void EnhanceOneRna ( 1563 SeqFeatPtr sfp, 1564 Boolean alt_splice 1565 ) 1566 1567 { 1568 DbtagPtr dbt; 1569 GBQualPtr gbq, nm_gbq; 1570 Char id [64]; 1571 SeqIdPtr ids, sip; 1572 size_t len; 1573 CharPtr name, nwstr, ptr, str; 1574 ObjectIdPtr oip; 1575 RnaRefPtr rrp; 1576 Char tmp [256]; 1577 1578 if (sfp == NULL || sfp->data.choice != SEQFEAT_RNA) return; 1579 1580 name = NULL; 1581 nm_gbq = NULL; 1582 1583 rrp = (RnaRefPtr) sfp->data.value.ptrvalue; 1584 if (rrp != NULL && rrp->ext.choice == 1) { 1585 switch (rrp->type) { 1586 case 1 : /* precurrsor_RNA */ 1587 case 2 : /* mRNA */ 1588 case 4 : /* rRNA */ 1589 name = rrp->ext.value.ptrvalue; 1590 break; 1591 case 255 : /* misc_RNA, ncRNA, tmRNA */ 1592 for (gbq = sfp->qual; gbq != NULL; gbq = gbq->next) { 1593 if (StringICmp (gbq->qual, "product") == 0) { 1594 nm_gbq = gbq; 1595 name = gbq->val; 1596 } 1597 } 1598 break; 1599 case 3: /* tRNA */ 1600 return; 1601 default : 1602 break; 1603 } 1604 } 1605 1606 id [0] = '\0'; 1607 for (gbq = sfp->qual; gbq != NULL; gbq = gbq->next) { 1608 if (StringICmp (gbq->qual, "transcript_id") == 0) { 1609 StringNCpy_0 (id, gbq->val, sizeof (id)); 1610 } 1611 } 1612 if (StringDoesHaveText (id) && StringChr (id, '|') != NULL) { 1613 str = NULL; 1614 ids = SeqIdParse (id); 1615 for (sip = ids; sip != NULL; sip = sip->next) { 1616 if (sip->choice != SEQID_GENERAL) continue; 1617 dbt = (DbtagPtr) sip->data.ptrvalue; 1618 if (dbt == NULL) continue; 1619 if (IsSkippableDbtag(dbt)) continue; 1620 oip = dbt->tag; 1621 if (oip == NULL) continue; 1622 str = oip->str; 1623 } 1624 1625 if (StringDoesHaveText (str)) { 1626 if (StringDoesHaveText (name) && StringCmp (str, name) != 0) { 1627 if (alt_splice) { 1628 ptr = StringChr (str, '-'); 1629 if (ptr != NULL && StringLen (ptr) == 3) { 1630 ptr++; 1631 ptr++; 1632 sprintf (tmp, "%s, transcript variant %s", str, ptr); 1633 len = StringLen (name) + StringLen 
(", ") + StringLen (tmp); 1634 nwstr = (CharPtr) MemNew (sizeof (Char) * (len + 2)); 1635 if (nwstr != NULL) { 1636 StringCpy (nwstr, name); 1637 /* 1638 StringCat (nwstr, ", "); 1639 */ 1640 StringCat (nwstr, " "); 1641 StringCat (nwstr, tmp); 1642 if (nm_gbq != NULL) { 1643 nm_gbq->val = (Pointer) nwstr; 1644 } else { 1645 rrp->ext.value.ptrvalue = (Pointer) nwstr; 1646 } 1647 MemFree (name); 1648 } 1649 } else { 1650 AddQualifierToFeature (sfp, "product", str); 1651 } 1652 } else { 1653 len = StringLen (name) + StringLen (", ") + StringLen (str); 1654 nwstr = (CharPtr) MemNew (sizeof (Char) * (len + 2)); 1655 if (nwstr != NULL) { 1656 StringCpy (nwstr, name); 1657 /* 1658 StringCat (nwstr, ", "); 1659 */ 1660 StringCat (nwstr, " "); 1661 StringCat (nwstr, str); 1662 if (nm_gbq != NULL) { 1663 nm_gbq->val = (Pointer) nwstr; 1664 } else { 1665 rrp->ext.value.ptrvalue = (Pointer) nwstr; 1666 } 1667 MemFree (name); 1668 } 1669 } 1670 } else { 1671 if (alt_splice) { 1672 ptr = StringChr (str, '-'); 1673 if (ptr != NULL && StringLen (ptr) == 3) { 1674 ptr++; 1675 ptr++; 1676 sprintf (tmp, "%s, transcript variant %s", str, ptr); 1677 AddQualifierToFeature (sfp, "product", tmp); 1678 } else { 1679 AddQualifierToFeature (sfp, "product", str); 1680 } 1681 } else { 1682 AddQualifierToFeature (sfp, "product", str); 1683 } 1684 } 1685 } 1686 1687 SeqIdSetFree (ids); 1688 } 1689 } 1690 1691 static void EnhanceFeatureAnnotation ( 1692 SeqFeatPtr features, 1693 BioseqPtr bsp 1694 ) 1695 1696 { 1697 GmcDataPtr gdp, head; 1698 GeneRefPtr grp; 1699 Int2 i, j, k, numgene, numcds, numrna; 1700 SeqFeatPtr sfp; 1701 1702 if (features == NULL || bsp == NULL) return; 1703 1704 numgene = 0; 1705 numcds = 0; 1706 numrna = 0; 1707 1708 for (sfp = features; sfp != NULL; sfp = sfp->next) { 1709 switch (sfp->data.choice) { 1710 case SEQFEAT_GENE : 1711 numgene++; 1712 break; 1713 case SEQFEAT_CDREGION : 1714 numcds++; 1715 break; 1716 case SEQFEAT_RNA : 1717 numrna++; 1718 break; 1719 default 
: 1720 break; 1721 } 1722 } 1723 1724 if (numgene == 0) return; 1725 1726 if (numcds > 0) { 1727 head = (GmcDataPtr) MemNew (sizeof (GmcData) * (size_t) (numcds + 1)); 1728 if (head != NULL) { 1729 gdp = head; 1730 for (sfp = features; sfp != NULL; sfp = sfp->next) { 1731 if (sfp->idx.subtype == FEATDEF_CDS) { 1732 gdp->feat = sfp; 1733 grp = SeqMgrGetGeneXref (sfp); 1734 if (grp == NULL || (! SeqMgrGeneIsSuppressed (grp))) { 1735 gdp->gene = SeqMgrGetOverlappingGene (sfp->location, NULL); 1736 } 1737 gdp++; 1738 } 1739 } 1740 HeapSort (head, (size_t) numcds, sizeof (GmcData), SortByGenePtr); 1741 for (i = 0; i < numcds; i += j) { 1742 sfp = head [i].gene; 1743 for (j = 1; i + j < numcds && sfp == head [i + j].gene; j++) continue; 1744 if (j == 1) { 1745 /* no alt splicing */ 1746 EnhanceOneCDS (head [i].feat, FALSE); 1747 } else { 1748 /* is alt splicing */ 1749 for (k = 0; k < j; k++) { 1750 EnhanceOneCDS (head [i + k].feat, TRUE); 1751 } 1752 } 1753 } 1754 } 1755 MemFree (head); 1756 } 1757 1758 if (numrna > 0) { 1759 head = (GmcDataPtr) MemNew (sizeof (GmcData) * (size_t) (numrna + 1)); 1760 if (head != NULL) { 1761 gdp = head; 1762 for (sfp = features; sfp != NULL; sfp = sfp->next) { 1763 if (sfp->data.choice == SEQFEAT_RNA) { 1764 gdp->feat = sfp; 1765 grp = SeqMgrGetGeneXref (sfp); 1766 if (grp == NULL || (! 
SeqMgrGeneIsSuppressed (grp))) { 1767 gdp->gene = SeqMgrGetOverlappingGene (sfp->location, NULL); 1768 } 1769 gdp++; 1770 } 1771 } 1772 HeapSort (head, (size_t) numrna, sizeof (GmcData), SortByGenePtr); 1773 for (i = 0; i < numrna; i += j) { 1774 sfp = head [i].gene; 1775 for (j = 1; i + j < numrna && sfp == head [i + j].gene; j++) continue; 1776 if (j == 1) { 1777 /* no alt splicing */ 1778 EnhanceOneRna (head [i].feat, FALSE); 1779 } else { 1780 /* is alt splicing */ 1781 for (k = 0; k < j; k++) { 1782 EnhanceOneRna (head [i + k].feat, TRUE); 1783 } 1784 } 1785 } 1786 } 1787 MemFree (head); 1788 } 1789 } 1790 1791 static BioseqPtr AttachSeqAnnotEntity ( 1792 Uint2 entityID, 1793 SeqAnnotPtr sap, 1794 TblArgsPtr tbl 1795 ) 1796 1797 { 1798 SeqAnnotPtr anp; 1799 BioseqPtr bsp; 1800 Char buf [80]; 1801 Int2 genCode; 1802 SeqEntryPtr oldscope; 1803 SeqEntryPtr sep; 1804 SeqFeatPtr sfp = NULL; 1805 SeqIdPtr sip; 1806 SeqLocPtr slp; 1807 1808 if (sap == NULL || tbl == NULL) return NULL; 1809 1810 bsp = GetBioseqReferencedByAnnot (sap, entityID); 1811 if (bsp == NULL) { 1812 oldscope = SeqEntrySetScope (NULL); 1813 if (oldscope != NULL) { 1814 bsp = GetBioseqReferencedByAnnot (sap, entityID); 1815 SeqEntrySetScope (oldscope); 1816 } 1817 } 1818 if (bsp != NULL) { 1819 sep = SeqMgrGetSeqEntryForData (bsp); 1820 entityID = ObjMgrGetEntityIDForChoice (sep); 1821 if (sap->type == 1) { 1822 sfp = (SeqFeatPtr) sap->data; 1823 genCode = GetGenCodeForBsp (bsp); 1824 SetEmptyGeneticCodes (sap, genCode); 1825 } 1826 if (bsp->annot == NULL) { 1827 bsp->annot = sap; 1828 } else { 1829 anp = bsp->annot; 1830 while (anp->next != NULL) { 1831 anp = anp->next; 1832 } 1833 anp->next = sap; 1834 } 1835 if (sfp != NULL) { 1836 if (tbl->smartfeats) { 1837 1838 /* indexing needed to find mRNA and CDS within each gene */ 1839 1840 SeqMgrIndexFeatures (entityID, NULL); 1841 1842 EnhanceFeatureAnnotation (sfp, bsp); 1843 } 1844 1845 PromoteXrefsExEx (sfp, bsp, entityID, TRUE, FALSE, 
tbl->genprodset, tbl->forcelocalid); 1846 sep = GetTopSeqEntryForEntityID (entityID); 1847 } 1848 } else { 1849 buf [0] = '\0'; 1850 if (sap->type == 1) { 1851 sfp = (SeqFeatPtr) sap->data; 1852 if (sfp != NULL && sfp->location != NULL) { 1853 slp = SeqLocFindNext (sfp->location, NULL); 1854 if (slp != NULL) { 1855 sip = SeqLocId (slp); 1856 if (sip != NULL) { 1857 SeqIdWrite (sip, buf, PRINTID_FASTA_LONG, sizeof (buf) - 1); 1858 } 1859 } 1860 } 1861 } 1862 Message (MSG_POSTERR, "Feature table identifiers %s do not match record", buf); 1863 } 1864 sep = GetTopSeqEntryForEntityID (entityID); 1865 return bsp; 1866 } 1867 1868 static CharPtr TrimBracketsFromString ( 1869 CharPtr str, 1870 SqnTagPtr stp 1871 ) 1872 1873 { 1874 Uchar ch; /* to use 8bit characters in multibyte languages */ 1875 Int2 count; 1876 CharPtr dst; 1877 CharPtr ptr; 1878 1879 if (StringHasNoText (str) || stp == NULL) return str; 1880 1881 /* remove bracketed fields */ 1882 1883 count = 0; 1884 dst = str; 1885 ptr = str; 1886 ch = *ptr; 1887 while (ch != '\0') { 1888 if (ch == '[') { 1889 if (count < stp->num_tags && (! 
stp->used [count])) { 1890 *dst = ch; 1891 dst++; 1892 ptr++; 1893 ch = *ptr; 1894 while (ch != '\0' && ch != ']') { 1895 *dst = ch; 1896 dst++; 1897 ptr++; 1898 ch = *ptr; 1899 } 1900 *dst = ch; 1901 dst++; 1902 ptr++; 1903 ch = *ptr; 1904 } else { 1905 ptr++; 1906 ch = *ptr; 1907 while (ch != '\0' && ch != ']' && ch != '"') { 1908 ptr++; 1909 ch = *ptr; 1910 } 1911 if (ch == '"') { 1912 ptr++; 1913 ch = *ptr; 1914 while (ch != '\0' && ch != '"') { 1915 ptr++; 1916 ch = *ptr; 1917 } 1918 } 1919 while (ch != '\0' && ch != ']') { 1920 ptr++; 1921 ch = *ptr; 1922 } 1923 ptr++; 1924 ch = *ptr; 1925 } 1926 count++; 1927 } else { 1928 *dst = ch; 1929 dst++; 1930 ptr++; 1931 ch = *ptr; 1932 } 1933 } 1934 *dst = '\0'; 1935 1936 /* remove runs of whitespace characters */ 1937 1938 dst = str; 1939 ptr = str; 1940 ch = *ptr; 1941 while (ch != '\0') { 1942 if (IS_WHITESP (ch)) { 1943 *dst = ch; 1944 dst++; 1945 ptr++; 1946 ch = *ptr; 1947 while (IS_WHITESP (ch)) { 1948 ptr++; 1949 ch = *ptr; 1950 } 1951 } else { 1952 *dst = ch; 1953 dst++; 1954 ptr++; 1955 ch = *ptr; 1956 } 1957 } 1958 *dst = '\0'; 1959 1960 return str; 1961 } 1962 1963 static Boolean HasTpaAccession ( 1964 UserObjectPtr uop 1965 ) 1966 1967 { 1968 UserFieldPtr curr; 1969 ObjectIdPtr oip; 1970 CharPtr str; 1971 UserFieldPtr ufp; 1972 1973 if (uop == NULL) return FALSE; 1974 if ((oip = uop->type) == NULL) return FALSE; 1975 if (StringCmp (oip->str, "TpaAssembly") != 0) return FALSE; 1976 1977 for (curr = uop->data; curr != NULL; curr = curr->next) { 1978 if (curr->choice != 11) continue; 1979 for (ufp = curr->data.ptrvalue; ufp != NULL; ufp = ufp->next) { 1980 if (ufp->choice != 1) continue; 1981 oip = ufp->label; 1982 if (oip == NULL || StringICmp (oip->str, "accession") != 0) continue; 1983 str = (CharPtr) ufp->data.ptrvalue; 1984 if (StringDoesHaveText (str)) return TRUE; 1985 } 1986 } 1987 1988 return FALSE; 1989 } 1990 1991 static Boolean HasGenomeProjectDB ( 1992 UserObjectPtr uop 1993 ) 1994 1995 { 1996 
UserFieldPtr curr; 1997 ObjectIdPtr oip; 1998 Int4 val; 1999 2000 if (uop == NULL) return FALSE; 2001 if ((oip = uop->type) == NULL) return FALSE; 2002 if (StringCmp (oip->str, "GenomeProjectsDB") != 0) return FALSE; 2003 2004 for (curr = uop->data; curr != NULL; curr = curr->next) { 2005 oip = curr->label; 2006 if (oip == NULL || StringICmp (oip->str, "ProjectID") != 0) continue; 2007 if (curr->choice != 2) continue; 2008 val = (Int4) curr->data.intvalue; 2009 if (val > 0) return TRUE; 2010 } 2011 2012 return FALSE; 2013 } 2014 2015 static void GetFirstBiop ( 2016 BioSourcePtr biop, 2017 Pointer userdata 2018 ) 2019 2020 { 2021 BioSourcePtr PNTR biopp; 2022 2023 biopp = (BioSourcePtr PNTR) userdata; 2024 if (biop == NULL || biopp == NULL) return; 2025 if (*biopp != NULL) return; 2026 *biopp = biop; 2027 } 2028 2029 static void ProcessOneNuc ( 2030 Uint2 entityID, 2031 BioseqPtr bsp, 2032 BioSourcePtr src, 2033 TblArgsPtr tbl, 2034 MolInfoPtr template_molinfo 2035 ) 2036 2037 { 2038 Boolean addNewBiop = TRUE; 2039 Boolean addNewMip = TRUE; 2040 BioSourcePtr biop = NULL; 2041 SeqFeatPtr cds; 2042 GBBlockPtr gbp; 2043 Int2 genCode; 2044 size_t len; 2045 MolInfoPtr mip = NULL; 2046 Boolean mito; 2047 OrgNamePtr onp; 2048 OrgRefPtr orp; 2049 SeqDescrPtr sdp; 2050 SeqHistPtr shp; 2051 SqnTagPtr stp = NULL; 2052 CharPtr str; 2053 CharPtr tmp; 2054 CharPtr ttl = NULL; 2055 UserObjectPtr uop; 2056 ValNodePtr vnp; 2057 SeqMgrDescContext dcontext; 2058 2059 if (bsp == NULL) return; 2060 2061 genCode = GetGenCodeForBsp (bsp); 2062 2063 if (bsp->mol == Seq_mol_na) { 2064 bsp->mol = Seq_mol_dna; 2065 } 2066 2067 if (src != NULL) { 2068 src = AsnIoMemCopy ((Pointer) src, 2069 (AsnReadFunc) BioSourceAsnRead, 2070 (AsnWriteFunc) BioSourceAsnWrite); 2071 } else { 2072 sdp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_source, &dcontext); 2073 if (sdp != NULL) { 2074 src = sdp->data.ptrvalue; 2075 if (src != NULL) { 2076 addNewBiop = FALSE; 2077 } 2078 } 2079 } 2080 2081 vnp = 
ValNodeExtract (&(bsp->descr), Seq_descr_title); 2082 if (vnp != NULL) { 2083 ttl = (CharPtr) vnp->data.ptrvalue; 2084 } 2085 2086 if (ttl != NULL || tbl->srcquals != NULL) { 2087 len = StringLen (ttl) + StringLen (tbl->srcquals) + 5; 2088 str = (CharPtr) MemNew (len * sizeof (Char)); 2089 if (str != NULL) { 2090 StringCpy (str, ttl); 2091 if (ttl != NULL && tbl->srcquals != NULL) { 2092 StringCat (str, "; "); 2093 } 2094 StringCat (str, tbl->srcquals); 2095 stp = SqnTagParse (str); 2096 } 2097 MemFree (str); 2098 } 2099 2100 if (stp != NULL) { 2101 biop = ParseTitleIntoBioSource (stp, tbl->organism, src); 2102 ParseTitleIntoBioseq (stp, bsp); 2103 str = SqnTagFind (stp, "comment"); 2104 if (str != NULL) { 2105 tmp = StringSave (str); 2106 SeqDescrAddPointer (&(bsp->descr), Seq_descr_comment, (Pointer) tmp); 2107 } 2108 } 2109 if (biop == NULL) { 2110 biop = ParseTitleIntoBioSource (NULL, tbl->organism, src); 2111 } 2112 if (biop != NULL && addNewBiop) { 2113 SeqDescrAddPointer (&(bsp->descr), Seq_descr_source, (Pointer) biop); 2114 } 2115 if (biop != NULL) { 2116 AddMissingSourceInfo (biop); 2117 } 2118 2119 sdp = BioseqGetSeqDescr (bsp, Seq_descr_molinfo, NULL); 2120 if (sdp != NULL && sdp->choice == Seq_descr_molinfo) { 2121 mip = (MolInfoPtr) sdp->data.ptrvalue; 2122 addNewMip = FALSE; 2123 } else { 2124 mip = MolInfoNew (); 2125 } 2126 if (mip != NULL) { 2127 if (stp != NULL) { 2128 mip = ParseTitleIntoMolInfo (stp, mip); 2129 } 2130 if (mip->biomol == 0 && template_molinfo != NULL) 2131 { 2132 mip->biomol = template_molinfo->biomol; 2133 } 2134 if (mip->biomol == 0) { 2135 mip->biomol = MOLECULE_TYPE_GENOMIC; 2136 } 2137 if (addNewMip) { 2138 SeqDescrAddPointer (&(bsp->descr), Seq_descr_molinfo, (Pointer) mip); 2139 } 2140 switch (mip->biomol) { 2141 case MOLECULE_TYPE_PRE_MRNA : 2142 case MOLECULE_TYPE_MRNA : 2143 case MOLECULE_TYPE_RRNA : 2144 case MOLECULE_TYPE_TRNA : 2145 case MOLECULE_TYPE_SNRNA : 2146 case MOLECULE_TYPE_SCRNA : 2147 case 
MOLECULE_TYPE_CRNA : 2148 case MOLECULE_TYPE_SNORNA : 2149 case MOLECULE_TYPE_TRANSCRIBED_RNA : 2150 case MOLECULE_TYPE_NCRNA : 2151 case MOLECULE_TYPE_TMRNA : 2152 if (bsp->mol == Seq_mol_dna) { 2153 str = SqnTagFind (stp, "molecule"); 2154 if (str == NULL) { 2155 str = SqnTagFind (stp, "mol"); 2156 } 2157 if (str != NULL) { 2158 if (StringICmp (str, "dna") == 0) break; 2159 } 2160 bsp->mol = Seq_mol_rna; 2161 } 2162 break; 2163 default : 2164 break; 2165 } 2166 } 2167 2168 if (genCode == 0 && biop != NULL) { 2169 orp = biop->org; 2170 if (orp != NULL) { 2171 onp = orp->orgname; 2172 if (onp != NULL) { 2173 mito = (Boolean) (biop->genome == 4 || biop->genome == 5); 2174 if (mito) { 2175 genCode = onp->mgcode; 2176 } else { 2177 genCode = onp->gcode; 2178 } 2179 } 2180 } 2181 } 2182 2183 if (StringDoesHaveText (tbl->comment)) { 2184 str = StringSave (tbl->comment); 2185 SeqDescrAddPointer (&(bsp->descr), Seq_descr_comment, (Pointer) str); 2186 } 2187 if (StringDoesHaveText (tbl->commentFile)) { 2188 str = StringSave (tbl->commentFile); 2189 SeqDescrAddPointer (&(bsp->descr), Seq_descr_comment, (Pointer) str); 2190 } 2191 2192 if (stp != NULL) { 2193 gbp = ParseTitleIntoGenBank (stp, NULL); 2194 if (gbp != NULL && (gbp->extra_accessions != NULL || gbp->keywords != NULL)) { 2195 SeqDescrAddPointer (&(bsp->descr), Seq_descr_genbank, (Pointer) gbp); 2196 } else { 2197 gbp = GBBlockFree (gbp); 2198 } 2199 2200 shp = ParseTitleIntoSeqHist (stp, NULL); 2201 if (shp != NULL && shp->replace_ids != NULL) { 2202 bsp->hist = SeqHistFree (bsp->hist); 2203 bsp->hist = shp; 2204 } else { 2205 shp = SeqHistFree (shp); 2206 } 2207 } 2208 2209 if (stp != NULL) { 2210 uop = ParseTitleIntoTpaAssembly (stp, NULL); 2211 if (uop != NULL && HasTpaAccession (uop)) { 2212 SeqDescrAddPointer (&(bsp->descr), Seq_descr_user, (Pointer) uop); 2213 } else { 2214 uop = UserObjectFree (uop); 2215 } 2216 } 2217 2218 if (stp != NULL) { 2219 uop = ParseTitleIntoGenomeProjectsDB (stp, NULL); 2220 if 
(uop != NULL && HasGenomeProjectDB (uop)) { 2221 SeqDescrAddPointer (&(bsp->descr), Seq_descr_user, (Pointer) uop); 2222 } else { 2223 uop = UserObjectFree (uop); 2224 } 2225 } 2226 2227 /* look for pubmed IDs */ 2228 if (stp != NULL) { 2229 AddPubsFromTitle (stp, &(bsp->descr)); 2230 } 2231 2232 if (tbl->findorf) { 2233 cds = AnnotateBestOrf (bsp, genCode, tbl->altstart, tbl->runonorf, stp); 2234 if (cds != NULL) { 2235 PromoteXrefsExEx (cds, bsp, entityID, TRUE, FALSE, FALSE, tbl->forcelocalid); 2236 } 2237 } 2238 2239 TrimBracketsFromString (ttl, stp); 2240 if (StringDoesHaveText (ttl)) { 2241 str = StringSave (ttl); 2242 SeqDescrAddPointer (&(bsp->descr), Seq_descr_title, (Pointer) str); 2243 } 2244 2245 if (stp != NULL) { 2246 SqnTagFree (stp); 2247 } 2248 2249 ValNodeFreeData (vnp); 2250 } 2251 2252 static void ProcessNucBioseqs (SeqEntryPtr top_sep, Uint2 entityID, BioSourcePtr src, TblArgsPtr tbl, MolInfoPtr template_molinfo) 2253 { 2254 BioseqPtr bsp; 2255 BioseqSetPtr bssp; 2256 SeqEntryPtr sep; 2257 2258 if (top_sep == NULL || top_sep->data.ptrvalue == NULL) return; 2259 if (IS_Bioseq (top_sep)) { 2260 bsp = (BioseqPtr) top_sep->data.ptrvalue; 2261 if (!ISA_aa (bsp->mol)) { 2262 ProcessOneNuc (entityID, bsp, src, tbl, template_molinfo); 2263 } 2264 } else if (IS_Bioseq_set (top_sep)) { 2265 bssp = (BioseqSetPtr) top_sep->data.ptrvalue; 2266 for (sep = bssp->seq_set; sep != NULL; sep = sep->next) { 2267 ProcessNucBioseqs (sep, entityID, src, tbl, template_molinfo); 2268 } 2269 } 2270 } 2271 2272 2273 static void ProcessOneAnnot ( 2274 SeqAnnotPtr sap, 2275 Uint2 entityID, 2276 TblArgsPtr tbl 2277 ) 2278 2279 { 2280 BioseqPtr bsp; 2281 Int2 genCode; 2282 SeqFeatPtr sfp; 2283 SeqEntryPtr sep; 2284 2285 if (sap == NULL || tbl == NULL) return; 2286 2287 bsp = AttachSeqAnnotEntity (entityID, sap, tbl); 2288 if (bsp == NULL) return; 2289 2290 sep = GetTopSeqEntryForEntityID (entityID); 2291 2292 /* correct all idx parent pointers */ 2293 2294 AssignIDsInEntity 
(entityID, 0, NULL); 2295 2296 genCode = GetGenCodeForBsp (bsp); 2297 2298 /* coercion of SeqIds to accession moved to ProcessOneRecord->MakeAccessionID */ 2299 2300 /* for parsed in features or best ORF, promote CDS products to protein bioseq */ 2301 2302 for (sap = bsp->annot; sap != NULL; sap = sap->next) { 2303 if (sap->type == 1) { 2304 SetEmptyGeneticCodes (sap, genCode); 2305 sfp = (SeqFeatPtr) sap->data; 2306 PromoteXrefsExEx (sfp, bsp, entityID, TRUE, FALSE, tbl->genprodset, tbl->forcelocalid); 2307 } 2308 } 2309 sep = GetTopSeqEntryForEntityID (entityID); 2310 } 2311 2312 static void UpdateException ( 2313 SeqFeatPtr sfp, 2314 CharPtr text 2315 ) 2316 2317 { 2318 size_t len; 2319 CharPtr str; 2320 2321 if (sfp == NULL) return; 2322 2323 sfp->excpt = TRUE; 2324 2325 if (sfp->except_text == NULL) { 2326 sfp->except_text = StringSave (text); 2327 } else { 2328 len = StringLen (sfp->except_text) + StringLen (text) + 5; 2329 str = MemNew (sizeof (Char) * len); 2330 StringCpy (str, sfp->except_text); 2331 StringCat (str, ","); 2332 StringCat (str, text); 2333 sfp->except_text = MemFree (sfp->except_text); 2334 sfp->except_text = str; 2335 } 2336 } 2337 2338 static void ReplaceOnePeptide ( 2339 SimpleSeqPtr ssp, 2340 Boolean conflict, 2341 Boolean genprodset 2342 ) 2343 2344 { 2345 Uint1 aa; 2346 ByteStorePtr bs; 2347 BioseqPtr bsp, gen; 2348 SeqFeatPtr cds; 2349 CdRegionPtr crp; 2350 SeqMgrDescContext dcontext; 2351 MolInfoPtr mip; 2352 SeqFeatPtr prt; 2353 SeqDescrPtr sdp; 2354 SeqIntPtr sintp; 2355 SeqIdPtr sip; 2356 SeqLocPtr slp; 2357 CharPtr str, str1, str2; 2358 ValNodePtr vnp; 2359 2360 if (ssp == NULL || ssp->numid < 1) return; 2361 2362 str = ssp->id [0]; 2363 if (StringHasNoText (str)) { 2364 str = "?"; 2365 } 2366 sip = MakeSeqID (str); 2367 bsp = BioseqFind (sip); 2368 SeqIdFree (sip); 2369 if (bsp == NULL) { 2370 Message (MSG_POSTERR, "Unable to find protein sequence %s", str); 2371 } 2372 if (bsp == NULL || bsp->repr != Seq_repr_raw) return; 2373 
2374 if (bsp->seq_data_type == Seq_code_gap) return; 2375 2376 if (! ISA_aa (bsp->mol)) { 2377 Message (MSG_POSTERR, "Will not replace mRNA sequence %s with protein", str); 2378 return; 2379 } 2380 2381 /* remove trailing X and * - now just trailing star */ 2382 2383 bs = ssp->seq; 2384 BSSeek (bs, -1, SEEK_END); 2385 aa = (Uint1) BSGetByte (bs); 2386 while (( /* aa == 'X' || */ aa == '*') && ssp->seqlen > 0) { 2387 BSSeek (bs, -1, SEEK_END); 2388 BSDelete (bs, 1); 2389 BSSeek (bs, -1, SEEK_END); 2390 aa = (Uint1) BSGetByte (bs); 2391 } 2392 ssp->seqlen = BSLen (bs); 2393 2394 str1 = BSMerge (ssp->seq, NULL); 2395 str2 = BSMerge ((ByteStorePtr) bsp->seq_data, NULL); 2396 2397 if (StringCmp (str1, str2) != 0) { 2398 2399 /* swap sequence byte stores */ 2400 2401 bs = (ByteStorePtr) bsp->seq_data; 2402 bsp->seq_data = (SeqDataPtr) ssp->seq; 2403 ssp->seq = bs; 2404 bsp->length = BSLen ((ByteStorePtr) bsp->seq_data); 2405 bsp->seq_data_type = Seq_code_ncbieaa; 2406 2407 if (genprodset) { 2408 2409 /* SeqMgrGetCDSgivenProduct here would return CDS within nuc-prot set, not genomic */ 2410 2411 for (vnp = SeqMgrGetSfpProductList (bsp); vnp != NULL; vnp = vnp->next) { 2412 cds = (SeqFeatPtr) vnp->data.ptrvalue; 2413 if (cds == NULL || cds->data.choice != SEQFEAT_CDREGION) continue; 2414 gen = BioseqFindFromSeqLoc (cds->location); 2415 if (gen == NULL) continue; 2416 2417 sdp = SeqMgrGetNextDescriptor (gen, NULL, Seq_descr_molinfo, &dcontext); 2418 if (sdp == NULL) continue; 2419 mip = (MolInfoPtr) sdp->data.ptrvalue; 2420 if (mip == NULL) continue; 2421 2422 if (mip->biomol == MOLECULE_TYPE_GENOMIC) { 2423 2424 UpdateException (cds, "translated product replaced"); 2425 2426 } else if (mip->biomol == MOLECULE_TYPE_MRNA) { 2427 2428 crp = (CdRegionPtr) cds->data.value.ptrvalue; 2429 if (crp != NULL && conflict) { 2430 2431 /* mark CDS in nuc-prot set for coordinate adjustment */ 2432 2433 crp->conflict = TRUE; 2434 } 2435 } 2436 } 2437 2438 } else { 2439 2440 cds = 
SeqMgrGetCDSgivenProduct (bsp, NULL); 2441 if (cds != NULL) { 2442 UpdateException (cds, "translated product replaced"); 2443 } 2444 } 2445 2446 prt = SeqMgrGetBestProteinFeature (bsp, NULL); 2447 if (prt != NULL) { 2448 slp = prt->location; 2449 if (slp != NULL && slp->choice == SEQLOC_INT) { 2450 sintp = (SeqIntPtr) slp->data.ptrvalue; 2451 if (sintp != NULL) { 2452 sintp->to = bsp->length - 1; 2453 } 2454 } 2455 } 2456 } 2457 2458 MemFree (str1); 2459 MemFree (str2); 2460 } 2461 2462 static void ReplaceOneRNA ( 2463 SimpleSeqPtr ssp, 2464 Boolean conflict 2465 ) 2466 2467 { 2468 ByteStorePtr bs; 2469 BioseqPtr bsp; 2470 SeqMgrFeatContext ccontext; 2471 SeqFeatPtr cds, mrna; 2472 SeqIntPtr sintp; 2473 SeqIdPtr sip; 2474 SeqLocPtr slp; 2475 CharPtr str, str1, str2; 2476 2477 if (ssp == NULL || ssp->numid < 1) return; 2478 2479 str = ssp->id [0]; 2480 if (StringHasNoText (str)) { 2481 str = "?"; 2482 } 2483 sip = MakeSeqID (str); 2484 bsp = BioseqFind (sip); 2485 SeqIdFree (sip); 2486 if (bsp == NULL) { 2487 Message (MSG_POSTERR, "Unable to find mRNA sequence %s", str); 2488 } 2489 if (bsp == NULL || bsp->repr != Seq_repr_raw) return; 2490 if (! 
ISA_na (bsp->mol)) { 2491 Message (MSG_POSTERR, "Will not replace protein sequence %s with mRNA", str); 2492 return; 2493 } 2494 2495 /* remove trailing X and * */ 2496 2497 bs = ssp->seq; 2498 ssp->seqlen = BSLen (bs); 2499 2500 str1 = BSMerge (ssp->seq, NULL); 2501 str2 = GetSequenceByBsp (bsp); 2502 2503 if (StringCmp (str1, str2) != 0) { 2504 2505 /* swap sequence byte stores */ 2506 2507 bs = (ByteStorePtr) bsp->seq_data; 2508 bsp->seq_data = (SeqDataPtr) ssp->seq; 2509 ssp->seq = bs; 2510 bsp->length = BSLen ((ByteStorePtr) bsp->seq_data); 2511 bsp->seq_data_type = Seq_code_iupacna; 2512 BioseqPack (bsp); 2513 2514 mrna = SeqMgrGetRNAgivenProduct (bsp, NULL); 2515 if (mrna != NULL) { 2516 UpdateException (mrna, "transcribed product replaced"); 2517 2518 /* 2519 if (conflict) { 2520 mrna->excpt = TRUE; 2521 if (StringHasNoText (mrna->except_text)) { 2522 mrna->except_text = StringSave ("RNA editing"); 2523 } 2524 } 2525 */ 2526 } 2527 2528 /* make sure CDS in nuc-prot set is not longer than just-replaced RNA */ 2529 2530 cds = SeqMgrGetNextFeature (bsp, NULL, SEQFEAT_CDREGION, 0, &ccontext); 2531 if (cds != NULL) { 2532 slp = cds->location; 2533 if (slp != NULL && slp->choice == SEQLOC_INT) { 2534 sintp = (SeqIntPtr) slp->data.ptrvalue; 2535 if (sintp != NULL) { 2536 if (sintp->from == 0 && sintp->to > bsp->length - 1) { 2537 sintp->to = bsp->length - 1; 2538 } 2539 } 2540 } 2541 } 2542 } 2543 2544 MemFree (str1); 2545 MemFree (str2); 2546 } 2547 2548 static SeqLocPtr PredictOneCodingRegion (BioseqPtr nucbsp, BioseqPtr protbsp, Int2 genCode) 2549 2550 { 2551 BioseqPtr bsp; 2552 SeqLocPtr oldslp; 2553 SeqAnnotPtr sap; 2554 SeqFeatPtr sfp; 2555 SeqIdPtr sip; 2556 SeqLocPtr slp; 2557 2558 slp = NULL; 2559 sap = SuggestCodingRegion (nucbsp, protbsp, genCode); 2560 if (sap != NULL && sap->type == 1) { 2561 sfp = (SeqFeatPtr) sap->data; 2562 if (sfp != NULL && sfp->data.choice == SEQFEAT_CDREGION) { 2563 slp = sfp->location; 2564 sfp->location = NULL; 2565 sip = 
SeqLocId (slp); 2566 if (sip != NULL) { 2567 bsp = BioseqFind (sip); 2568 if (bsp != NULL) { 2569 if (bsp->repr == Seq_repr_seg) { 2570 oldslp = slp; 2571 slp = SegLocToParts (bsp, oldslp); 2572 FreeAllFuzz (slp); 2573 SeqLocFree (oldslp); 2574 } 2575 } 2576 } 2577 } 2578 } 2579 sap = SeqAnnotFree (sap); 2580 StripLocusFromSeqLoc (slp); 2581 return slp; 2582 } 2583 2584 static void SuggestOnePeptide ( 2585 BioseqPtr nucbsp, 2586 BioseqPtr protbsp, 2587 Int2 genCode 2588 ) 2589 2590 { 2591 SeqFeatPtr cds; 2592 CdRegionPtr crp; 2593 SeqFeatPtr gene; 2594 GeneRefPtr grp; 2595 Boolean partial5; 2596 Boolean partial3; 2597 ProtRefPtr prp; 2598 SeqFeatPtr prt; 2599 SeqLocPtr slp; 2600 SqnTagPtr stp; 2601 CharPtr ttl; 2602 ValNodePtr vnp; 2603 2604 if (nucbsp == NULL || protbsp == NULL) return; 2605 slp = PredictOneCodingRegion (nucbsp, protbsp, genCode); 2606 if (slp == NULL) return; 2607 2608 crp = CreateNewCdRgn (0, FALSE, genCode); 2609 if (crp != NULL) { 2610 CheckSeqLocForPartial (slp, &partial5, &partial3); 2611 2612 cds = CreateNewFeatureOnBioseq (nucbsp, SEQFEAT_CDREGION, slp); 2613 if (cds != NULL) { 2614 cds->data.value.ptrvalue = (Pointer) crp; 2615 cds->partial |= partial5 | partial3; 2616 SetSeqFeatProduct (cds, protbsp); 2617 } 2618 2619 if (protbsp->descr != NULL) { 2620 vnp = ValNodeExtract (&(protbsp->descr), Seq_descr_title); 2621 if (vnp != NULL) { 2622 ttl = (CharPtr) vnp->data.ptrvalue; 2623 if (ttl != NULL) { 2624 stp = SqnTagParse (ttl); 2625 if (stp != NULL) { 2626 2627 prp = ProtRefNew (); 2628 prp = ParseTitleIntoProtRef (stp, prp); 2629 if (prp != NULL) { 2630 if (prp->name == NULL && prp->desc == NULL) { 2631 prp->name = ValNodeCopyStr (NULL, 0, "unknown"); 2632 } 2633 prt = CreateNewFeatureOnBioseq (protbsp, SEQFEAT_PROT, NULL); 2634 if (prt != NULL) { 2635 prt->data.value.ptrvalue = (Pointer) prp; 2636 prt->partial |= partial5 | partial3; 2637 } 2638 } 2639 2640 grp = GeneRefNew (); 2641 grp = ParseTitleIntoGeneRef (stp, grp); 2642 if (grp 
!= NULL) { 2643 if (grp->locus == NULL && grp->syn == NULL) { 2644 GeneRefFree (grp); 2645 } else { 2646 gene = CreateNewFeatureOnBioseq (nucbsp, SEQFEAT_GENE, NULL); 2647 if (gene != NULL) { 2648 gene->data.value.ptrvalue = (Pointer) grp; 2649 gene->partial |= partial5 | partial3; 2650 gene->location = SeqLocFree (gene->location); 2651 gene->location = SeqLocMerge (nucbsp, slp, NULL, TRUE, TRUE, TRUE); 2652 } 2653 } 2654 } 2655 2656 SqnTagFree (stp); 2657 } 2658 } 2659 2660 ValNodeFreeData (vnp); 2661 } 2662 } 2663 } 2664 2665 SeqLocFree (slp); 2666 } 2667 2668 static void RnaProtTrailingCommaFix (SeqFeatPtr sfp, Pointer userdata) 2669 2670 { 2671 Char ch; 2672 size_t len; 2673 ProtRefPtr prp; 2674 RnaRefPtr rrp; 2675 CharPtr str; 2676 ValNodePtr vnp; 2677 2678 if (sfp == NULL) return; 2679 2680 if (sfp->data.choice == SEQFEAT_PROT) { 2681 prp = (ProtRefPtr) sfp->data.value.ptrvalue; 2682 /* turn trailing space into trailing underscore for validator */ 2683 for (vnp = prp->name; vnp != NULL; vnp = vnp->next) { 2684 str = (CharPtr) vnp->data.ptrvalue; 2685 if (StringHasNoText (str)) continue; 2686 len = StringLen (str); 2687 if (len < 1) continue; 2688 ch = str [len - 1]; 2689 while (ch == ' ' && len > 2) { 2690 len--; 2691 ch = str [len - 1]; 2692 } 2693 if (ch == ',') { 2694 str [len - 1] = '_'; 2695 str [len] = '\0'; 2696 } 2697 } 2698 } else if (sfp->data.choice == SEQFEAT_RNA) { 2699 rrp = (RnaRefPtr) sfp->data.value.ptrvalue; 2700 /* turn trailing space into trailing underscore for validator */ 2701 if (rrp->ext.choice == 1) { 2702 str = rrp->ext.value.ptrvalue; 2703 if (StringDoesHaveText (str)) { 2704 len = StringLen (str); 2705 if (len > 0) { 2706 ch = str [len - 1]; 2707 while (ch == ' ' && len > 2) { 2708 len--; 2709 ch = str [len - 1]; 2710 } 2711 if (ch == ',') { 2712 str [len - 1] = '_'; 2713 str [len] = '\0'; 2714 } 2715 } 2716 } 2717 } 2718 } 2719 } 2720 2721 static Uint2 ProcessOneAsn ( 2722 FILE* fp, 2723 BioSourcePtr src, 2724 TblArgsPtr tbl, 
2725 CharPtr localname, 2726 SeqEntryPtr gsep, 2727 MolInfoPtr template_molinfo 2728 ) 2729 2730 { 2731 BioseqPtr bsp = NULL; 2732 BioseqSetPtr bssp; 2733 Pointer dataptr; 2734 Uint2 datatype, entityID; 2735 ObjMgrDataPtr omdptop; 2736 ObjMgrData omdata; 2737 Uint2 parenttype; 2738 Pointer parentptr; 2739 SeqEntryPtr sep; 2740 SeqIdPtr sip; 2741 2742 if (fp == NULL) return 0; 2743 2744 if (gsep != NULL) { 2745 bssp = (BioseqSetPtr) gsep->data.ptrvalue; 2746 if (bssp == NULL) return 0; 2747 2748 SaveSeqEntryObjMgrData (gsep, &omdptop, &omdata); 2749 GetSeqEntryParent (gsep, &parentptr, &parenttype); 2750 2751 dataptr = ReadAsnFastaOrFlatFile (fp, &datatype, NULL, TRUE, FALSE, TRUE, FALSE); 2752 if (datatype == OBJ_BIOSEQ) { 2753 bssp->seq_set = SeqMgrGetSeqEntryForData (dataptr); 2754 SeqMgrSeqEntry (SM_BIOSEQ, (Pointer) dataptr, gsep); 2755 } else if (datatype == OBJ_BIOSEQSET) { 2756 bssp->seq_set = SeqMgrGetSeqEntryForData (dataptr); 2757 SeqMgrSeqEntry (SM_BIOSEQSET, (Pointer) dataptr, gsep); 2758 } else if (datatype == OBJ_SEQENTRY) { 2759 sep = (SeqEntryPtr) dataptr; 2760 bssp->seq_set = sep; 2761 if (IS_Bioseq (sep)) { 2762 SeqMgrSeqEntry (SM_BIOSEQ, (Pointer) sep->data.ptrvalue, gsep); 2763 } else if (IS_Bioseq_set (sep)) { 2764 SeqMgrSeqEntry (SM_BIOSEQSET, (Pointer) sep->data.ptrvalue, gsep); 2765 } else return 0; 2766 } else return 0; 2767 2768 SeqMgrLinkSeqEntry (gsep, parenttype, parentptr); 2769 RestoreSeqEntryObjMgrData (gsep, omdptop, &omdata); 2770 2771 entityID = ObjMgrRegister (OBJ_BIOSEQSET, (Pointer) bssp); 2772 } else { 2773 dataptr = ReadAsnFastaOrFlatFile (fp, &datatype, &entityID, TRUE, FALSE, TRUE, FALSE); 2774 } 2775 if (dataptr == NULL) return 0; 2776 2777 sep = GetTopSeqEntryForEntityID (entityID); 2778 bsp = FindNucBioseq (sep); 2779 if (bsp == NULL) { 2780 ObjMgrFreeByEntityID (entityID); 2781 return 0; 2782 } 2783 2784 VisitFeaturesInSep (sep, NULL, RnaProtTrailingCommaFix); 2785 2786 if (StringDoesHaveText (localname)) { 2787 sip = 
MakeSeqID (localname); 2788 if (sip != NULL) { 2789 bsp->id = SeqIdSetFree (bsp->id); 2790 bsp->id = sip; 2791 SeqMgrReplaceInBioseqIndex (bsp); 2792 VisitFeaturesOnBsp (bsp, (Pointer) bsp->id, CorrectFeatureSeqIds); 2793 } 2794 } 2795 2796 ProcessNucBioseqs (sep, entityID, src, tbl, template_molinfo); 2797 2798 return entityID; 2799 } 2800 2801 static Uint2 ProcessRaw2Delt ( 2802 FILE* fp, 2803 BioSourcePtr src, 2804 TblArgsPtr tbl, 2805 CharPtr localname, 2806 SeqEntryPtr gsep, 2807 MolInfoPtr template_molinfo 2808 ) 2809 2810 { 2811 BioseqPtr bsp = NULL; 2812 BioseqSetPtr bssp; 2813 Pointer dataptr; 2814 Uint2 datatype, entityID; 2815 Int4 gap_sizes [2]; 2816 ObjMgrDataPtr omdptop; 2817 ObjMgrData omdata; 2818 Uint2 parenttype; 2819 Pointer parentptr; 2820 SeqEntryPtr sep; 2821 SeqIdPtr sip; 2822 2823 if (fp == NULL) return 0; 2824 2825 if (gsep != NULL) { 2826 bssp = (BioseqSetPtr) gsep->data.ptrvalue; 2827 if (bssp == NULL) return 0; 2828 2829 SaveSeqEntryObjMgrData (gsep, &omdptop, &omdata); 2830 GetSeqEntryParent (gsep, &parentptr, &parenttype); 2831 2832 dataptr = ReadAsnFastaOrFlatFile (fp, &datatype, NULL, TRUE, FALSE, TRUE, FALSE); 2833 if (datatype == OBJ_BIOSEQ) { 2834 bssp->seq_set = SeqMgrGetSeqEntryForData (dataptr); 2835 SeqMgrSeqEntry (SM_BIOSEQ, (Pointer) dataptr, gsep); 2836 } else if (datatype == OBJ_BIOSEQSET) { 2837 bssp->seq_set = SeqMgrGetSeqEntryForData (dataptr); 2838 SeqMgrSeqEntry (SM_BIOSEQSET, (Pointer) dataptr, gsep); 2839 } else if (datatype == OBJ_SEQENTRY) { 2840 sep = (SeqEntryPtr) dataptr; 2841 bssp->seq_set = sep; 2842 if (IS_Bioseq (sep)) { 2843 SeqMgrSeqEntry (SM_BIOSEQ, (Pointer) sep->data.ptrvalue, gsep); 2844 } else if (IS_Bioseq_set (sep)) { 2845 SeqMgrSeqEntry (SM_BIOSEQSET, (Pointer) sep->data.ptrvalue, gsep); 2846 } else return 0; 2847 } else return 0; 2848 2849 SeqMgrLinkSeqEntry (gsep, parenttype, parentptr); 2850 RestoreSeqEntryObjMgrData (gsep, omdptop, &omdata); 2851 2852 entityID = ObjMgrRegister (OBJ_BIOSEQSET, 
(Pointer) bssp); 2853 } else { 2854 dataptr = ReadAsnFastaOrFlatFile (fp, &datatype, &entityID, TRUE, FALSE, TRUE, FALSE); 2855 } 2856 if (dataptr == NULL) return 0; 2857 2858 sep = GetTopSeqEntryForEntityID (entityID); 2859 bsp = FindNucBioseq (sep); 2860 if (bsp == NULL) { 2861 ObjMgrFreeByEntityID (entityID); 2862 return 0; 2863 } 2864 2865 VisitFeaturesInSep (sep, NULL, RnaProtTrailingCommaFix); 2866 2867 if (StringDoesHaveText (localname)) { 2868 sip = MakeSeqID (localname); 2869 if (sip != NULL) { 2870 bsp->id = SeqIdSetFree (bsp->id); 2871 bsp->id = sip; 2872 SeqMgrReplaceInBioseqIndex (bsp); 2873 VisitFeaturesOnBsp (bsp, (Pointer) bsp->id, CorrectFeatureSeqIds); 2874 } 2875 } 2876 2877 if (bsp->repr == Seq_repr_raw) { 2878 if (tbl->r2dunk100) { 2879 gap_sizes [0] = 100; 2880 } else { 2881 gap_sizes [0] = 0; 2882 } 2883 gap_sizes [1] = -(tbl->r2dmin); 2884 2885 ConvertNsToGaps (bsp, gap_sizes); 2886 } 2887 2888 ProcessOneNuc (entityID, bsp, src, tbl, template_molinfo); 2889 2890 return entityID; 2891 } 2892 2893 static Uint2 ProcessGappedSet ( 2894 FILE* fp, 2895 BioSourcePtr src, 2896 TblArgsPtr tbl, 2897 SeqEntryPtr gsep, 2898 MolInfoPtr template_molinfo 2899 ) 2900 2901 { 2902 BioseqPtr bsp = NULL; 2903 BioseqSetPtr bssp; 2904 Uint2 entityID; 2905 ObjMgrDataPtr omdptop; 2906 ObjMgrData omdata; 2907 Uint2 parenttype; 2908 Pointer parentptr; 2909 SeqEntryPtr sep; 2910 2911 if (fp == NULL) return 0; 2912 2913 if (gsep != NULL) { 2914 bssp = (BioseqSetPtr) gsep->data.ptrvalue; 2915 if (bssp == NULL) return 0; 2916 2917 SaveSeqEntryObjMgrData (gsep, &omdptop, &omdata); 2918 GetSeqEntryParent (gsep, &parentptr, &parenttype); 2919 2920 bsp = ReadDeltaFasta (fp, NULL); 2921 if (bsp != NULL) { 2922 sep = SeqMgrGetSeqEntryForData (bsp); 2923 bssp->seq_set = sep; 2924 SeqMgrSeqEntry (SM_BIOSEQ, (Pointer) bsp, gsep); 2925 } else return 0; 2926 2927 SeqMgrLinkSeqEntry (gsep, parenttype, parentptr); 2928 RestoreSeqEntryObjMgrData (gsep, omdptop, &omdata); 2929 2930 
entityID = ObjMgrRegister (OBJ_BIOSEQSET, (Pointer) bssp); 2931 } else { 2932 bsp = ReadDeltaFasta (fp, NULL); 2933 if (bsp != NULL) { 2934 entityID = ObjMgrRegister (OBJ_BIOSEQ, (Pointer) bsp); 2935 } 2936 } 2937 if (bsp == NULL) return 0; 2938 2939 sep = GetTopSeqEntryForEntityID (entityID); 2940 bsp = FindNucBioseq (sep); 2941 if (bsp == NULL) { 2942 ObjMgrFreeByEntityID (entityID); 2943 return 0; 2944 } 2945 2946 VisitFeaturesInSep (sep, NULL, RnaProtTrailingCommaFix); 2947 2948 ProcessOneNuc (entityID, bsp, src, tbl, template_molinfo); 2949 2950 return entityID; 2951 } 2952 2953 typedef struct resqseqgph { 2954 Int2 index; 2955 SeqGraphPtr sgp; 2956 } ResqSeqgph, PNTR ResqSeqgphPtr; 2957 2958 static void RescueSeqGraphs ( 2959 BioseqPtr bsp, 2960 Int2 index, 2961 ValNodePtr PNTR vnpp 2962 ) 2963 2964 { 2965 SeqAnnotPtr nextsap; 2966 SeqGraphPtr nextsgp; 2967 Pointer PNTR prevsap; 2968 Pointer PNTR prevsgp; 2969 ResqSeqgphPtr rsp; 2970 SeqAnnotPtr sap; 2971 SeqGraphPtr sgp; 2972 2973 if (bsp == NULL || vnpp == NULL) return; 2974 sap = bsp->annot; 2975 prevsap = (Pointer PNTR) &(bsp->annot); 2976 while (sap != NULL) { 2977 nextsap = sap->next; 2978 if (sap->type == 3) { 2979 sgp = (SeqGraphPtr) sap->data; 2980 prevsgp = (Pointer PNTR) &(sap->data); 2981 while (sgp != NULL) { 2982 nextsgp = sgp->next; 2983 *(prevsgp) = sgp->next; 2984 sgp->next = NULL; 2985 rsp = (ResqSeqgphPtr) MemNew (sizeof (ResqSeqgph)); 2986 rsp->index = index; 2987 rsp->sgp = sgp; 2988 ValNodeAddPointer (vnpp, 0, (Pointer) rsp); 2989 sgp = nextsgp; 2990 } 2991 } 2992 if (sap->data == NULL) { 2993 *(prevsap) = sap->next; 2994 sap->next = NULL; 2995 SeqAnnotFree (sap); 2996 } else { 2997 prevsap = (Pointer PNTR) &(sap->next); 2998 } 2999 sap = nextsap; 3000 } 3001 } 3002 3003 static SeqAnnotPtr NewSeqAnnotType3 ( 3004 CharPtr name, 3005 SeqGraphPtr sgp 3006 ) 3007 3008 { 3009 SeqAnnotPtr sap = NULL; 3010 3011 if (sgp == NULL) return NULL; 3012 sap = SeqAnnotNew (); 3013 if (sap == NULL) 
return NULL; 3014 3015 if (StringDoesHaveText (name)) { 3016 SeqDescrAddPointer (&(sap->desc), Annot_descr_name, StringSave (name)); 3017 } 3018 sap->type = 3; 3019 sap->data = (Pointer) sgp; 3020 3021 return sap; 3022 } 3023 3024 static void OffsetAndLinkSeqGraph ( 3025 BioseqPtr bsp, 3026 SeqGraphPtr sgp, 3027 Int2 index 3028 ) 3029 3030 { 3031 DeltaSeqPtr dsp; 3032 SeqGraphPtr lastsgp; 3033 Int4 len; 3034 SeqLitPtr litp; 3035 SeqAnnotPtr sap; 3036 SeqIntPtr sintp; 3037 SeqLocPtr slp; 3038 3039 if (bsp == NULL || sgp == NULL || index < 1) return; 3040 len = 0; 3041 if (bsp->repr == Seq_repr_delta && bsp->seq_ext_type == 4) { 3042 for (dsp = (DeltaSeqPtr) (bsp->seq_ext); 3043 dsp != NULL && index > 1; dsp = dsp->next, index--) { 3044 if (dsp->choice == 1) { 3045 len += SeqLocLen ((SeqLocPtr) dsp->data.ptrvalue); 3046 } else if (dsp->choice == 2) { 3047 litp = (SeqLitPtr) dsp->data.ptrvalue; 3048 if (litp != NULL) { 3049 len += litp->length; 3050 } 3051 } 3052 } 3053 } 3054 slp = sgp->loc; 3055 if (slp != NULL && slp->choice == SEQLOC_INT) { 3056 sintp = (SeqIntPtr) slp->data.ptrvalue; 3057 if (sintp != NULL) { 3058 sintp->from += len; 3059 sintp->to += len; 3060 sintp->id = SeqIdFree (sintp->id); 3061 sintp->id = SeqIdDup (bsp->id); 3062 } 3063 } 3064 for (sap = bsp->annot; sap != NULL; sap = sap->next) { 3065 if (sap->type == 3) { 3066 for (lastsgp = sap->data; lastsgp->next != NULL; lastsgp = lastsgp->next) { 3067 continue; 3068 } 3069 lastsgp->next = sgp; 3070 break; 3071 } 3072 } 3073 if (sap == NULL) { 3074 if (bsp->annot != NULL) { 3075 for (sap = bsp->annot; sap->next != NULL; sap = sap->next) { 3076 continue; 3077 } 3078 sap->next = NewSeqAnnotType3 ("Phrap Graph", sgp); 3079 } else { 3080 bsp->annot = NewSeqAnnotType3 ("Phrap Graph", sgp); 3081 } 3082 } 3083 } 3084 3085 static CharPtr BioseqGetLocalIdStr ( 3086 BioseqPtr bsp 3087 ) 3088 3089 { 3090 ObjectIdPtr oip; 3091 SeqIdPtr sip; 3092 3093 if (bsp == NULL) return NULL; 3094 for (sip = bsp->id; sip != 
NULL; sip = sip->next) { 3095 if (sip->choice == SEQID_LOCAL) { 3096 oip = (ObjectIdPtr) sip->data.ptrvalue; 3097 if (oip != NULL && oip->str != NULL) { 3098 return oip->str; 3099 } 3100 } 3101 } 3102 return NULL; 3103 } 3104 3105 typedef struct reqcontig { 3106 Int2 index; 3107 Char str [41]; 3108 } ResqContig, PNTR ResqContigPtr; 3109 3110 #define MAX_FIELDS 8 3111 3112 static CharPtr ReadContigFile ( 3113 CharPtr directory, 3114 CharPtr base, 3115 ValNodePtr PNTR fragmentgroupsp, 3116 CharPtr dumsp6, 3117 CharPtr dumt7, 3118 CharPtr PNTR sp6_clonep, 3119 CharPtr PNTR sp6_endp, 3120 CharPtr PNTR t7_clonep, 3121 CharPtr PNTR t7_endp 3122 ) 3123 3124 { 3125 Char buf [256], instr [120]; 3126 FileCache fc; 3127 CharPtr field [MAX_FIELDS]; 3128 FILE *fp; 3129 int frg; 3130 Boolean left_end, right_end, nonewline; 3131 Int4 len; 3132 Int2 numFields; 3133 CharPtr pstring = NULL, ptr, str, sp6_end = NULL, t7_end = NULL; 3134 ValNodePtr rescuedcontigs = NULL, vnp; 3135 3136 fp = OpenOneFile (directory, base, ".ctg"); 3137 if (fp == NULL) return NULL; 3138 3139 FileCacheSetup (&fc, fp); 3140 3141 str = FileCacheReadLine (&fc, buf, sizeof (buf), &nonewline); 3142 while (str != NULL) { 3143 MemSet ((Pointer) field, 0, sizeof (field)); 3144 3145 /* 3146 * parse tab-delimited output line into array of fields, avoiding use of 3147 * strtok so that empty columns (adjacent tabs) are properly assigned to 3148 * field array 3149 */ 3150 3151 ptr = buf; 3152 for (numFields = 0; numFields < MAX_FIELDS && ptr != NULL; numFields++) { 3153 field [numFields] = ptr; 3154 ptr = StringChr (ptr, '\t'); 3155 if (ptr == NULL) { 3156 ptr = StringChr (ptr, '\n'); 3157 } 3158 if (ptr == NULL) { 3159 ptr = StringChr (ptr, '\r'); 3160 } 3161 if (ptr != NULL) { 3162 *ptr = '\0'; 3163 ptr++; 3164 } 3165 } 3166 3167 if (StringDoesHaveText (field [0])) { 3168 StringNCpy_0 (instr, field [0], sizeof (instr) - 2); 3169 if (StringDoesHaveText (field [1])) { 3170 if (StringNICmp (field [1], "-", 1) == 0) { 
3171 StringCat (instr, "-"); 3172 } 3173 } 3174 ValNodeCopyStr (&rescuedcontigs, 0, instr); 3175 if (StringDoesHaveText (field [2])) { 3176 if (sscanf (field [2], "%d", &frg) == 1) { 3177 ValNodeCopyStr (fragmentgroupsp, (Uint1) frg, field [0]); 3178 } 3179 } 3180 left_end = FALSE; 3181 right_end = FALSE; 3182 if (StringDoesHaveText (field [3])) { 3183 if (StringDoesHaveText (field [4])) { 3184 if (StringNICmp (field [4], "l", 1) == 0) { 3185 left_end = TRUE; 3186 } else if (StringNICmp (field [4], "r", 1) == 0) { 3187 right_end = TRUE; 3188 } 3189 } 3190 if (StringICmp (field [3], "sp6") == 0) { 3191 StringCpy (dumsp6, field [0]); 3192 if (left_end) { 3193 StringCat (dumsp6, ",left"); 3194 } else if (right_end) { 3195 StringCat (dumsp6, ",right"); 3196 } 3197 if (sp6_clonep != NULL && *sp6_clonep == NULL) { 3198 *sp6_clonep = dumsp6; 3199 } 3200 } else if (StringICmp (field [3], "t7") == 0) { 3201 StringCpy (dumt7, field [0]); 3202 if (left_end) { 3203 StringCat (dumt7, ",left"); 3204 } else if (right_end) { 3205 StringCat (dumt7, ",right"); 3206 } 3207 if (t7_clonep != NULL && *t7_clonep == NULL) { 3208 *t7_clonep = dumt7; 3209 } 3210 } 3211 } 3212 } 3213 str = FileCacheReadLine (&fc, buf, sizeof (buf), &nonewline); 3214 } 3215 3216 FileClose (fp); 3217 3218 len = 0; 3219 for (vnp = rescuedcontigs; vnp != NULL; vnp = vnp->next) { 3220 len += StringLen ((CharPtr) vnp->data.ptrvalue) + 1; 3221 } 3222 if (len > 1) { 3223 pstring = MemNew ((size_t) (len + 2)); 3224 for (vnp = rescuedcontigs; vnp != NULL; vnp = vnp->next) { 3225 if (vnp != rescuedcontigs) { 3226 StringCat (pstring, ","); 3227 } 3228 StringCat (pstring, (CharPtr) vnp->data.ptrvalue); 3229 } 3230 } 3231 3232 rescuedcontigs = ValNodeFreeData (rescuedcontigs); 3233 3234 if (sp6_clonep != NULL && *sp6_clonep != NULL) { 3235 sp6_end = StringChr (*sp6_clonep, ','); 3236 if (sp6_end != NULL) { 3237 *sp6_end = '\0'; 3238 sp6_end++; 3239 if (StringICmp (sp6_end, "left") == 0) { 3240 sp6_end = "left"; 3241 } 
else if (StringICmp (sp6_end, "right") == 0) { 3242 sp6_end = "right"; 3243 } else { 3244 sp6_end = NULL; 3245 } 3246 } 3247 if (sp6_endp != NULL) { 3248 *sp6_endp = sp6_end; 3249 } 3250 } 3251 if (t7_clonep != NULL && *t7_clonep != NULL) { 3252 t7_end = StringChr (*t7_clonep, ','); 3253 if (t7_end != NULL) { 3254 *t7_end = '\0'; 3255 t7_end++; 3256 if (StringICmp (t7_end, "left") == 0) { 3257 t7_end = "left"; 3258 } else if (StringICmp (t7_end, "right") == 0) { 3259 t7_end = "right"; 3260 } else { 3261 t7_end = NULL; 3262 } 3263 } 3264 if (t7_endp != NULL) { 3265 *t7_endp = t7_end; 3266 } 3267 } 3268 3269 return pstring; 3270 } 3271 3272 static void MakeAssemblyFragments ( 3273 BioseqPtr bsp, 3274 CharPtr name, 3275 Int2 index, 3276 CharPtr sp6_clone, 3277 CharPtr sp6_end, 3278 CharPtr t7_clone, 3279 CharPtr t7_end, 3280 Uint1 frag 3281 ) 3282 3283 { 3284 DeltaSeqPtr dsp = NULL; 3285 Int4 from, to; 3286 ImpFeatPtr ifp; 3287 SeqLitPtr litp; 3288 SeqFeatPtr sfp; 3289 SeqInt sint; 3290 Char str [128]; 3291 Char tmp [32]; 3292 ValNode vn; 3293 3294 if (bsp == NULL || name == NULL || index < 1) return; 3295 from = 0; 3296 to = 0; 3297 if (bsp->repr == Seq_repr_delta && bsp->seq_ext_type == 4) { 3298 for (dsp = (DeltaSeqPtr) (bsp->seq_ext); 3299 dsp != NULL && index > 1; dsp = dsp->next, index--) { 3300 if (dsp->choice == 1) { 3301 from += SeqLocLen ((SeqLocPtr) dsp->data.ptrvalue); 3302 } else if (dsp->choice == 2) { 3303 litp = (SeqLitPtr) dsp->data.ptrvalue; 3304 if (litp != NULL) { 3305 from += litp->length; 3306 } 3307 } 3308 } 3309 } 3310 if (dsp != NULL && dsp->choice == 2) { 3311 litp = (SeqLitPtr) dsp->data.ptrvalue; 3312 if (litp != NULL) { 3313 to = litp->length + from - 1; 3314 } 3315 } 3316 MemSet ((Pointer) &vn, 0, sizeof (ValNode)); 3317 vn.choice = SEQLOC_INT; 3318 vn.data.ptrvalue = &sint; 3319 3320 MemSet ((Pointer) &sint, 0, sizeof (SeqInt)); 3321 sint.id = SeqIdDup (SeqIdFindBest (bsp->id, 0)); 3322 3323 sint.from = from; 3324 sint.to = to; 3325 
sint.strand = Seq_strand_plus; 3326 3327 ifp = ImpFeatNew (); 3328 if (ifp == NULL) return; 3329 sfp = CreateNewFeatureOnBioseq (bsp, SEQFEAT_IMP, &vn); 3330 if (sfp == NULL) return; 3331 sfp->data.value.ptrvalue = (Pointer) ifp; 3332 ifp->key = StringSave ("misc_feature"); 3333 3334 sprintf (str, "assembly_name:%s", name); 3335 if (frag > 0) { 3336 sprintf (tmp, "~fragment_group:%d", (int) frag); 3337 StringCat (str, tmp); 3338 } 3339 if (StringICmp (name, sp6_clone) == 0) { 3340 StringCat (str, "~clone_end:SP6"); 3341 if (sp6_end != NULL) { 3342 StringCat (str, "~vector_side:"); 3343 StringCat (str, sp6_end); 3344 } 3345 } else if (StringICmp (name, t7_clone) == 0) { 3346 StringCat (str, "~clone_end:T7"); 3347 if (t7_end != NULL) { 3348 StringCat (str, "~vector_side:"); 3349 StringCat (str, t7_end); 3350 } 3351 } 3352 sfp->comment = StringSaveNoNull (str); 3353 } 3354 3355 static Uint2 ProcessPhrapAce ( 3356 FILE* fp, 3357 BioSourcePtr src, 3358 TblArgsPtr tbl, 3359 CharPtr localname, 3360 SeqEntryPtr gsep, 3361 MolInfoPtr template_molinfo, 3362 CharPtr directory, 3363 CharPtr base 3364 ) 3365 3366 { 3367 BioseqPtr bsp, deltabsp; 3368 BioseqSetPtr bssp; 3369 CharPtr contigs; 3370 Boolean do_contig = FALSE; 3371 Char dumsp6 [64], dumt7 [64]; 3372 Uint2 entityID; 3373 SeqEntryPtr firstsep, nextsep, sep, topsep; 3374 Uint1 frag; 3375 IntFuzzPtr ifp; 3376 Int2 index = 0; 3377 Boolean is_unk100, lastwasraw; 3378 ObjMgrDataPtr omdptop; 3379 ObjMgrData omdata; 3380 Uint2 parenttype; 3381 Pointer parentptr; 3382 ResqContigPtr rcp; 3383 ResqSeqgphPtr rsp; 3384 CharPtr seqbuf; 3385 SeqIdPtr sip; 3386 SeqLitPtr slp; 3387 CharPtr sp6_clone = NULL, t7_clone = NULL, sp6_end = NULL, t7_end = NULL; 3388 ValNodePtr rescuedcontigs = NULL, rescuedsgps = NULL, fragmentgroups = NULL, vnp, vnp2; 3389 3390 if (fp == NULL) return 0; 3391 3392 firstsep = ReadPhrapFile (fp); 3393 if (firstsep == NULL) return 0; 3394 3395 dumsp6 [0] = '\0'; 3396 dumt7 [0] = '\0'; 3397 contigs = 
ReadContigFile (directory, base, &fragmentgroups, dumsp6, 3398 dumt7, &sp6_clone, &sp6_end, &t7_clone, &t7_end); 3399 firstsep = SetPhrapContigOrder (firstsep, contigs); 3400 if (firstsep == NULL) return 0; 3401 if (contigs != NULL) { 3402 do_contig = TRUE; 3403 } 3404 3405 /* always make delta, even if one component */ 3406 3407 bsp = FindNucBioseq (firstsep); 3408 if (bsp == NULL) return 0; 3409 3410 sip = SeqIdSetDup (bsp->id); 3411 vnp = ValNodeExtract (&(bsp->descr), Seq_descr_title); 3412 3413 deltabsp = BioseqNew (); 3414 if (deltabsp == NULL) return 0; 3415 deltabsp->repr = Seq_repr_delta; 3416 deltabsp->seq_ext_type = 4; 3417 deltabsp->mol = Seq_mol_dna; 3418 deltabsp->length = 0; 3419 3420 topsep = SeqEntryNew (); 3421 if (topsep == NULL) return 0; 3422 topsep->choice = 1; 3423 topsep->data.ptrvalue = (Pointer) deltabsp; 3424 3425 SeqMgrSeqEntry (SM_BIOSEQ, (Pointer) deltabsp, topsep); 3426 3427 if (gsep != NULL) { 3428 bssp = (BioseqSetPtr) gsep->data.ptrvalue; 3429 if (bssp == NULL) return 0; 3430 3431 SaveSeqEntryObjMgrData (gsep, &omdptop, &omdata); 3432 GetSeqEntryParent (gsep, &parentptr, &parenttype); 3433 3434 bssp->seq_set = topsep; 3435 SeqMgrSeqEntry (SM_BIOSEQ, (Pointer) deltabsp, gsep); 3436 3437 SeqMgrLinkSeqEntry (gsep, parenttype, parentptr); 3438 RestoreSeqEntryObjMgrData (gsep, omdptop, &omdata); 3439 3440 entityID = ObjMgrRegister (OBJ_BIOSEQSET, (Pointer) bssp); 3441 } else { 3442 entityID = ObjMgrRegister (OBJ_BIOSEQ, (Pointer) deltabsp); 3443 } 3444 3445 lastwasraw = FALSE; 3446 for (sep = firstsep; sep != NULL; sep = nextsep) { 3447 nextsep = sep->next; 3448 sep->next = NULL; 3449 3450 bsp = (BioseqPtr) sep->data.ptrvalue; 3451 if (bsp == NULL) continue; 3452 3453 if (bsp->repr == Seq_repr_raw) { 3454 3455 if (lastwasraw) { 3456 slp = (SeqLitPtr) MemNew (sizeof (SeqLit)); 3457 if (slp == NULL) break; 3458 3459 slp->length = 100 ; 3460 is_unk100 = TRUE; 3461 3462 if (slp->length < 1 || is_unk100) { 3463 if (slp->length < 1) { 3464 
slp->length = 0; 3465 } 3466 ifp = IntFuzzNew (); 3467 ifp->choice = 4; 3468 slp->fuzz = ifp; 3469 } 3470 3471 ValNodeAddPointer ((ValNodePtr PNTR) &(deltabsp->seq_ext), (Int2) 2, (Pointer) slp); 3472 3473 deltabsp->length += slp->length; 3474 index++; 3475 } 3476 3477 BioseqRawConvert (bsp, Seq_code_iupacna); 3478 seqbuf = BSMerge ((ByteStorePtr) bsp->seq_data, NULL); 3479 slp = (SeqLitPtr) MemNew (sizeof (SeqLit)); 3480 if (slp == NULL) continue; 3481 3482 slp->length = bsp->length; 3483 ValNodeAddPointer ((ValNodePtr PNTR) &(deltabsp->seq_ext), (Int2) 2, (Pointer) slp); 3484 slp->seq_data = (SeqDataPtr) BSNew (slp->length); 3485 slp->seq_data_type = Seq_code_iupacna; 3486 AddBasesToByteStore ((ByteStorePtr) slp->seq_data, seqbuf); 3487 MemFree (seqbuf); 3488 lastwasraw = TRUE; 3489 3490 deltabsp->length += slp->length; 3491 index++; 3492 3493 RescueSeqGraphs (bsp, index, &rescuedsgps); 3494 if (do_contig) { 3495 rcp = (ResqContigPtr) MemNew (sizeof (ResqContig)); 3496 if (rcp != NULL) { 3497 rcp->index = index; 3498 StringNCpy_0 (rcp->str, BioseqGetLocalIdStr (bsp), sizeof (rcp->str)); 3499 ValNodeAddPointer (&rescuedcontigs, 0, (Pointer) rcp); 3500 } 3501 } 3502 } 3503 3504 SeqEntryFree (sep); 3505 } 3506 3507 ValNodeLink (&(deltabsp->descr), vnp); 3508 deltabsp->id = sip; 3509 3510 if (deltabsp != NULL) { 3511 for (vnp = rescuedsgps; vnp != NULL; vnp = vnp->next) { 3512 rsp = (ResqSeqgphPtr) vnp->data.ptrvalue; 3513 if (rsp != NULL) { 3514 OffsetAndLinkSeqGraph (deltabsp, rsp->sgp, (Int2) rsp->index); 3515 } 3516 } 3517 for (vnp = rescuedcontigs; vnp != NULL; vnp = vnp->next) { 3518 rcp = (ResqContigPtr) vnp->data.ptrvalue; 3519 if (rcp != NULL) { 3520 frag = 0; 3521 for (vnp2 = fragmentgroups; vnp2 != NULL; vnp2 = vnp2->next) { 3522 if (StringICmp ((CharPtr) vnp2->data.ptrvalue, rcp->str) == 0) { 3523 frag = (Uint1) vnp2->choice; 3524 } 3525 } 3526 MakeAssemblyFragments (deltabsp, rcp->str, (Int2) rcp->index, 3527 sp6_clone, sp6_end, t7_clone, t7_end, frag); 
3528 } 3529 } 3530 } 3531 rescuedsgps = ValNodeFreeData (rescuedsgps); 3532 rescuedcontigs = ValNodeFreeData (rescuedcontigs); 3533 3534 3535 if (gsep == NULL) { 3536 SeqMgrLinkSeqEntry (topsep, 0, NULL); 3537 } 3538 3539 if (StringDoesHaveText (localname)) { 3540 sip = MakeSeqID (localname); 3541 if (sip != NULL) { 3542 bsp->id = SeqIdSetFree (bsp->id); 3543 bsp->id = sip; 3544 SeqMgrReplaceInBioseqIndex (bsp); 3545 VisitFeaturesOnBsp (bsp, (Pointer) bsp->id, CorrectFeatureSeqIds); 3546 } 3547 } 3548 3549 ProcessOneNuc (entityID, deltabsp, src, tbl, template_molinfo); 3550 3551 return entityID; 3552 } 3553 3554 static Uint2 ProcessBulkSet ( 3555 FILE* fp, 3556 BioSourcePtr src, 3557 TblArgsPtr tbl, 3558 MolInfoPtr template_molinfo 3559 ) 3560 3561 { 3562 BioseqPtr bsp; 3563 BioseqSetPtr bssp; 3564 Uint2 entityID; 3565 SeqEntryPtr lastsep, sep, topsep; 3566 /* 3567 Pointer dataptr; 3568 Uint2 datatype; 3569 */ 3570 3571 if (fp == NULL || tbl == NULL) return 0; 3572 3573 bssp = BioseqSetNew (); 3574 if (bssp == NULL) return 0; 3575 3576 switch (tbl->whichclass) { 3577 case 1 : 3578 bssp->_class = BioseqseqSet_class_pop_set; 3579 break; 3580 case 2 : 3581 bssp->_class = BioseqseqSet_class_phy_set; 3582 break; 3583 case 3 : 3584 bssp->_class = BioseqseqSet_class_mut_set; 3585 break; 3586 case 4 : 3587 bssp->_class = BioseqseqSet_class_eco_set; 3588 break; 3589 default : 3590 bssp->_class = BioseqseqSet_class_genbank; 3591 break; 3592 } 3593 3594 topsep = SeqEntryNew (); 3595 if (topsep == NULL) return 0; 3596 topsep->choice = 2; 3597 topsep->data.ptrvalue = (Pointer) bssp; 3598 3599 entityID = ObjMgrRegister (OBJ_BIOSEQSET, (Pointer) bssp); 3600 3601 lastsep = NULL; 3602 3603 /* 3604 while ((dataptr = ReadAsnFastaOrFlatFile (fp, &datatype, NULL, TRUE, FALSE, TRUE, FALSE)) != NULL) { 3605 if (datatype == OBJ_BIOSEQ) { 3606 3607 sep = SeqMgrGetSeqEntryForData (dataptr); 3608 if (lastsep == NULL) { 3609 bssp->seq_set = sep; 3610 } else { 3611 lastsep->next = sep; 3612 } 
3613 lastsep = sep; 3614 3615 bsp = (BioseqPtr) dataptr; 3616 ProcessOneNuc (entityID, bsp, src, tbl); 3617 3618 } else { 3619 ObjMgrFree (datatype, dataptr); 3620 } 3621 } 3622 */ 3623 3624 while ((bsp = ReadDeltaFasta (fp, NULL)) != NULL) { 3625 3626 sep = SeqMgrGetSeqEntryForData (bsp); 3627 if (lastsep == NULL) { 3628 bssp->seq_set = sep; 3629 } else { 3630 lastsep->next = sep; 3631 } 3632 lastsep = sep; 3633 3634 ProcessOneNuc (entityID, bsp, src, tbl, template_molinfo); 3635 } 3636 3637 SeqMgrLinkSeqEntry (topsep, 0, NULL); 3638 3639 return entityID; 3640 } 3641 3642 static SeqEntryPtr FA2SEP ( 3643 FILE *fp 3644 ) 3645 3646 { 3647 BioseqPtr bsp; 3648 Pointer dataptr; 3649 Uint2 datatype; 3650 SeqEntryPtr sep; 3651 3652 if (fp == NULL) return NULL; 3653 3654 dataptr = ReadAsnFastaOrFlatFile (fp, &datatype, NULL, TRUE, FALSE, TRUE, FALSE); 3655 if (datatype == OBJ_BIOSEQ) { 3656 sep = SeqMgrGetSeqEntryForData (dataptr); 3657 if (sep == NULL) { 3658 sep = SeqEntryNew (); 3659 if (sep != NULL) { 3660 bsp = (BioseqPtr) dataptr; 3661 sep->choice = 1; 3662 sep->data.ptrvalue = bsp; 3663 SeqMgrSeqEntry (SM_BIOSEQ, (Pointer) bsp, sep); 3664 } 3665 } 3666 return sep; 3667 } 3668 3669 return NULL; 3670 } 3671 3672 static SeqEntryPtr MakeUnk100GapSep (void) 3673 3674 { 3675 BioseqPtr bsp; 3676 SeqEntryPtr sep; 3677 3678 sep = SeqEntryNew (); 3679 if (sep == NULL) return NULL; 3680 bsp = BioseqNew (); 3681 if (bsp == NULL) return NULL; 3682 bsp->repr = Seq_repr_virtual; 3683 bsp->mol = Seq_mol_na; 3684 bsp->length = 100; 3685 bsp->id = SeqIdParse ("lcl|unk100"); 3686 sep->choice = 1; 3687 sep->data.ptrvalue = bsp; 3688 SeqMgrSeqEntry (SM_BIOSEQ, (Pointer) bsp, sep); 3689 return sep; 3690 } 3691 3692 static Uint2 ProcessDeltaSet ( 3693 FILE* fp, 3694 BioSourcePtr src, 3695 TblArgsPtr tbl, 3696 CharPtr localname, 3697 SeqEntryPtr gsep, 3698 MolInfoPtr template_molinfo 3699 ) 3700 3701 { 3702 BioseqPtr bsp, deltabsp; 3703 BioseqSetPtr bssp; 3704 Uint2 entityID; 3705 
SeqEntryPtr firstsep, lastsep, nextsep, sep, tmp, topsep; 3706 IntFuzzPtr ifp; 3707 Boolean is_unk100; 3708 ObjectIdPtr oip; 3709 ObjMgrDataPtr omdptop; 3710 ObjMgrData omdata; 3711 Uint2 parenttype; 3712 Pointer parentptr; 3713 CharPtr seqbuf; 3714 SeqIdPtr sip, virtid; 3715 SeqLitPtr slp; 3716 ValNodePtr vnp; 3717 3718 if (fp == NULL) return 0; 3719 3720 firstsep = NULL; 3721 lastsep = NULL; 3722 3723 /* 3724 sep = FastaToSeqEntry (fp, TRUE); 3725 */ 3726 sep = FA2SEP (fp); 3727 if (sep == NULL) return 0; 3728 3729 /* loop to collect subsequent entries */ 3730 3731 while (sep != NULL) { 3732 if (firstsep == NULL) { 3733 firstsep = sep; 3734 } 3735 if (tbl->implicitgaps && lastsep != NULL) { 3736 tmp = MakeUnk100GapSep (); 3737 if (tmp != NULL) { 3738 ValNodeLink (&lastsep, tmp); 3739 lastsep = tmp; 3740 } 3741 } 3742 if (lastsep != NULL) { 3743 ValNodeLink (&lastsep, sep); 3744 } 3745 lastsep = sep; 3746 /* 3747 sep = FastaToSeqEntry (fp, TRUE); 3748 */ 3749 sep = FA2SEP (fp); 3750 } 3751 3752 /* if only one FASTA, treat as raw */ 3753 3754 if (firstsep->next == NULL) { 3755 bsp = FindNucBioseq (firstsep); 3756 if (bsp == NULL) return 0; 3757 3758 if (gsep != NULL) { 3759 bssp = (BioseqSetPtr) gsep->data.ptrvalue; 3760 if (bssp == NULL) return 0; 3761 3762 SaveSeqEntryObjMgrData (gsep, &omdptop, &omdata); 3763 GetSeqEntryParent (gsep, &parentptr, &parenttype); 3764 3765 bssp->seq_set = SeqMgrGetSeqEntryForData (bsp); 3766 SeqMgrSeqEntry (SM_BIOSEQ, (Pointer) bsp, gsep); 3767 3768 SeqMgrLinkSeqEntry (gsep, parenttype, parentptr); 3769 RestoreSeqEntryObjMgrData (gsep, omdptop, &omdata); 3770 3771 entityID = ObjMgrRegister (OBJ_BIOSEQSET, (Pointer) bssp); 3772 } else { 3773 entityID = ObjMgrRegister (OBJ_BIOSEQ, (Pointer) bsp); 3774 } 3775 3776 ProcessOneNuc (entityID, bsp, src, tbl, template_molinfo); 3777 return entityID; 3778 } 3779 3780 /* now process delta */ 3781 3782 bsp = FindNucBioseq (firstsep); 3783 if (bsp == NULL) return 0; 3784 3785 sip = SeqIdSetDup 
(bsp->id); 3786 vnp = ValNodeExtract (&(bsp->descr), Seq_descr_title); 3787 3788 deltabsp = BioseqNew (); 3789 if (deltabsp == NULL) return 0; 3790 deltabsp->repr = Seq_repr_delta; 3791 deltabsp->seq_ext_type = 4; 3792 deltabsp->mol = Seq_mol_dna; 3793 deltabsp->length = 0; 3794 3795 topsep = SeqEntryNew (); 3796 if (topsep == NULL) return 0; 3797 topsep->choice = 1; 3798 topsep->data.ptrvalue = (Pointer) deltabsp; 3799 3800 SeqMgrSeqEntry (SM_BIOSEQ, (Pointer) deltabsp, topsep); 3801 3802 if (gsep != NULL) { 3803 bssp = (BioseqSetPtr) gsep->data.ptrvalue; 3804 if (bssp == NULL) return 0; 3805 3806 SaveSeqEntryObjMgrData (gsep, &omdptop, &omdata); 3807 GetSeqEntryParent (gsep, &parentptr, &parenttype); 3808 3809 bssp->seq_set = topsep; 3810 SeqMgrSeqEntry (SM_BIOSEQ, (Pointer) deltabsp, gsep); 3811 3812 SeqMgrLinkSeqEntry (gsep, parenttype, parentptr); 3813 RestoreSeqEntryObjMgrData (gsep, omdptop, &omdata); 3814 3815 entityID = ObjMgrRegister (OBJ_BIOSEQSET, (Pointer) bssp); 3816 } else { 3817 entityID = ObjMgrRegister (OBJ_BIOSEQ, (Pointer) deltabsp); 3818 } 3819 3820 for (sep = firstsep; sep != NULL; sep = nextsep) { 3821 nextsep = sep->next; 3822 sep->next = NULL; 3823 3824 bsp = (BioseqPtr) sep->data.ptrvalue; 3825 if (bsp == NULL) continue; 3826 3827 if (bsp->repr == Seq_repr_raw) { 3828 BioseqRawConvert (bsp, Seq_code_iupacna); 3829 seqbuf = BSMerge ((ByteStorePtr) bsp->seq_data, NULL); 3830 slp = (SeqLitPtr) MemNew (sizeof (SeqLit)); 3831 if (slp == NULL) continue; 3832 3833 slp->length = bsp->length; 3834 ValNodeAddPointer ((ValNodePtr PNTR) &(deltabsp->seq_ext), (Int2) 2, (Pointer) slp); 3835 slp->seq_data = (SeqDataPtr) BSNew (slp->length); 3836 slp->seq_data_type = Seq_code_iupacna; 3837 AddBasesToByteStore ((ByteStorePtr) slp->seq_data, seqbuf); 3838 MemFree(seqbuf); 3839 3840 deltabsp->length += slp->length; 3841 3842 } else if (bsp->repr == Seq_repr_virtual) { 3843 slp = (SeqLitPtr) MemNew (sizeof (SeqLit)); 3844 if (slp == NULL) continue; 3845 3846 
slp->length = bsp->length; 3847 ValNodeAddPointer ((ValNodePtr PNTR) &(deltabsp->seq_ext), (Int2) 2, (Pointer) slp); 3848 3849 is_unk100 = FALSE; 3850 virtid = bsp->id; 3851 if (virtid != NULL && virtid->choice == SEQID_LOCAL) { 3852 oip = (ObjectIdPtr) virtid->data.ptrvalue; 3853 if (oip != NULL) { 3854 if (StringCmp (oip->str, "unk100") == 0) { 3855 is_unk100 = TRUE; 3856 } 3857 } 3858 } 3859 if (slp->length < 1 || is_unk100) { 3860 if (slp->length < 1) { 3861 slp->length = 0; 3862 } 3863 ifp = IntFuzzNew (); 3864 ifp->choice = 4; 3865 slp->fuzz = ifp; 3866 } 3867 3868 deltabsp->length += slp->length; 3869 } 3870 3871 SeqEntryFree (sep); 3872 } 3873 3874 ValNodeLink (&(deltabsp->descr), vnp); 3875 deltabsp->id = sip; 3876 3877 if (gsep == NULL) { 3878 SeqMgrLinkSeqEntry (topsep, 0, NULL); 3879 } 3880 3881 if (StringDoesHaveText (localname)) { 3882 sip = MakeSeqID (localname); 3883 if (sip != NULL) { 3884 bsp->id = SeqIdSetFree (bsp->id); 3885 bsp->id = sip; 3886 SeqMgrReplaceInBioseqIndex (bsp); 3887 VisitFeaturesOnBsp (bsp, (Pointer) bsp->id, CorrectFeatureSeqIds); 3888 } 3889 } 3890 3891 ProcessOneNuc (entityID, deltabsp, src, tbl, template_molinfo); 3892 3893 return entityID; 3894 } 3895

/* Report whether every sequence held in afp has the same string length as
 * the first one.  A NULL afp, a NULL sequence array, or zero sequences all
 * count as "matching" and yield TRUE. */

static Boolean DoSequenceLengthsMatch (
  TAlignmentFilePtr afp
)

{
  Boolean  all_equal = TRUE;
  Int4     first_len;
  int      i;

  if (afp == NULL) return TRUE;
  if (afp->sequences == NULL || afp->num_sequences == 0) return TRUE;

  first_len = StringLen (afp->sequences [0]);
  i = 1;
  while (all_equal && i < afp->num_sequences) {
    if (StringLen (afp->sequences [i]) != first_len) {
      all_equal = FALSE;
    }
    i++;
  }
  return all_equal;
}

/* Print to stdout every entry of error_list (category, optional line
 * number, optional sequence ID, optional message), followed by a summary of
 * afp: sequence and organism counts, one line per sequence ID paired with
 * its organism when one exists, and any organisms left over after the last
 * sequence.  A NULL afp is reported as a catastrophic read failure. */

static void ShowAlignmentNotes (
  TAlignmentFilePtr afp,
  TErrorInfoPtr error_list
)

{
  TErrorInfoPtr  err;
  Int4           i;

  err = error_list;
  while (err != NULL) {
    printf ("*****\nError category %d\n", err->category);
    if (err->line_num > -1) {
      printf ("Line number %d\n", err->line_num);
    }
    if (err->id != NULL) {
      printf ("Sequence ID %s\n", err->id);
    }
    if (err->message != NULL) {
      printf ("%s\n", err->message);
    }
    err = err->next;
  }

  if (afp == NULL) {
    printf ("Catastrophic failure during reading\n");
    return;
  }

  printf ("Found %d sequences\n", afp->num_sequences);
  printf ("Found %d organisms\n", afp->num_organisms);

  i = 0;
  while (i < afp->num_sequences) {
    printf ("\t%s\t", afp->ids [i]);
    if (i >= afp->num_organisms) {
      printf ("No organism information\n");
    } else {
      printf ("%s\n", afp->organisms [i]);
    }
    i++;
  }

  /* i now equals num_sequences; anything beyond it has no sequence */
  for (; i < afp->num_organisms; i++) {
    printf ("Unclaimed organism: %s\n", afp->organisms [i]);
  }
}

/* Read an alignment from fp and register the resulting sequences.
 *
 * The alphabet and the gap/missing/match characters handed to the reader
 * come from tbl, each falling back to a built-in default when the tbl field
 * has no text.  Reader diagnostics are printed through ShowAlignmentNotes.
 * No entry is built when no organisms were read and src is NULL, or when a
 * non-zero organism count differs from the sequence count.  If sequence
 * lengths differ, the user is asked on stdin whether to go on; only a 'y'
 * or 'Y' reply continues.
 *
 * Returns the entityID from ObjMgrRegister, or 0 when fp is NULL, the
 * options object cannot be allocated, or no usable Seq-entry was produced. */

static Uint2 ProcessAlignSet (
  FILE *fp,
  BioSourcePtr src,
  TblArgsPtr tbl,
  MolInfoPtr template_molinfo
)

{
  TAlignmentFilePtr  aln;
  BioseqPtr          bsp;
  BioseqSetPtr       bssp;
  Uint2              entityID;
  TErrorInfoPtr      errors;
  TSequenceInfoPtr   info;
  SeqEntryPtr        member;
  Uint1              moltype = Seq_mol_dna;
  Char               nucleotide_alphabet [] = "ABCDGHKMRSTUVWXYabcdghkmrstuvwxy";
  Char               protein_alphabet [] = "ABCDEFGHIKLMPQRSTUVWXYZabcdefghiklmpqrstuvwxyz";
  ReadBufferData     rbd;
  Char               reply;
  SeqEntryPtr        sep = NULL;

  if (fp == NULL) return 0;

  info = SequenceInfoNew ();
  if (info == NULL) return 0;

  /* alphabet follows the command-line molecule choice */

  if (tbl->aln_is_protein) {
    moltype = Seq_mol_aa;
    info->alphabet = protein_alphabet;
  } else {
    moltype = Seq_mol_dna;
    info->alphabet = nucleotide_alphabet;
  }

  /* each option: drop the value set by SequenceInfoNew, then store either
     the command-line string or the default */

  info->beginning_gap = MemFree (info->beginning_gap);
  info->beginning_gap = StringSave (StringHasNoText (tbl->aln_beginning_gap) ? ".-?" : tbl->aln_beginning_gap);

  info->middle_gap = MemFree (info->middle_gap);
  info->middle_gap = StringSave (StringHasNoText (tbl->aln_middle_gap) ? "-" : tbl->aln_middle_gap);

  info->end_gap = MemFree (info->end_gap);
  info->end_gap = StringSave (StringHasNoText (tbl->aln_end_gap) ? ".-?" : tbl->aln_end_gap);

  info->missing = MemFree (info->missing);
  info->missing = StringSave (StringHasNoText (tbl->aln_missing) ? "Nn?" : tbl->aln_missing);

  info->match = MemFree (info->match);
  info->match = StringSave (StringHasNoText (tbl->aln_match) ? "." : tbl->aln_match);

  errors = NULL;
  rbd.fp = fp;
  rbd.current_data = NULL;
  aln = ReadAlignmentFile (AbstractReadFunction,
                           (Pointer) &rbd,
                           AbstractReportError,
                           (Pointer) &errors,
                           info);

  ShowAlignmentNotes (aln, errors);
  ErrorInfoFree (errors);

  if (aln != NULL) {
    if (aln->num_organisms == 0 && src == NULL) {
      printf ("No organisms supplied!\n");
    } else if (aln->num_organisms != 0 && aln->num_organisms != aln->num_sequences) {
      printf ( "Number of organisms must match number of sequences!");
    } else {
      reply = 'y';
      if (! DoSequenceLengthsMatch (aln)) {
        printf ("Sequences are not all the same length - are you sure you want to continue?");
        reply = getchar ();
      }
      if (reply == 'y' || reply == 'Y') {
        sep = MakeSequinDataFromAlignment (aln, moltype);
      }
    }
  }

  SequenceInfoFree (info);
  AlignmentFileFree (aln);

  if (sep == NULL || sep->data.ptrvalue == NULL) return 0;

  if (IS_Bioseq (sep)) {
    bsp = (BioseqPtr) sep->data.ptrvalue;
    entityID = ObjMgrRegister (OBJ_BIOSEQ, (Pointer) bsp);
    ProcessOneNuc (entityID, bsp, src, tbl, template_molinfo);
  } else if (IS_Bioseq_set (sep)) {
    bssp = (BioseqSetPtr) sep->data.ptrvalue;
    bssp->_class = BioseqseqSet_class_phy_set;
    entityID = ObjMgrRegister (OBJ_BIOSEQSET, (Pointer) bssp);
    member = bssp->seq_set;
    while (member != NULL) {
      if (IS_Bioseq (member)) {
        bsp = (BioseqPtr) member->data.ptrvalue;
        ProcessOneNuc (entityID, bsp, src, tbl, template_molinfo);
      }
      member = member->next;
    }
  } else {
    return 0;
  }

  SeqMgrLinkSeqEntry (sep, 0, NULL);

  return entityID;
}

/* Wrap sgp in a newly allocated Seq-annot.  When name has text, a copy of it
 * is attached as an Annot_descr_name descriptor.  Returns NULL when sgp is
 * NULL or the annotation cannot be allocated.  The annotation stores sgp
 * itself, not a copy. */

static SeqAnnotPtr NewGraphSeqAnnot (
  CharPtr name,
  SeqGraphPtr sgp
)

{
  SeqAnnotPtr  annot;

  if (sgp == NULL) return NULL;

  annot = SeqAnnotNew ();
  if (annot != NULL) {
    if (StringDoesHaveText (name)) {
      SeqDescrAddPointer (&(annot->desc), Annot_descr_name, StringSave (name));
    }
    annot->type = 3;  /* presumably the Seq-annot code for graph data - confirm */
    annot->data = (Pointer) sgp;
  }

  return annot;
}

/* nucleotide and protein Bioseq found while visiting one set */

typedef struct npsseqs {
  BioseqPtr  nuc;
  BioseqPtr  prot;
} NpsSeqs, PNTR NpsSeqsPtr;

4106 static void FindNucProtSeqs ( 4107 BioseqPtr bsp, 4108 Pointer userdata 4109 ) 4110 4111 { 4112 NpsSeqsPtr nsp; 4113 4114 if (bsp == NULL) return; 4115 nsp = (NpsSeqsPtr) userdata; 4116 if (nsp == NULL) return; 4117 4118 if (ISA_na (bsp->mol)) { 4119 nsp->nuc = bsp; 4120 } else if (ISA_aa (bsp->mol)) { 4121 nsp->prot
= bsp; 4122 } 4123 } 4124 4125 static Boolean InRightNps ( 4126 CharPtr gbqval, 4127 SeqIdPtr protids, 4128 Boolean force_local_id 4129 ) 4130 4131 { 4132 Int2 adv; 4133 Char id [64]; 4134 Char lcl [64]; 4135 SeqIdPtr sip = NULL; 4136 CharPtr ptr; 4137 Boolean rsult; 4138 long int val; 4139 Uint4 version = 0; 4140 4141 StringNCpy_0 (id, gbqval, sizeof (id)); 4142 if (StringDoesHaveText (id)) { 4143 if (StringChr (id, '|') != NULL) { 4144 sip = SeqIdParse (id); 4145 } else if (force_local_id) { 4146 sprintf (lcl, "lcl|%s", id); 4147 sip = SeqIdParse (lcl); 4148 } else { 4149 adv = ValidateAccnDotVer (id); 4150 if (adv == 0 || adv == -5) { 4151 ptr = StringChr (id, '.'); 4152 if (ptr != NULL) { 4153 *ptr = '\0'; 4154 ptr++; 4155 if (sscanf (ptr, "%ld", &val) == 1) { 4156 version = (Uint4) val; 4157 } 4158 } 4159 sip = SeqIdFromAccession (id, version, NULL); 4160 } else { 4161 sprintf (lcl, "lcl|%s", id); 4162 sip = SeqIdParse (lcl); 4163 } 4164 } 4165 } 4166 if (sip == NULL) return FALSE; 4167 rsult = SeqIdIn (sip, protids); 4168 SeqIdFree (sip); 4169 return rsult; 4170 } 4171 4172 static void MakeNucProtCDS ( 4173 BioseqSetPtr bssp, 4174 Pointer userdata 4175 ) 4176 4177 { 4178 CodeBreakPtr cbp; 4179 SeqFeatPtr cds; 4180 CdRegionPtr crp; 4181 GBQualPtr gbq; 4182 Char id [64]; 4183 SeqFeatPtr mrna; 4184 GBQualPtr nextqual; 4185 NpsSeqs ns; 4186 Boolean partial5, partial3; 4187 GBQualPtr PNTR prevqual; 4188 SeqFeatPtr sfp; 4189 SeqIdPtr sip; 4190 SeqLocPtr slp; 4191 Int4 start, stop; 4192 TblArgsPtr tbl; 4193 SeqFeatPtr temp; 4194 4195 tbl = (TblArgsPtr) userdata; 4196 if (tbl == NULL) return; 4197 4198 ns.nuc = NULL; 4199 ns.prot = NULL; 4200 if (VisitBioseqsInSet (bssp, (Pointer) &ns, FindNucProtSeqs) != 2) return; 4201 if (ns.nuc == NULL || ns.prot == NULL) return; 4202 4203 cds = SeqMgrGetCDSgivenProduct (ns.prot, NULL); 4204 mrna = SeqMgrGetRNAgivenProduct (ns.nuc, NULL); 4205 if (cds == NULL || mrna == NULL) return; 4206 4207 CheckSeqLocForPartial 
(cds->location, &partial5, &partial3); 4208 4209 start = GetOffsetInLoc (cds->location, mrna->location, SEQLOC_START); 4210 stop = GetOffsetInLoc (cds->location, mrna->location, SEQLOC_STOP); 4211 4212 if (start < 0 || start >= ns.nuc->length || 4213 stop < 0 || stop >= ns.nuc->length) return; 4214 4215 sip = SeqIdFindBest (ns.nuc->id, 0); 4216 if (sip == NULL) return; 4217 4218 /* copy cds feature fields to paste into new cds feature */ 4219 temp = AsnIoMemCopy (cds, 4220 (AsnReadFunc) SeqFeatAsnRead, 4221 (AsnWriteFunc) SeqFeatAsnWrite); 4222 if (temp == NULL) return; 4223 4224 sfp = CreateNewFeatureOnBioseq (ns.nuc, SEQFEAT_CDREGION, NULL); 4225 if (sfp == NULL) return; 4226 4227 sfp->location = SeqLocFree (sfp->location); 4228 if (StringISearch (cds->except_text, "ribosomal slippage") == NULL && 4229 StringISearch (cds->except_text, "ribosome slippage") == NULL && 4230 StringISearch (cds->except_text, "trans splicing") == NULL && 4231 StringISearch (cds->except_text, "trans-splicing") == NULL && 4232 StringISearch (cds->except_text, "artificial frameshift") == NULL) { 4233 sfp->location = AddIntervalToLocation (NULL, sip, start, stop, partial5, partial3); 4234 } else { 4235 slp = SeqLocFindNext (cds->location, NULL); 4236 while (slp != NULL) { 4237 start = GetOffsetInLoc (slp, mrna->location, SEQLOC_START); 4238 stop = GetOffsetInLoc (slp, mrna->location, SEQLOC_STOP); 4239 sfp->location = AddIntervalToLocation (sfp->location, sip, start, stop, partial5, partial3); 4240 slp = SeqLocFindNext (cds->location, slp); 4241 } 4242 sfp->location = SeqLocMergeEx (ns.nuc, sfp->location, NULL, FALSE, TRUE, FALSE, FALSE); 4243 } 4244 SetSeqFeatProduct (sfp, ns.prot); 4245 4246 /* paste fields from temp copy of original cds */ 4247 crp = (CdRegionPtr) temp->data.value.ptrvalue; 4248 sfp->data.value.ptrvalue = (Pointer) crp; 4249 4250 sfp->partial = temp->partial; 4251 sfp->excpt = temp->excpt; 4252 sfp->comment = temp->comment; 4253 sfp->qual = temp->qual; 4254 sfp->title = 
temp->title; 4255 sfp->ext = temp->ext; 4256 sfp->cit = temp->cit; 4257 sfp->exp_ev = temp->exp_ev; 4258 sfp->xref = temp->xref; 4259 sfp->dbxref = temp->dbxref; 4260 sfp->pseudo = temp->pseudo; 4261 sfp->except_text = temp->except_text; 4262 4263 MemFree (temp); /* do not SeqFeatFree */ 4264 4265 /* update code break locations */ 4266 for (cbp = crp->code_break; cbp != NULL; cbp = cbp->next) { 4267 CheckSeqLocForPartial (cbp->loc, &partial5, &partial3); 4268 start = GetOffsetInLoc (cbp->loc, mrna->location, SEQLOC_START); 4269 stop = GetOffsetInLoc (cbp->loc, mrna->location, SEQLOC_STOP); 4270 if (start < 0 || start >= ns.nuc->length || 4271 stop < 0 || stop >= ns.nuc->length) continue; 4272 cbp->loc = SeqLocFree (cbp->loc); 4273 cbp->loc = AddIntervalToLocation (NULL, sip, start, stop, partial5, partial3);; 4274 } 4275 4276 /* get rid of protein_id in mRNA if it matches protein Seq-id */ 4277 gbq = mrna->qual; 4278 prevqual = (GBQualPtr PNTR) &(mrna->qual); 4279 id [0] = '\0'; 4280 sip = NULL; 4281 while (gbq != NULL) { 4282 nextqual = gbq->next; 4283 if (StringICmp (gbq->qual, "protein_id") == 0 && 4284 InRightNps (gbq->val, ns.prot->id, tbl->forcelocalid)) { 4285 *(prevqual) = gbq->next; 4286 gbq->next = NULL; 4287 StringNCpy_0 (id, gbq->val, sizeof (id)); 4288 GBQualFree (gbq); 4289 } else { 4290 prevqual = (GBQualPtr PNTR) &(gbq->next); 4291 } 4292 gbq = nextqual; 4293 } 4294 } 4295 4296 /* copy gene from contig onto nuc-prot, single interval on cdna bioseq */ 4297 4298 static void CopyGene ( 4299 SeqFeatPtr sfp, 4300 Pointer userdata 4301 ) 4302 4303 { 4304 BioseqPtr bsp; 4305 SeqMgrFeatContext gcontext; 4306 SeqFeatPtr gene, copy, temp; 4307 GeneRefPtr grp, xref; 4308 Boolean partial5, partial3; 4309 4310 /* input mrna features are multi-interval on contig */ 4311 4312 if (sfp->data.choice != SEQFEAT_RNA) return; 4313 4314 /* find cdna product of mrna */ 4315 4316 bsp = BioseqFindFromSeqLoc (sfp->product); 4317 if (bsp == NULL) return; 4318 4319 /* check 
for gene xref */ 4320 4321 xref = SeqMgrGetGeneXref (sfp); 4322 if (xref != NULL) { 4323 if (SeqMgrGeneIsSuppressed (xref)) return; 4324 4325 /* copy gene xref for new gene feature */ 4326 4327 grp = AsnIoMemCopy (xref, 4328 (AsnReadFunc) GeneRefAsnRead, 4329 (AsnWriteFunc) GeneRefAsnWrite); 4330 if (grp == NULL) return; 4331 4332 /* make new gene feature on full-length of cdna */ 4333 4334 copy = CreateNewFeatureOnBioseq (bsp, SEQFEAT_GENE, NULL); 4335 if (copy == NULL) return; 4336 4337 copy->data.value.ptrvalue = grp; 4338 return; 4339 } 4340 4341 /* overlapping gene should be single interval on contig */ 4342 4343 gene = SeqMgrGetOverlappingGene (sfp->location, &gcontext); 4344 if (gene == NULL) return; 4345 4346 CheckSeqLocForPartial (gene->location, &partial5, &partial3); 4347 4348 /* copy gene feature fields to paste into new gene feature */ 4349 4350 temp = AsnIoMemCopy (gene, 4351 (AsnReadFunc) SeqFeatAsnRead, 4352 (AsnWriteFunc) SeqFeatAsnWrite); 4353 if (temp == NULL) return; 4354 4355 /* make new gene feature on full-length of cdna */ 4356 4357 copy = CreateNewFeatureOnBioseq (bsp, SEQFEAT_GENE, NULL); 4358 if (copy == NULL) { 4359 SeqFeatFree (temp); 4360 return; 4361 } 4362 4363 /* paste fields from temp copy of original gene */ 4364 4365 copy->data.value.ptrvalue = temp->data.value.ptrvalue; 4366 copy->partial = temp->partial; 4367 copy->excpt = temp->excpt; 4368 copy->comment = temp->comment; 4369 copy->qual = temp->qual; 4370 copy->title = temp->title; 4371 copy->ext = temp->ext; 4372 copy->cit = temp->cit; 4373 copy->exp_ev = temp->exp_ev; 4374 copy->xref = temp->xref; 4375 copy->dbxref = temp->dbxref; 4376 copy->pseudo = temp->pseudo; 4377 copy->except_text = temp->except_text; 4378 4379 SetSeqLocPartial (copy->location, partial5, partial3); 4380 4381 SeqLocFree (temp->location); 4382 MemFree (temp); /* do not SeqFeatFree */ 4383 } 4384 4385 static void CopyNcRna ( 4386 SeqFeatPtr sfp, 4387 Pointer userdata 4388 ) 4389 4390 { 4391 BioseqPtr bsp; 
4392 SeqFeatPtr copy, temp; 4393 Boolean partial5, partial3; 4394 4395 if (sfp->data.choice != SEQFEAT_RNA) return; 4396 if (sfp->idx.subtype != FEATDEF_ncRNA) return; 4397 4398 /* find instantiated product of ncRNA */ 4399 4400 bsp = BioseqFindFromSeqLoc (sfp->product); 4401 if (bsp == NULL) return; 4402 4403 CheckSeqLocForPartial (sfp->location, &partial5, &partial3); 4404 4405 /* copy ncRNA feature fields to paste into new ncRNA feature */ 4406 4407 temp = AsnIoMemCopy (sfp, 4408 (AsnReadFunc) SeqFeatAsnRead, 4409 (AsnWriteFunc) SeqFeatAsnWrite); 4410 if (temp == NULL) return; 4411 4412 /* make new ncRNA feature on full-length of transcript */ 4413 4414 copy = CreateNewFeatureOnBioseq (bsp, SEQFEAT_RNA, NULL); 4415 if (copy == NULL) { 4416 SeqFeatFree (temp); 4417 return; 4418 } 4419 4420 /* paste fields from temp copy of original ncRNA */ 4421 4422 copy->data.value.ptrvalue = temp->data.value.ptrvalue; 4423 copy->partial = temp->partial; 4424 copy->excpt = temp->excpt; 4425 copy->comment = temp->comment; 4426 copy->qual = temp->qual; 4427 copy->title = temp->title; 4428 copy->ext = temp->ext; 4429 copy->cit = temp->cit; 4430 copy->exp_ev = temp->exp_ev; 4431 copy->xref = temp->xref; 4432 copy->dbxref = temp->dbxref; 4433 copy->pseudo = temp->pseudo; 4434 copy->except_text = temp->except_text; 4435 4436 SetSeqLocPartial (copy->location, partial5, partial3); 4437 4438 SeqLocFree (temp->location); 4439 SeqLocFree (temp->product); 4440 MemFree (temp); /* do not SeqFeatFree */ 4441 } 4442

/* VisitFeatures callback: free and clear the product location of an RNA
 * feature.  Non-RNA features, and RNA features with no product, are left
 * alone.  userdata is not used. */

static void ClearRnaProducts (
  SeqFeatPtr sfp,
  Pointer userdata
)

{
  if (sfp == NULL) return;

  if (sfp->data.choice == SEQFEAT_RNA && sfp->product != NULL) {
    sfp->product = SeqLocFree (sfp->product);
  }
}

/* VisitFeatures callback: unlink and free every "transcript_id" and
 * "protein_id" qualifier (compared case-insensitively) on a coding-region
 * or RNA feature.  Other feature types are untouched.  userdata is not
 * used. */

static void RemoveGBQualIDs (
  SeqFeatPtr sfp,
  Pointer userdata
)

{
  GBQualPtr       cur;
  GBQualPtr PNTR  link;

  if (sfp->data.choice != SEQFEAT_CDREGION && sfp->data.choice != SEQFEAT_RNA) return;

  /* link always addresses the pointer that leads to cur, so removal is a
     single store */

  link = (GBQualPtr PNTR) &(sfp->qual);
  cur = *link;
  while (cur != NULL) {
    if (StringICmp (cur->qual, "transcript_id") == 0 ||
        StringICmp (cur->qual, "protein_id") == 0) {
      *link = cur->next;
      cur->next = NULL;
      GBQualFree (cur);
    } else {
      link = (GBQualPtr PNTR) &(cur->next);
    }
    cur = *link;
  }
}

/* first two protein features with processed == 0 found on a Bioseq */

typedef struct dupprot {
  SeqFeatPtr  firstprot;
  SeqFeatPtr  secondprot;
} DupProt, PNTR DupProtPtr;

/* VisitFeatures callback: record in the DupProt passed as userdata the
 * first and second protein features whose ProtRef has processed == 0.
 * Later ones are ignored. */

static void FindDupProtFeats (
  SeqFeatPtr sfp,
  Pointer userdata
)

{
  ProtRefPtr  prot;
  DupProtPtr  slots;

  if (sfp == NULL) return;
  if (sfp->data.choice != SEQFEAT_PROT) return;

  slots = (DupProtPtr) userdata;
  prot = (ProtRefPtr) sfp->data.value.ptrvalue;
  if (slots == NULL || prot == NULL) return;
  if (prot->processed != 0) return;

  if (slots->firstprot == NULL) {
    slots->firstprot = sfp;
    return;
  }
  if (slots->secondprot == NULL) {
    slots->secondprot = sfp;
  }
}

/* VisitFeatures callback: for region, site, bond and protein features, set
 * the strand of every interval in the location to Seq_strand_unknown.
 * userdata is not used. */

static void ClearProtFeatStrand (
  SeqFeatPtr sfp,
  Pointer userdata
)

{
  SeqIntPtr  interval;
  SeqLocPtr  piece;

  if (sfp == NULL) return;

  switch (sfp->data.choice) {
    case SEQFEAT_REGION :
    case SEQFEAT_SITE :
    case SEQFEAT_BOND :
    case SEQFEAT_PROT :
      break;
    default :
      return;
  }

  for (piece = SeqLocFindNext (sfp->location, NULL);
       piece != NULL;
       piece = SeqLocFindNext (sfp->location, piece)) {
    if (piece->choice != SEQLOC_INT) continue;
    interval = (SeqIntPtr) piece->data.ptrvalue;
    if (interval == NULL) continue;
    interval->strand = Seq_strand_unknown;
  }
}

4538 static void RemoveDupProtFeats ( 4539 BioseqPtr bsp, 4540 Pointer userdata 4541 ) 4542 4543 { 4544 DupProt dp; 4545 4546 if (bsp == NULL)
return; 4547 if (! ISA_aa (bsp->mol)) return; 4548 VisitFeaturesOnBsp (bsp, NULL, ClearProtFeatStrand); 4549 dp.firstprot = NULL; 4550 dp.secondprot = NULL; 4551 VisitFeaturesOnBsp (bsp, (Pointer) &dp, FindDupProtFeats); 4552 if (dp.firstprot == NULL || dp.secondprot == NULL) return; 4553 if (AsnIoMemComp ((Pointer) dp.firstprot, (Pointer) dp.secondprot, (AsnWriteFunc) SeqFeatAsnWrite)) { 4554 dp.firstprot->idx.deleteme = TRUE; 4555 } 4556 } 4557 4558 /* 4559 static void RemoveUnnecGeneXref ( 4560 SeqFeatPtr sfp, 4561 Pointer userdata 4562 ) 4563 4564 { 4565 SeqFeatXrefPtr curr, next; 4566 SeqFeatXrefPtr PNTR last; 4567 GeneRefPtr grp, grpx; 4568 Boolean redundantgenexref; 4569 SeqFeatPtr sfpx; 4570 CharPtr syn1, syn2; 4571 4572 if (sfp == NULL || sfp->data.choice == SEQFEAT_GENE) return; 4573 grp = SeqMgrGetGeneXref (sfp); 4574 if (grp == NULL || SeqMgrGeneIsSuppressed (grp)) return; 4575 sfpx = SeqMgrGetOverlappingGene (sfp->location, NULL); 4576 if (sfpx == NULL || sfpx->data.choice != SEQFEAT_GENE) return; 4577 grpx = (GeneRefPtr) sfpx->data.value.ptrvalue; 4578 if (grpx == NULL) return; 4579 4580 redundantgenexref = FALSE; 4581 if (StringDoesHaveText (grp->locus) && StringDoesHaveText (grpx->locus)) { 4582 if ((StringICmp (grp->locus, grpx->locus) == 0)) { 4583 redundantgenexref = TRUE; 4584 } 4585 } else if (StringDoesHaveText (grp->locus_tag) && StringDoesHaveText (grpx->locus_tag)) { 4586 if ((StringICmp (grp->locus_tag, grpx->locus_tag) == 0)) { 4587 redundantgenexref = TRUE; 4588 } 4589 } else if (grp->syn != NULL && grpx->syn != NULL) { 4590 syn1 = (CharPtr) grp->syn->data.ptrvalue; 4591 syn2 = (CharPtr) grpx->syn->data.ptrvalue; 4592 if (StringDoesHaveText (syn1) && StringDoesHaveText (syn2)) { 4593 if (StringICmp (syn1, syn2) == 0) { 4594 redundantgenexref = TRUE; 4595 } 4596 } 4597 } 4598 4599 if (redundantgenexref) { 4600 last = (SeqFeatXrefPtr PNTR) &(sfp->xref); 4601 curr = sfp->xref; 4602 while (curr != NULL) { 4603 next = curr->next; 4604 if 
(curr->data.choice == SEQFEAT_GENE) { 4605 *last = next; 4606 curr->next = NULL; 4607 SeqFeatXrefFree (curr); 4608 } else { 4609 last = &(curr->next); 4610 } 4611 curr = next; 4612 } 4613 } 4614 } 4615 */ 4616 4617 typedef struct dummysmfedata { 4618 Int4 max; 4619 Int4 num_at_max; 4620 } DummySmfeData, PNTR DummySmfePtr; 4621 4622 static Boolean LIBCALLBACK T2ADummySMFEProc ( 4623 SeqFeatPtr sfp, 4624 SeqMgrFeatContextPtr context 4625 ) 4626 4627 4628 { 4629 DummySmfePtr dsp; 4630 Int4 len; 4631 4632 if (sfp == NULL || context == NULL) return TRUE; 4633 dsp = context->userdata; 4634 if (dsp == NULL) return TRUE; 4635 4636 len = SeqLocLen (sfp->location); 4637 if (len < dsp->max) { 4638 dsp->max = len; 4639 dsp->num_at_max = 1; 4640 } else if (len == dsp->max) { 4641 (dsp->num_at_max)++; 4642 } 4643 4644 return TRUE; 4645 } 4646 4647 static void FillInPartialGeneXref ( 4648 SeqFeatPtr sfp, 4649 Pointer userdata 4650 ) 4651 4652 { 4653 BioseqPtr bsp; 4654 SeqMgrFeatContext context; 4655 SeqFeatPtr gene; 4656 GeneRefPtr grp, grpx; 4657 4658 if (sfp == NULL || sfp->data.choice == SEQFEAT_GENE) return; 4659 4660 grp = SeqMgrGetGeneXref (sfp); 4661 if (grp == NULL || SeqMgrGeneIsSuppressed (grp)) return; 4662 if (StringDoesHaveText (grp->locus) || StringHasNoText (grp->locus_tag)) return; 4663 4664 bsp = BioseqFindFromSeqLoc (sfp->location); 4665 if (bsp == NULL) return; 4666 gene = SeqMgrGetGeneByLocusTag (bsp, grp->locus_tag, &context); 4667 if (gene == NULL || gene->data.choice != SEQFEAT_GENE) return; 4668 grpx = (GeneRefPtr) gene->data.value.ptrvalue; 4669 if (grpx == NULL) return; 4670 4671 if (StringHasNoText (grpx->locus)) return; 4672 grp->locus = StringSave (grpx->locus); 4673 } 4674 4675 static void RemoveUnnecGeneXref ( 4676 SeqFeatPtr sfp, 4677 Pointer userdata 4678 ) 4679 4680 { 4681 Int2 count; 4682 SeqFeatXrefPtr curr, next; 4683 DummySmfeData dsd; 4684 SeqMgrFeatContext fcontext; 4685 SeqFeatXrefPtr PNTR last; 4686 GeneRefPtr grp, grpx; 4687 SeqFeatPtr 
sfpx; 4688 CharPtr syn1, syn2; 4689 4690 if (sfp == NULL || sfp->data.choice == SEQFEAT_GENE) return; 4691 grp = SeqMgrGetGeneXref (sfp); 4692 if (grp == NULL || SeqMgrGeneIsSuppressed (grp)) return; 4693 sfpx = SeqMgrGetOverlappingGene (sfp->location, &fcontext); 4694 if (sfpx == NULL || sfpx->data.choice != SEQFEAT_GENE) return; 4695 grpx = (GeneRefPtr) sfpx->data.value.ptrvalue; 4696 if (grpx == NULL) return; 4697 4698 if ((!StringHasNoText (grp->locus)) && (!StringHasNoText (grpx->locus))) { 4699 if ((StringICmp (grp->locus, grpx->locus) != 0)) return; 4700 } else if (StringDoesHaveText (grp->locus_tag) && StringDoesHaveText (grp->locus_tag)) { 4701 if ((StringICmp (grp->locus_tag, grpx->locus_tag) != 0)) return; 4702 } else if (grp->syn != NULL && grpx->syn != NULL) { 4703 syn1 = (CharPtr) grp->syn->data.ptrvalue; 4704 syn2 = (CharPtr) grpx->syn->data.ptrvalue; 4705 if ((!StringHasNoText (syn1)) && (!StringHasNoText (syn2))) { 4706 if ((StringICmp (syn1, syn2) != 0)) return; 4707 } 4708 } 4709 4710 MemSet ((Pointer) &dsd, 0, sizeof (DummySmfeData)); 4711 dsd.max = INT4_MAX; 4712 dsd.num_at_max = 0; 4713 count = SeqMgrGetAllOverlappingFeatures (sfp->location, FEATDEF_GENE, NULL, 0, 4714 LOCATION_SUBSET, (Pointer) &dsd, T2ADummySMFEProc); 4715 4716 if (dsd.num_at_max < 2) { 4717 last = (SeqFeatXrefPtr PNTR) &(sfp->xref); 4718 curr = sfp->xref; 4719 while (curr != NULL) { 4720 next = curr->next; 4721 if (curr->data.choice == SEQFEAT_GENE) { 4722 *last = next; 4723 curr->next = NULL; 4724 SeqFeatXrefFree (curr); 4725 } else { 4726 last = &(curr->next); 4727 } 4728 curr = next; 4729 } 4730 } 4731 } 4732 4733 static CharPtr RnaTypeLabel ( 4734 SeqFeatPtr rna 4735 ) 4736 4737 { 4738 if (rna == NULL) return "RNA"; 4739 switch (rna->idx.subtype) { 4740 case FEATDEF_preRNA : 4741 return "preRNA"; 4742 case FEATDEF_mRNA : 4743 return "mRNA"; 4744 case FEATDEF_tRNA : 4745 return "tRNA"; 4746 case FEATDEF_rRNA : 4747 return "rRNA"; 4748 case FEATDEF_snRNA : 4749 return 
"snRNA"; 4750 case FEATDEF_scRNA : 4751 return "scRNA"; 4752 case FEATDEF_otherRNA : 4753 return "otherRNA"; 4754 case FEATDEF_snoRNA : 4755 return "snoRNA"; 4756 case FEATDEF_ncRNA : 4757 return "ncRNA"; 4758 case FEATDEF_tmRNA : 4759 return "tmRNA"; 4760 default : 4761 break; 4762 } 4763 return "RNA"; 4764 } 4765 4766 static void AddRnaTitles ( 4767 SeqFeatPtr rna, 4768 CharPtr organism 4769 ) 4770 4771 { 4772 BioseqPtr bsp; 4773 SeqMgrFeatContext ccontext; 4774 CharPtr cdslabel = NULL; 4775 SeqMgrFeatContext gcontext; 4776 CharPtr genelabel = NULL; 4777 size_t len; 4778 SeqFeatPtr sfp; 4779 CharPtr str; 4780 CharPtr typ = NULL; 4781 4782 if (rna == NULL || rna->product == NULL) return; 4783 bsp = BioseqFindFromSeqLoc (rna->product); 4784 if (bsp == NULL) return; 4785 if (! ISA_na (bsp->mol)) return; 4786 if (BioseqGetTitle (bsp) != NULL) return; 4787 sfp = SeqMgrGetNextFeature (bsp, NULL, SEQFEAT_GENE, 0, &gcontext); 4788 if (sfp != NULL) { 4789 genelabel = gcontext.label; 4790 if (StringHasNoText (genelabel)) { 4791 genelabel = NULL; 4792 } 4793 } 4794 sfp = SeqMgrGetNextFeature (bsp, NULL, SEQFEAT_CDREGION, 0, &ccontext); 4795 if (sfp != NULL) { 4796 cdslabel = ccontext.label; 4797 if (StringHasNoText (cdslabel)) { 4798 cdslabel = NULL; 4799 } 4800 } 4801 typ = RnaTypeLabel (rna); 4802 len = StringLen (organism) + StringLen (genelabel) + StringLen (cdslabel) + 4803 StringLen (" mRNA, complete cds.") + StringLen (typ) + 10; 4804 str = (CharPtr) MemNew (len * sizeof (Char)); 4805 if (str == NULL) return; 4806 str [0] = '\0'; 4807 4808 if (StringDoesHaveText (organism)) { 4809 StringCat (str, organism); 4810 } 4811 if (cdslabel != NULL) { 4812 StringCat (str, " "); 4813 StringCat (str, cdslabel); 4814 } 4815 if (genelabel != NULL) { 4816 StringCat (str, " ("); 4817 StringCat (str, genelabel); 4818 StringCat (str, ")"); 4819 } 4820 if (cdslabel != NULL && genelabel != NULL) { 4821 StringCat (str, " "); 4822 StringCat (str, typ); 4823 if (ccontext.partialL || 
ccontext.partialR) { 4824 StringCat (str, ", partial cds."); 4825 } else { 4826 StringCat (str, ", complete cds."); 4827 } 4828 } else if (genelabel != NULL) { 4829 StringCat (str, " "); 4830 StringCat (str, typ); 4831 StringCat (str, "."); 4832 } 4833 SeqDescrAddPointer (&(bsp->descr), Seq_descr_title, (Pointer) str); 4834 } 4835 4836 static void MakeOneRnaTitle ( 4837 SeqFeatPtr rna, 4838 SeqFeatPtr gene, 4839 CharPtr label, 4840 CharPtr organism, 4841 Boolean alt_splice 4842 ) 4843 4844 { 4845 BioseqPtr bsp; 4846 SeqMgrFeatContext ccontext; 4847 SeqFeatPtr cds; 4848 GeneRefPtr grp; 4849 Char id [64]; 4850 CharPtr lbl = NULL; 4851 size_t len; 4852 CharPtr ptr; 4853 CharPtr str; 4854 CharPtr typ = NULL; 4855 4856 if (rna == NULL || rna->product == NULL) return; 4857 4858 grp = SeqMgrGetGeneXref (rna); 4859 if (SeqMgrGeneIsSuppressed (grp)) return; 4860 if (grp == NULL && gene != NULL) { 4861 grp = (GeneRefPtr) gene->data.value.ptrvalue; 4862 } 4863 if (grp == NULL) return; 4864 4865 bsp = BioseqFindFromSeqLoc (rna->product); 4866 if (bsp == NULL) return; 4867 SeqIdWrite (bsp->id, id, PRINTID_TEXTID_ACC_VER, sizeof (id) - 1); 4868 4869 typ = RnaTypeLabel (rna); 4870 lbl = StringSaveNoNull (label); 4871 4872 cds = SeqMgrGetNextFeature (bsp, NULL, SEQFEAT_CDREGION, 0, &ccontext); 4873 4874 len = StringLen (organism) + StringLen (grp->locus_tag) + StringLen (grp->locus) + 4875 StringLen (id) + StringLen (" transcript variant") + StringLen (lbl) + 4876 StringLen (" mRNA, complete cds.") + StringLen (typ) + 20; 4877 str = (CharPtr) MemNew (len * sizeof (Char)); 4878 if (str == NULL) return; 4879 str [0] = '\0'; 4880 4881 if (StringDoesHaveText (organism)) { 4882 StringCat (str, organism); 4883 } 4884 if (lbl != NULL) { 4885 StringCat (str, " "); 4886 ptr = StringStr (lbl, ", transcript variant "); 4887 if (ptr != NULL) { 4888 *ptr = '\0'; 4889 ptr += 2; 4890 StringCat (str, lbl); 4891 if (StringDoesHaveText (grp->locus)) { 4892 StringCat (str, " ("); 4893 StringCat 
(str, grp->locus); 4894 StringCat (str, ")"); 4895 } 4896 StringCat (str, ", "); 4897 StringCat (str, ptr); 4898 } else { 4899 StringCat (str, lbl); 4900 if (StringDoesHaveText (grp->locus)) { 4901 StringCat (str, " ("); 4902 StringCat (str, grp->locus); 4903 StringCat (str, ")"); 4904 } 4905 } 4906 } 4907 4908 StringCat (str, ", "); 4909 StringCat (str, typ); 4910 StringCat (str, "."); 4911 4912 SeqDescrAddPointer (&(bsp->descr), Seq_descr_title, (Pointer) str); 4913 MemFree (lbl); 4914 } 4915 4916 static void MakeSmartRnaTitles ( 4917 BioseqPtr bsp, 4918 CharPtr organism 4919 ) 4920 4921 { 4922 SeqMgrFeatContext context; 4923 GmcDataPtr gdp, head; 4924 GeneRefPtr grp; 4925 Int2 i, j, k, numgene, numrna; 4926 SeqFeatPtr sfp; 4927 4928 if (bsp == NULL) return; 4929 4930 numgene = 0; 4931 numrna = 0; 4932 4933 sfp = SeqMgrGetNextFeature (bsp, NULL, 0, 0, &context); 4934 while (sfp != NULL) { 4935 switch (sfp->data.choice) { 4936 case SEQFEAT_GENE : 4937 numgene++; 4938 break; 4939 case SEQFEAT_RNA : 4940 numrna++; 4941 break; 4942 default : 4943 break; 4944 } 4945 sfp = SeqMgrGetNextFeature (bsp, sfp, 0, 0, &context); 4946 } 4947 4948 /* if (numgene == 0) return; */ 4949 4950 if (numrna > 0) { 4951 head = (GmcDataPtr) MemNew (sizeof (GmcData) * (size_t) (numrna + 1)); 4952 if (head != NULL) { 4953 gdp = head; 4954 sfp = SeqMgrGetNextFeature (bsp, NULL, SEQFEAT_RNA, 0, &context); 4955 while (sfp != NULL) { 4956 if (sfp->product != NULL) { 4957 gdp->feat = sfp; 4958 gdp->label = context.label; 4959 grp = SeqMgrGetGeneXref (sfp); 4960 if (grp == NULL || (! 
SeqMgrGeneIsSuppressed (grp))) { 4961 gdp->gene = SeqMgrGetOverlappingGene (sfp->location, NULL); 4962 } 4963 gdp++; 4964 } 4965 sfp = SeqMgrGetNextFeature (bsp, sfp, SEQFEAT_RNA, 0, &context); 4966 } 4967 HeapSort (head, (size_t) numrna, sizeof (GmcData), SortByGenePtr); 4968 for (i = 0; i < numrna; i += j) { 4969 sfp = head [i].gene; 4970 for (j = 1; i + j < numrna && sfp == head [i + j].gene; j++) continue; 4971 if (j == 1) { 4972 /* no alt splicing */ 4973 MakeOneRnaTitle (head [i].feat, head [i].gene, head [i].label, organism, FALSE); 4974 } else { 4975 /* is alt splicing */ 4976 for (k = 0; k < j; k++) { 4977 MakeOneRnaTitle (head [i + k].feat, head [i + k].gene, head [i + k].label, organism, TRUE); 4978 } 4979 } 4980 } 4981 } 4982 MemFree (head); 4983 } 4984 } 4985 4986 typedef struct gosearch { 4987 TextFsaPtr gotags; 4988 Boolean isbad; 4989 } GoSearch, PNTR GoSearchPtr; 4990 4991 static void LookForGo ( 4992 SeqFeatPtr sfp, 4993 Pointer userdata 4994 ) 4995 4996 { 4997 Char ch; 4998 GoSearchPtr gsp; 4999 CharPtr ptr; 5000 Int4 state; 5001 ValNodePtr matches; 5002 5003 if (sfp == NULL || StringHasNoText (sfp->comment)) return; 5004 gsp = (GoSearchPtr) userdata; 5005 5006 state = 0; 5007 ptr = sfp->comment; 5008 ch = *ptr; 5009 while (ch != '\0') { 5010 matches = NULL; 5011 state = TextFsaNext (gsp->gotags, state, ch, &matches); 5012 if (matches != NULL) { 5013 gsp->isbad = TRUE; 5014 } 5015 ptr++; 5016 ch = *ptr; 5017 } 5018 } 5019 5020 static Boolean HasGoTermsInNote ( 5021 SeqEntryPtr sep, 5022 TextFsaPtr gotags 5023 ) 5024 5025 { 5026 GoSearch gs; 5027 5028 gs.gotags = gotags; 5029 gs.isbad = FALSE; 5030 VisitFeaturesInSep (sep, (Pointer) &gs, LookForGo); 5031 return gs.isbad; 5032 } 5033 5034 static void TakeProteinsFromGPS ( 5035 BioseqPtr bsp, 5036 Pointer userdata 5037 ) 5038 5039 { 5040 SeqEntryPtr PNTR lastp; 5041 SeqEntryPtr sep; 5042 5043 if (bsp == NULL || (! 
ISA_aa (bsp->mol))) return; 5044 lastp = (SeqEntryPtr PNTR) userdata; 5045 if (lastp == NULL) return; 5046 5047 /* link copy after genomic sequence */ 5048 5049 bsp = (BioseqPtr) AsnIoMemCopy ((Pointer) bsp, 5050 (AsnReadFunc) BioseqAsnRead, 5051 (AsnWriteFunc) BioseqAsnWrite); 5052 sep = ValNodeAddPointer (lastp, 1, (Pointer) bsp); 5053 *lastp = sep; 5054 } 5055 5056 static void GPStoNPS ( 5057 SeqEntryPtr top, 5058 Uint2 entityID 5059 ) 5060 5061 { 5062 BioseqSetPtr bssp; 5063 BioseqSetPtr dum; 5064 SeqEntryPtr last, sep; 5065 Uint2 parenttype; 5066 Pointer parentptr; 5067 5068 if (top == NULL || top->choice != 2) { 5069 Message (MSG_POSTERR, "GPStoNPS failed at top || top->choice"); 5070 return; 5071 } 5072 bssp = (BioseqSetPtr) top->data.ptrvalue; 5073 if (bssp == NULL || bssp->_class != BioseqseqSet_class_gen_prod_set) { 5074 Message (MSG_POSTERR, "GPStoNPS failed at bssp || bssp->_class"); 5075 return; 5076 } 5077 5078 GetSeqEntryParent (top, &parentptr, &parenttype); 5079 5080 /* point to genomic Bioseq component of gps */ 5081 5082 sep = bssp->seq_set; 5083 if (sep == NULL || sep->choice != 1) { 5084 Message (MSG_POSTERR, "GPStoNPS failed at sep || sep->choice"); 5085 return; 5086 } 5087 5088 /* unlink nuc-prot sets, etc., from genomic Bioseq */ 5089 5090 dum = BioseqSetNew (); 5091 if (dum == NULL) { 5092 Message (MSG_POSTERR, "GPStoNPS failed at BioseqSetNew"); 5093 return; 5094 } 5095 dum->_class = 1; 5096 dum->seq_set = sep->next; 5097 sep->next = NULL; 5098 5099 last = sep; 5100 VisitBioseqsInSet (dum, (Pointer) &last, TakeProteinsFromGPS); 5101 5102 bssp->_class = BioseqseqSet_class_nuc_prot; 5103 5104 SeqMgrLinkSeqEntry (top, parenttype, parentptr); 5105 5106 SeqMgrClearFeatureIndexes (bssp->idx.entityID, NULL); 5107 5108 VisitFeaturesInSet (bssp, NULL, ClearRnaProducts); 5109 5110 move_cds (top); 5111 5112 /* in case result has no proteins, demote to bioseq */ 5113 5114 RenormalizeNucProtSets (top, TRUE); 5115 5116 /* cleanup original nuc-prot sets 
*/ 5117 5118 BioseqSetFree (dum); 5119 } 5120 5121 static void GeneralToNote ( 5122 SeqFeatPtr sfp, 5123 Pointer userdata 5124 ) 5125 5126 { 5127 BioseqPtr bsp; 5128 Char buf [41]; 5129 DbtagPtr dbt; 5130 size_t len; 5131 SeqIdPtr sip; 5132 CharPtr str; 5133 5134 if (sfp == NULL || sfp->product == NULL) return; 5135 if (sfp->data.choice != SEQFEAT_RNA) return; 5136 5137 bsp = BioseqFindFromSeqLoc (sfp->product); 5138 if (bsp == NULL) return; 5139 5140 for (sip = bsp->id; sip != NULL; sip = sip->next) { 5141 if (sip->choice != SEQID_GENERAL) continue; 5142 dbt = (DbtagPtr) sip->data.ptrvalue; 5143 if (dbt == NULL) continue; 5144 if (StringICmp (dbt->db, "TMSMART") == 0 || StringICmp (dbt->db, "NCBIFILE") == 0) continue; 5145 5146 SeqIdWrite (sip, buf, PRINTID_REPORT, sizeof (buf) - 1); 5147 5148 if (sfp->comment == NULL) { 5149 sfp->comment = StringSave (buf); 5150 } else { 5151 len = StringLen (sfp->comment) + StringLen (buf) + 5; 5152 str = MemNew (sizeof (Char) * len); 5153 StringCpy (str, sfp->comment); 5154 StringCat (str, "; "); 5155 StringCat (str, buf); 5156 sfp->comment = MemFree (sfp->comment); 5157 sfp->comment = str; 5158 } 5159 } 5160 } 5161 5162 static SeqEntryPtr PropagateDescsFromGenBankSet ( 5163 SeqEntryPtr sep 5164 ) 5165 5166 { 5167 BioseqPtr bsp; 5168 BioseqSetPtr bssp; 5169 SeqEntryPtr firstsep = NULL; 5170 SeqEntryPtr seqentry; 5171 ValNodePtr sourcedescr; 5172 5173 if (sep == NULL) return NULL; 5174 if (! 
IS_Bioseq_set (sep)) return sep; 5175 bssp = (BioseqSetPtr) sep->data.ptrvalue; 5176 if (bssp == NULL) return sep; 5177 sourcedescr = bssp->descr; 5178 if (sourcedescr == NULL) return sep; 5179 firstsep = bssp->seq_set; 5180 seqentry = firstsep; 5181 while (seqentry != NULL) { 5182 if (seqentry->data.ptrvalue != NULL) { 5183 if (seqentry->choice == 1) { 5184 bsp = (BioseqPtr) seqentry->data.ptrvalue; 5185 ValNodeLink (&(bsp->descr), 5186 AsnIoMemCopy ((Pointer) sourcedescr, 5187 (AsnReadFunc) SeqDescrAsnRead, 5188 (AsnWriteFunc) SeqDescrAsnWrite)); 5189 } else if (seqentry->choice == 2) { 5190 bssp = (BioseqSetPtr) seqentry->data.ptrvalue; 5191 ValNodeLink (&(bssp->descr), 5192 AsnIoMemCopy ((Pointer) sourcedescr, 5193 (AsnReadFunc) SeqDescrAsnRead, 5194 (AsnWriteFunc) SeqDescrAsnWrite)); 5195 } 5196 } 5197 seqentry = seqentry->next; 5198 } 5199 bssp = (BioseqSetPtr) sep->data.ptrvalue; 5200 bssp->descr = SeqDescrFree (bssp->descr); 5201 NormalizeDescriptorOrder (sep); 5202 return firstsep; 5203 } 5204 5205 typedef struct srcdata { 5206 Boolean isSeqId; 5207 Boolean isOrganism; 5208 Uint1 orgmodType; 5209 Uint1 subsourceType; 5210 } SrcData, PNTR SrcDataPtr; 5211 5212 static void ParseOneOrgLabel ( 5213 SrcDataPtr field, 5214 CharPtr label 5215 ) 5216 5217 { 5218 Int2 i; 5219 5220 if (field == NULL || StringHasNoText (label)) return; 5221 5222 if (StringICmp (label, "local_id") == 0 || 5223 StringICmp (label, "local id") == 0 || 5224 StringICmp (label, "SequenceID") == 0 || 5225 StringICmp (label, "Sequence_ID") == 0 || 5226 StringICmp (label, "Sequence ID") == 0 || 5227 StringICmp (label, "SeqID") == 0 || 5228 StringICmp (label, "Seq_ID") == 0 || 5229 StringICmp (label, "Seq ID") == 0) { 5230 field->isSeqId = TRUE; 5231 return; 5232 } 5233 if (StringICmp (label, "organism") == 0) { 5234 field->isOrganism = TRUE; 5235 return; 5236 } 5237 5238 i = EquivalentOrgMod (label); 5239 if (i != 0) { 5240 field->orgmodType = (Uint1) i; 5241 return; 5242 } 5243 i = 
EquivalentSubSource (label); 5244 if (i != 0) { 5245 field->subsourceType = (Uint1) i; 5246 return; 5247 } 5248 if (StringICmp (label, "note") == 0) { 5249 field->subsourceType = (Uint1) SUBSRC_other; 5250 } 5251 } 5252 5253 static void ProcessSourceTable ( 5254 FILE *fp 5255 ) 5256 5257 { 5258 BioSourcePtr biop; 5259 BioseqPtr bsp; 5260 CharPtr columns [80]; 5261 FileCache fc; 5262 SrcData fields [80]; 5263 Int2 i, numfields; 5264 Char line [4095]; 5265 OrgModPtr omp; 5266 OrgNamePtr onp; 5267 OrgRefPtr orp; 5268 CharPtr ptr, str; 5269 SeqDescrPtr sdp; 5270 SeqIdPtr sip; 5271 SubSourcePtr ssp; 5272 5273 if (fp == NULL) return; 5274 5275 MemSet ((Pointer) fields, 0, sizeof (fields)); 5276 numfields = 0; 5277 5278 FileCacheSetup (&fc, fp); 5279 5280 /* read first line with field names */ 5281 5282 str = FileCacheReadLine (&fc, line, sizeof (line), NULL); 5283 if (str == NULL) return; 5284 5285 TrimSpacesAroundString (str); 5286 while (StringDoesHaveText (str) && numfields < 78) { 5287 ptr = StringChr (str, '\t'); 5288 if (ptr != NULL) { 5289 *ptr = '\0'; 5290 ptr++; 5291 } 5292 TrimSpacesAroundString (str); 5293 ParseOneOrgLabel (&(fields [numfields]), str); 5294 numfields++; 5295 str = ptr; 5296 } 5297 5298 if (! 
fields [0].isSeqId) return; 5299 5300 /* read remaining lines with source data */ 5301 5302 str = FileCacheReadLine (&fc, line, sizeof (line), NULL); 5303 while (str != NULL) { 5304 5305 MemSet ((Pointer) columns, 0, sizeof (columns)); 5306 5307 TrimSpacesAroundString (str); 5308 i = 0; 5309 while (StringDoesHaveText (str) && i < numfields) { 5310 ptr = StringChr (str, '\t'); 5311 if (ptr != NULL) { 5312 *ptr = '\0'; 5313 ptr++; 5314 } 5315 TrimSpacesAroundString (str); 5316 columns [i] = str; 5317 i++; 5318 str = ptr; 5319 } 5320 5321 if (StringDoesHaveText (columns [0])) { 5322 sip = MakeSeqID (columns [0]); 5323 if (sip != NULL) { 5324 bsp = BioseqFind (sip); 5325 if (bsp != NULL) { 5326 biop = NULL; 5327 sdp = GetNextDescriptorUnindexed (bsp, Seq_descr_source, NULL); 5328 if (sdp != NULL) { 5329 biop = (BioSourcePtr) sdp->data.ptrvalue; 5330 } 5331 if (biop == NULL) { 5332 biop = BioSourceNew (); 5333 if (biop != NULL) { 5334 SeqDescrAddPointer (&(bsp->descr), Seq_descr_source, (Pointer) biop); 5335 } 5336 } 5337 if (biop != NULL) { 5338 for (i = 1; i < numfields; i++) { 5339 if (StringHasNoText (columns [i])) continue; 5340 if (fields [i].isOrganism) { 5341 if (biop->org == NULL) { 5342 biop->org = OrgRefNew (); 5343 } 5344 orp = biop->org; 5345 if (orp != NULL) { 5346 orp->taxname = MemFree (orp->taxname); 5347 orp->taxname = StringSave (columns [i]); 5348 } 5349 } else if (fields [i].orgmodType > 0) { 5350 if (biop->org == NULL) { 5351 biop->org = OrgRefNew (); 5352 } 5353 orp = biop->org; 5354 if (orp != NULL) { 5355 if (orp->orgname == NULL) { 5356 orp->orgname = OrgNameNew (); 5357 } 5358 onp = orp->orgname; 5359 if (onp != NULL) { 5360 omp = OrgModNew (); 5361 if (omp != NULL) { 5362 omp->subtype = (Uint1) fields [i].orgmodType; 5363 omp->subname = StringSave (columns [i]); 5364 omp->next = onp->mod; 5365 onp->mod = omp; 5366 } 5367 } 5368 } 5369 } else if (fields [i].subsourceType > 0) { 5370 ssp = SubSourceNew (); 5371 if (ssp != NULL) { 5372 
ssp->subtype = (Uint1) fields [i].subsourceType; 5373 ssp->name = StringSave (columns [i]); 5374 ssp->next = biop->subtype; 5375 biop->subtype = ssp; 5376 } 5377 } 5378 } 5379 } 5380 } 5381 sip = SeqIdFree (sip); 5382 } 5383 } 5384 5385 str = FileCacheReadLine (&fc, line, sizeof (line), NULL); 5386 } 5387 } 5388 5389 static SeqDescrPtr GetDescriptorTypeAlreadyInList ( 5390 Uint1 descr_choice, 5391 SeqDescrPtr list 5392 ) 5393 5394 { 5395 while (list != NULL && list->choice != descr_choice) { 5396 list = list->next; 5397 } 5398 return list; 5399 } 5400 5401 static void AddTemplateDescriptors ( 5402 SeqDescrPtr PNTR current_list, 5403 SeqDescrPtr new_list, 5404 Boolean copy 5405 ) 5406 5407 { 5408 SeqDescrPtr dsc, sdp_next, sdp; 5409 5410 if (current_list == NULL || new_list == NULL) return; 5411 5412 for (sdp = new_list; sdp != NULL; sdp = sdp_next) { 5413 sdp_next = sdp->next; 5414 if (sdp->choice == Seq_descr_molinfo) continue; 5415 if (sdp->choice == Seq_descr_source && 5416 GetDescriptorTypeAlreadyInList (Seq_descr_source, *current_list) != NULL) continue; 5417 sdp->next = NULL; 5418 if (copy) { 5419 dsc = AsnIoMemCopy ((Pointer) sdp, 5420 (AsnReadFunc) SeqDescrAsnRead, 5421 (AsnWriteFunc) SeqDescrAsnWrite); 5422 } else { 5423 dsc = sdp; 5424 } 5425 ValNodeLink (current_list, (Pointer) dsc); 5426 sdp->next = sdp_next; 5427 } 5428 } 5429 5430 static void GenomizeSeqId ( 5431 SeqIdPtr sip, 5432 Pointer userdata 5433 ) 5434 5435 { 5436 CharPtr accn = NULL; 5437 CharPtr center; 5438 DbtagPtr dbt; 5439 ObjectIdPtr oip; 5440 5441 if (sip == NULL || sip->choice != SEQID_LOCAL) return; 5442 center = (CharPtr) userdata; 5443 if (StringHasNoText (center)) return; 5444 5445 oip = (ObjectIdPtr) sip->data.ptrvalue; 5446 if (oip == NULL) return; 5447 accn = oip->str; 5448 if (StringHasNoText (accn)) return; 5449 5450 dbt = DbtagNew (); 5451 if (dbt == NULL) return; 5452 oip = ObjectIdNew (); 5453 if (oip == NULL) return; 5454 oip->str = StringSave (accn); 5455 dbt->db = 
StringSave (center); 5456 dbt->tag = oip; 5457 5458 sip->data.ptrvalue = ObjectIdFree ((ObjectIdPtr) sip->data.ptrvalue); 5459 sip->data.ptrvalue = (Pointer) dbt; 5460 sip->choice = SEQID_GENERAL; 5461 } 5462 5463 static void GenomizeFeatureSeqIds ( 5464 SeqFeatPtr sfp, 5465 Pointer userdata 5466 ) 5467 5468 { 5469 VisitSeqIdsInSeqLoc (sfp->location, userdata, GenomizeSeqId); 5470 } 5471 5472 static void GenomizeGraphSeqIds ( 5473 SeqGraphPtr sgp, 5474 Pointer userdata 5475 ) 5476 5477 { 5478 VisitSeqIdsInSeqGraph (sgp, userdata, GenomizeSeqId); 5479 } 5480 5481 static void MakeGenomeCenterID ( 5482 BioseqPtr bsp, 5483 Pointer userdata 5484 ) 5485 5486 { 5487 CharPtr center; 5488 5489 if (bsp == NULL) return; 5490 center = (CharPtr) userdata; 5491 if (StringHasNoText (center)) return; 5492 5493 VisitSeqIdsInBioseq (bsp, userdata, GenomizeSeqId); 5494 SeqMgrReplaceInBioseqIndex (bsp); 5495 VisitFeaturesOnBsp (bsp, userdata, GenomizeFeatureSeqIds); 5496 VisitGraphsOnBsp (bsp, userdata, GenomizeGraphSeqIds); 5497 } 5498 5499 static void MakeAccessionID ( 5500 BioseqPtr bsp, 5501 Pointer userdata 5502 ) 5503 5504 { 5505 CharPtr accn; 5506 ValNodePtr generalIDs; 5507 SeqIdPtr sip; 5508 5509 if (bsp == NULL) return; 5510 if (! 
ISA_na (bsp->mol)) return; 5511 accn = (CharPtr) userdata; 5512 if (StringHasNoText (accn)) return; 5513 5514 /* if existing accession, coerce all SeqIds */ 5515 5516 sip = SeqIdFromAccession (accn, INT2_MIN, NULL); 5517 if (sip == NULL) return; 5518 generalIDs = ValNodeExtractList (&(bsp->id), SEQID_GENERAL); 5519 bsp->id = SeqIdSetFree (bsp->id); 5520 bsp->id = sip; 5521 if (generalIDs != NULL) { 5522 ValNodeLink (&(bsp->id), generalIDs); 5523 } 5524 SeqMgrReplaceInBioseqIndex (bsp); 5525 VisitFeaturesOnBsp (bsp, (Pointer) bsp->id, CorrectFeatureSeqIds); 5526 VisitGraphsOnBsp (bsp, (Pointer) bsp->id, CorrectGraphSeqIds); 5527 } 5528 5529 static void FindCreateDate ( 5530 SeqDescrPtr sdp, 5531 Pointer userdata 5532 ) 5533 5534 { 5535 BoolPtr has_create_dateP; 5536 5537 if (sdp == NULL || sdp->choice != Seq_descr_create_date || userdata == NULL) return; 5538 has_create_dateP = (BoolPtr) userdata; 5539 *has_create_dateP = TRUE; 5540 } 5541 5542 static void ConvertStructuredComment ( 5543 SeqDescrPtr sdp, 5544 Pointer userdata 5545 ) 5546 5547 { 5548 SeqDescrPtr com; 5549 CharPtr prefix = NULL; 5550 CharPtr str; 5551 UserObjectPtr uop = NULL; 5552 5553 if (sdp == NULL || sdp->choice != Seq_descr_comment) return; 5554 str = (CharPtr) sdp->data.ptrvalue; 5555 if (StringHasNoText (str)) return; 5556 5557 if (StringStr (str, "##HIVData-START##") != NULL && 5558 StringStr (str, "##HIVData-END##") != NULL) { 5559 prefix = StringStr (str, "##HIVData-START##"); 5560 uop = ParseStringIntoStructuredComment (NULL, str, "##HIVData-START##", 5561 "##HIVData-END##"); 5562 } else if (StringStr (str, "##FluData-START##") != NULL && 5563 StringStr (str, "##FluData-END##") != NULL) { 5564 prefix = StringStr (str, "##FluData-START##"); 5565 uop = ParseStringIntoStructuredComment (NULL, str, "##FluData-START##", 5566 "##FluData-END##"); 5567 } 5568 if (uop == NULL) return; 5569 5570 /* if there is text before prefix, truncate existing comment and append user object */ 5571 5572 if 
(prefix != NULL) { 5573 *prefix = '\0'; 5574 TrimSpacesAroundString (str); 5575 if (StringDoesHaveText (str)) { 5576 com = SeqDescrNew (NULL); 5577 if (com != NULL) { 5578 com->choice = Seq_descr_user; 5579 com->data.ptrvalue = uop; 5580 com->next = sdp->next; 5581 sdp->next = com; 5582 return; 5583 } 5584 } 5585 } 5586 5587 /* if entire comment was structured, replace existing descriptor with user object */ 5588 5589 MemFree (sdp->data.ptrvalue); 5590 sdp->choice = Seq_descr_user; 5591 sdp->data.ptrvalue = uop; 5592 } 5593 5594 static void CleanUpLatLonAndCountry ( 5595 BioSourcePtr biop, 5596 Pointer userdata 5597 ) 5598 5599 { 5600 CharPtr fix_lat_lon; 5601 Boolean format_ok = FALSE; 5602 CharPtr lat_lon = NULL; 5603 Boolean lat_in_range = FALSE; 5604 Boolean lon_in_range = FALSE; 5605 CharPtr PNTR list; 5606 CharPtr new_country; 5607 SubSourcePtr ssp; 5608 5609 if (biop == NULL) return; 5610 list = (CharPtr PNTR) userdata; 5611 if (list == NULL) return; 5612 5613 for (ssp = biop->subtype; ssp != NULL; ssp = ssp->next) { 5614 if (ssp->subtype == SUBSRC_country && StringDoesHaveText (ssp->name)) { 5615 new_country = GetCountryFix (ssp->name, list); 5616 if (new_country != NULL) { 5617 ssp->name = MemFree (ssp->name); 5618 ssp->name = new_country; 5619 } 5620 } else if (ssp->subtype == SUBSRC_lat_lon && StringDoesHaveText (ssp->name)) { 5621 lat_lon = ssp->name; 5622 IsCorrectLatLonFormat (lat_lon, &format_ok, &lat_in_range, &lon_in_range); 5623 if (! 
format_ok) { 5624 fix_lat_lon = FixLatLonFormat (lat_lon); 5625 if (fix_lat_lon != NULL) { 5626 ssp->name = MemFree (ssp->name); 5627 ssp->name = fix_lat_lon; 5628 } 5629 } 5630 } 5631 } 5632 } 5633 5634 static void LookupPubdesc ( 5635 PubdescPtr pdp, 5636 Pointer userdata 5637 ) 5638 5639 { 5640 CitArtPtr cap; 5641 MedlineEntryPtr mep; 5642 PubmedEntryPtr pep; 5643 Int4 pmid = 0; 5644 ValNodePtr vnp; 5645 5646 if (pdp == NULL) return; 5647 5648 for (vnp = pdp->pub; vnp != NULL; vnp = vnp->next) { 5649 switch (vnp->choice) { 5650 case PUB_Muid : 5651 /* ignore obsolete muids */ 5652 break; 5653 case PUB_PMid : 5654 pmid = vnp->data.intvalue; 5655 break; 5656 default : 5657 /* return on real pub */ 5658 return; 5659 break; 5660 } 5661 } 5662 5663 if (pmid == 0) return; 5664 5665 pep = GetPubMedForUid (pmid); 5666 if (pep == NULL) return; 5667 mep = (MedlineEntryPtr) pep->medent; 5668 if (mep != NULL && mep->cit != NULL) { 5669 cap = AsnIoMemCopy ((Pointer) mep->cit, 5670 (AsnReadFunc) CitArtAsnRead, 5671 (AsnWriteFunc) CitArtAsnWrite); 5672 ValNodeAddPointer (&(pdp->pub), PUB_Article, (Pointer) cap); 5673 } 5674 5675 PubmedEntryFree (pep); 5676 } 5677 5678 5679 5680 #ifdef INTERNAL_NCBI_ASNDISC 5681 const PerformDiscrepancyTest taxlookup = CheckTaxNamesAgainstTaxDatabase; 5682 #else 5683 const PerformDiscrepancyTest taxlookup = NULL; 5684 #endif 5685 5686 5687 static void CleanupCollectionDatesMonthFirst (BioSourcePtr biop, Pointer data) 5688 { 5689 SubSourcePtr ssp; 5690 CharPtr reformatted_date = NULL; 5691 5692 if (biop == NULL) return; 5693 5694 ssp = biop->subtype; 5695 while (ssp != NULL) 5696 { 5697 if (ssp->subtype == SUBSRC_collection_date) 5698 { 5699 reformatted_date = ReformatDateStringEx (ssp->name, TRUE, NULL); 5700 if (reformatted_date != NULL) 5701 { 5702 ssp->name = MemFree (ssp->name); 5703 ssp->name = reformatted_date; 5704 } 5705 } 5706 ssp = ssp->next; 5707 } 5708 } 5709 5710 5711 static void CleanupCollectionDatesDayFirst (BioSourcePtr biop, 
Pointer data) 5712 { 5713 SubSourcePtr ssp; 5714 CharPtr reformatted_date = NULL; 5715 5716 if (biop == NULL) return; 5717 5718 ssp = biop->subtype; 5719 while (ssp != NULL) 5720 { 5721 if (ssp->subtype == SUBSRC_collection_date) 5722 { 5723 reformatted_date = ReformatDateStringEx (ssp->name, FALSE, NULL); 5724 if (reformatted_date != NULL) 5725 { 5726 ssp->name = MemFree (ssp->name); 5727 ssp->name = reformatted_date; 5728 } 5729 } 5730 ssp = ssp->next; 5731 } 5732 } 5733 5734 5735 static void ValNodeLinkCopy (ValNodePtr PNTR list1, ValNodePtr list2) 5736 { 5737 if (list1 == NULL) return; 5738 while (list2 != NULL) 5739 { 5740 ValNodeAddPointer (list1, list2->choice, list2->data.ptrvalue); 5741 list2 = list2->next; 5742 } 5743 } 5744 5745 static ValNodePtr FindItemListForClickableItemCategory (ValNodePtr list, CharPtr category_fmt) 5746 { 5747 ClickableItemPtr cip; 5748 ValNodePtr vnp; 5749 ValNodePtr item_list = NULL; 5750 CharPtr cp; 5751 5752 if (StringLen (category_fmt) < 2) { 5753 return NULL; 5754 } 5755 for (vnp = list; vnp != NULL; vnp = vnp->next) { 5756 cip = (ClickableItemPtr) vnp->data.ptrvalue; 5757 if (cip != NULL) { 5758 if (cip->description != NULL) { 5759 /* skip number at beginning of category title */ 5760 cp = cip->description; 5761 while (isdigit (*cp)) { 5762 cp++; 5763 } 5764 if (StringCmp (cp, category_fmt + 2) == 0) { 5765 ValNodeLinkCopy (&item_list, cip->item_list); 5766 } 5767 } 5768 ValNodeLink (&item_list, FindItemListForClickableItemCategory (cip->subcategories, category_fmt)); 5769 } 5770 } 5771 return item_list; 5772 } 5773 5774 5775 static void DoTbl2AsnCleanup (SeqEntryPtr sep, CleanupArgsPtr c) 5776 { 5777 ValNodePtr sep_list = NULL; 5778 ValNodePtr discrepancy_list = NULL, item_list = NULL, vnp; 5779 SeqFeatPtr sfp; 5780 5781 if (sep == NULL || c == NULL) { 5782 return; 5783 } 5784 5785 if (c->collection_dates) { 5786 if (c->collection_dates_month_first) { 5787 VisitBioSourcesInSep (sep, NULL, CleanupCollectionDatesMonthFirst); 
5788 } else { 5789 VisitBioSourcesInSep (sep, NULL, CleanupCollectionDatesDayFirst); 5790 } 5791 } 5792 if (c->add_notes_to_overlapping_cds_without_abc) { 5793 ValNodeAddPointer (&sep_list, 0, sep); 5794 SeqMgrIndexFeatures (ObjMgrGetEntityIDForChoice (sep), NULL); 5795 AddOverlappingCodingRegionDiscrepancies (&discrepancy_list, sep_list); 5796 sep_list = ValNodeFree (sep_list); 5797 item_list = FindItemListForClickableItemCategory (discrepancy_list, kOverlappingCDSNeedsNoteFmt); 5798 discrepancy_list = FreeClickableList (discrepancy_list); 5799 for (vnp = item_list; vnp != NULL; vnp = vnp->next) { 5800 if (vnp->choice == OBJ_SEQFEAT) { 5801 sfp = (SeqFeatPtr) vnp->data.ptrvalue; 5802 if (sfp != NULL) { 5803 SetStringValue (&(sfp->comment), kOverlappingCDSNoteText, ExistingTextOption_append_semi); 5804 } 5805 } 5806 } 5807 item_list = ValNodeFree (item_list); 5808 } 5809 } 5810 5811 5812 static void SeqEntryHasConflictingIDsCallback (BioseqPtr bsp, Pointer data) 5813 { 5814 CharPtr msg, fmt = "SeqID %s is present on multiple Bioseqs in record"; 5815 BioseqPtr bsp2; 5816 SeqIdPtr sip; 5817 DbtagPtr dbt; 5818 Char buf[100]; 5819 5820 if (bsp == NULL || data == NULL) { 5821 return; 5822 } 5823 5824 for (sip = bsp->id; sip != NULL; sip = sip->next) { 5825 if (sip->choice == SEQID_GENERAL 5826 && (dbt = (DbtagPtr) sip->data.ptrvalue) != NULL 5827 && StringICmp (dbt->db, "NCBIFILE") == 0) { 5828 continue; 5829 } 5830 bsp2 = BioseqFindSpecial (sip); 5831 if (bsp2 != NULL && bsp2 != bsp) { 5832 SeqIdWrite (sip, buf, PRINTID_FASTA_SHORT, sizeof (buf) - 1); 5833 msg = (CharPtr) MemNew (sizeof (Char) * (StringLen (fmt) + StringLen (buf))); 5834 sprintf (msg, fmt, buf); 5835 ValNodeAddPointer ((ValNodePtr PNTR) data, 0, msg); 5836 } 5837 } 5838 } 5839 5840 5841 static Boolean SeqEntryHasConflictingIDs (SeqEntryPtr sep) 5842 { 5843 ValNodePtr errs = NULL, vnp; 5844 5845 VisitBioseqsInSep (sep, &errs, SeqEntryHasConflictingIDsCallback); 5846 if (errs == NULL) { 5847 return 
FALSE; 5848 } else { 5849 ValNodeUnique (&errs, SortVnpByString, ValNodeFreeData); 5850 for (vnp = errs; vnp != NULL; vnp = vnp->next) { 5851 Message (MSG_POSTERR, vnp->data.ptrvalue); 5852 } 5853 errs = ValNodeFreeData (errs); 5854 return TRUE; 5855 } 5856 } 5857 5858 5859 static void ProcessOneRecord ( 5860 SubmitBlockPtr sbp, 5861 PubdescPtr pdp, 5862 BioSourcePtr src, 5863 CharPtr directory, 5864 CharPtr results, 5865 CharPtr base, 5866 CharPtr suffix, 5867 SeqDescrPtr sdphead, 5868 TblArgsPtr tbl, 5869 TextFsaPtr gotags, 5870 AsnIoPtr aip, 5871 CharPtr outfile 5872 ) 5873 5874 { 5875 AsnTypePtr atp_bssse; 5876 BioSourcePtr biop; 5877 BioseqPtr bsp; 5878 BioseqSetPtr bssp = NULL; 5879 Char buf [256]; 5880 SeqMgrFeatContext context; 5881 Pointer dataptr; 5882 Uint2 datatype, entityID; 5883 SeqDescrPtr descr; 5884 DatePtr dp; 5885 BioseqSetPtr dssp; 5886 Boolean failure = FALSE; 5887 FileCache fc; 5888 FILE *fp; 5889 Int2 genCode; 5890 Boolean goOn; 5891 SeqEntryPtr gsep = NULL; 5892 Boolean has_create_date; 5893 SeqGraphPtr lastsgp; 5894 Int4 linenum = 0; 5895 CharPtr PNTR list; 5896 CharPtr localname = NULL; 5897 MolInfoPtr mip; 5898 ErrSev msev; 5899 Boolean nonewline; 5900 BioseqPtr nucbsp; 5901 ObjMgrDataPtr omdp; 5902 CharPtr organism; 5903 OrgRefPtr orp; 5904 BioseqPtr protbsp; 5905 SeqEntryPtr protsep; 5906 CharPtr ptr; 5907 SeqAnnotPtr sap; 5908 SeqDescrPtr sdp; 5909 SeqEntryPtr sep; 5910 SeqFeatPtr sfp; 5911 CharPtr sfx = NULL; 5912 SeqGraphPtr sgp; 5913 SeqIdPtr sip; 5914 SeqSubmitPtr sub; 5915 SimpleSeqPtr ssp; 5916 CharPtr str; 5917 CharPtr tblfile = NULL; 5918 SeqEntryPtr tmp; 5919 MolInfoPtr template_molinfo = NULL; 5920 ValNodePtr cmt_errors, vnp; 5921 5922 fp = OpenOneFile (directory, base, suffix); 5923 if (fp == NULL) return; 5924 5925 if (tbl->logtoterminal) { 5926 Message (MSG_POSTERR, "File %s", base); 5927 } 5928 5929 /* if genomic product set, make parent set */ 5930 5931 if (tbl->genprodset) { 5932 bssp = BioseqSetNew (); 5933 if (bssp == 
NULL) return; 5934 bssp->_class = BioseqseqSet_class_gen_prod_set; 5935 5936 gsep = SeqEntryNew (); 5937 if (gsep == NULL) return; 5938 gsep->choice = 2; 5939 gsep->data.ptrvalue = (Pointer) bssp; 5940 SeqMgrSeqEntry (SM_BIOSEQSET, (Pointer) bssp, gsep); 5941 } 5942 5943 if (tbl->seqidfromfile) { 5944 localname = base; 5945 } 5946 5947 /* find MolInfo from template, if there is any */ 5948 sdp = sdphead; 5949 while (sdp != NULL && sdp->choice != Seq_descr_molinfo) { 5950 sdp = sdp->next; 5951 } 5952 if (sdp != NULL) { 5953 template_molinfo = (MolInfoPtr) sdp->data.ptrvalue; 5954 } 5955 5956 /* read one or more ASN.1 or FASTA sequence files */ 5957 5958 if (tbl->fastaset) { 5959 entityID = ProcessBulkSet (fp, src, tbl, template_molinfo); 5960 } else if (tbl->deltaset) { 5961 entityID = ProcessDeltaSet (fp, src, tbl, localname, gsep, template_molinfo); 5962 } else if (tbl->alignset) { 5963 entityID = ProcessAlignSet (fp, src, tbl, template_molinfo); 5964 } else if (tbl->gapped) { 5965 entityID = ProcessGappedSet (fp, src, tbl, gsep, template_molinfo); 5966 } else if (tbl->phrapace) { 5967 entityID = ProcessPhrapAce (fp, src, tbl, localname, gsep, template_molinfo, directory, base); 5968 } else if (tbl->raw2delt) { 5969 entityID = ProcessRaw2Delt (fp, src, tbl, localname, gsep, template_molinfo); 5970 } else { 5971 entityID = ProcessOneAsn (fp, src, tbl, localname, gsep, template_molinfo); 5972 } 5973 FileClose (fp); 5974 5975 if (entityID == 0) return; 5976 5977 sep = GetTopSeqEntryForEntityID (entityID); 5978 if (SeqEntryHasConflictingIDs (sep)) { 5979 return; 5980 } 5981 5982 if (tbl->dotaxlookup) { 5983 sep = GetTopSeqEntryForEntityID (entityID); 5984 if (sep != NULL) { 5985 5986 /* optionally do network taxonomy lookup - prior to instantiating mRNA and protein titles */ 5987 5988 Taxon3ReplaceOrgInSeqEntry (sep, FALSE); 5989 } 5990 } 5991 5992 if (tbl->dopublookup) { 5993 sep = GetTopSeqEntryForEntityID (entityID); 5994 if (sep != NULL) { 5995 5996 /* optionally 
do network publication lookup of just PMID references */ 5997 5998 VisitPubdescsInSep (sep, NULL, LookupPubdesc); 5999 } 6000 } 6001 6002 organism = NULL; 6003 if (tbl->genprodset) { 6004 descr = ExtractBioSourceAndPubs (bssp->seq_set); 6005 for (sdp = descr; sdp != NULL; sdp = sdp->next) { 6006 if (sdp->choice != Seq_descr_source) continue; 6007 biop = (BioSourcePtr) sdp->data.ptrvalue; 6008 if (biop == NULL) continue; 6009 orp = biop->org; 6010 if (orp == NULL) continue; 6011 if (StringDoesHaveText (orp->taxname)) { 6012 organism = orp->taxname; 6013 } 6014 } 6015 ReplaceBioSourceAndPubs (gsep, descr); 6016 } 6017 6018 /* read one or more feature tables from .tbl file */ 6019 6020 if (StringDoesHaveText (tbl->tableFile)) { 6021 fp = FileOpen (tbl->tableFile, "r"); 6022 tblfile = tbl->tableFile; 6023 } else { 6024 fp = OpenOneFile (directory, base, ".tbl"); 6025 tblfile = base; 6026 sfx = ".tbl"; 6027 } 6028 if (fp != NULL) { 6029 6030 /* indexing needed to find segmented bsp if location is on part */ 6031 6032 sep = GetTopSeqEntryForEntityID (entityID); 6033 6034 SeqMgrIndexFeatures (entityID, NULL); 6035 6036 while ((! 
failure) && (dataptr = ReadFeatureTableFile (fp, &datatype, NULL, &linenum, &failure)) != NULL) { 6037 if (datatype == OBJ_SEQANNOT) { 6038 6039 sap = (SeqAnnotPtr) dataptr; 6040 ProcessOneAnnot (sap, entityID, tbl); 6041 6042 } else { 6043 ObjMgrFree (datatype, dataptr); 6044 } 6045 } 6046 FileClose (fp); 6047 sep = GetTopSeqEntryForEntityID (entityID); 6048 6049 6050 if (failure) { 6051 if (StringHasNoText (tblfile)) { 6052 tblfile = "?"; 6053 } 6054 ptr = StringRChr (tblfile, DIRDELIMCHR); 6055 if (ptr != NULL) { 6056 ptr++; 6057 tblfile = ptr; 6058 } 6059 Message (MSG_POSTERR, "Bad feature table at line %ld of file %s%s", (long) linenum, tblfile, sfx); 6060 } 6061 } 6062 6063 /* if genomic product set, copy CDS into nucprot sets */ 6064 6065 if (tbl->genprodset) { 6066 /* need to reindex to get mRNA and CDS features from cDNA and protein */ 6067 SeqMgrIndexFeatures (entityID, NULL); 6068 VisitSetsInSet (bssp, (Pointer) tbl, MakeNucProtCDS); 6069 } 6070 6071 /* read source qualifiers for set of sequences from .src file */ 6072 6073 fp = OpenOneFile (directory, base, ".src"); 6074 if (fp != NULL) { 6075 6076 ProcessSourceTable (fp); 6077 6078 FileClose (fp); 6079 } 6080 6081 /* read structured comments from .cmt file */ 6082 fp = OpenOneFile (directory, base, ".cmt"); 6083 if (fp != NULL) { 6084 sep = GetTopSeqEntryForEntityID (entityID); 6085 cmt_errors = CreateStructuredCommentsFromFile (fp, sep); 6086 FileClose (fp); 6087 if (cmt_errors != NULL) { 6088 for (vnp = cmt_errors; vnp != NULL; vnp = vnp->next) { 6089 Message (MSG_POSTERR, "Error processing structured comment (.cmt) file: %s", vnp->data.ptrvalue); 6090 } 6091 cmt_errors = ValNodeFreeData (cmt_errors); 6092 } 6093 } 6094 6095 /* read one or more protein sequences from .pep file */ 6096 6097 fp = OpenOneFile (directory, base, ".pep"); 6098 if (fp != NULL) { 6099 6100 /* indexing needed to find CDS from protein product to set conflict flag */ 6101 6102 SeqMgrIndexFeatures (entityID, NULL); 6103 6104 
while ((dataptr = ReadAsnFastaOrFlatFile (fp, &datatype, NULL, FALSE, TRUE, TRUE, TRUE)) != NULL) { 6105 if (datatype == OBJ_FASTA) { 6106 6107 ssp = (SimpleSeqPtr) dataptr; 6108 ReplaceOnePeptide (ssp, tbl->conflict, tbl->genprodset); 6109 SimpleSeqFree (ssp); 6110 6111 } else { 6112 ObjMgrFree (datatype, dataptr); 6113 } 6114 } 6115 FileClose (fp); 6116 } 6117 6118 /* read one or more RNA sequences from .rna file */ 6119 6120 fp = OpenOneFile (directory, base, ".rna"); 6121 if (fp != NULL) { 6122 6123 /* indexing needed to find mRNA from transcript product to set RNA editing exception */ 6124 6125 SeqMgrIndexFeatures (entityID, NULL); 6126 6127 while ((dataptr = ReadAsnFastaOrFlatFile (fp, &datatype, NULL, FALSE, TRUE, TRUE, TRUE)) != NULL) { 6128 if (datatype == OBJ_FASTA) { 6129 6130 ssp = (SimpleSeqPtr) dataptr; 6131 ReplaceOneRNA (ssp, tbl->conflict); 6132 SimpleSeqFree (ssp); 6133 6134 } else { 6135 ObjMgrFree (datatype, dataptr); 6136 } 6137 } 6138 FileClose (fp); 6139 } 6140 6141 /* read one or more protein sequences from .prt file */ 6142 6143 fp = OpenOneFile (directory, base, ".prt"); 6144 if (fp != NULL) { 6145 6146 SeqMgrIndexFeatures (entityID, NULL); 6147 6148 sep = GetTopSeqEntryForEntityID (entityID); 6149 nucbsp = FindNucBioseq (sep); 6150 if (nucbsp != NULL) { 6151 BioseqToGeneticCode (nucbsp, &genCode, NULL, NULL, NULL, 0, NULL); 6152 SetBatchSuggestNucleotide (nucbsp, genCode); 6153 6154 descr = ExtractBioSourceAndPubs (sep); 6155 6156 while ((dataptr = ReadAsnFastaOrFlatFile (fp, &datatype, NULL, FALSE, TRUE, TRUE, FALSE)) != NULL) { 6157 if (datatype == OBJ_BIOSEQ) { 6158 6159 protbsp = (BioseqPtr) dataptr; 6160 protsep = SeqMgrGetSeqEntryForData (protbsp); 6161 mip = MolInfoNew (); 6162 if (mip != NULL) { 6163 mip->biomol = 8; 6164 mip->tech = 13; 6165 sdp = CreateNewDescriptor (protsep, Seq_descr_molinfo); 6166 if (sdp != NULL) { 6167 sdp->data.ptrvalue = (Pointer) mip; 6168 } 6169 } 6170 AddSeqEntryToSeqEntry (sep, protsep, TRUE); 6171 
SuggestOnePeptide (nucbsp, protbsp, genCode); 6172 6173 } else { 6174 ObjMgrFree (datatype, dataptr); 6175 } 6176 } 6177 6178 ClearBatchSuggestNucleotide (); 6179 6180 ReplaceBioSourceAndPubs (sep, descr); 6181 } 6182 FileClose (fp); 6183 6184 SeqMgrIndexFeatures (entityID, NULL); 6185 } 6186 6187 /* read one or more quality score blocks from .qvl file */ 6188 6189 fp = OpenOneFile (directory, base, ".qvl"); 6190 if (fp != NULL) { 6191 6192 FileCacheSetup (&fc, fp); 6193 6194 goOn = TRUE; 6195 while (goOn) { 6196 str = FileCacheReadLine (&fc, buf, sizeof (buf), &nonewline); 6197 if (str == NULL) { 6198 goOn = FALSE; 6199 } else if (StringDoesHaveText (str)) { 6200 if (str [0] == '>') { 6201 ptr = StringChr (str, ' '); 6202 if (ptr == NULL) { 6203 ptr = StringChr (str, '\t'); 6204 } 6205 if (ptr != NULL) { 6206 *ptr = '\0'; 6207 } 6208 sip = MakeSeqID (str + 1); 6209 bsp = BioseqFind (sip); 6210 if (bsp != NULL) { 6211 sgp = ReadPhrapQualityFC (&fc, bsp); 6212 if (sgp != NULL) { 6213 for (sap = bsp->annot; sap != NULL; sap = sap->next) { 6214 if (sap->type == 3) { 6215 for (lastsgp = sap->data; lastsgp->next != NULL; lastsgp = lastsgp->next) { 6216 continue; 6217 } 6218 lastsgp->next = sgp; 6219 break; 6220 } 6221 } 6222 if (sap == NULL) { 6223 if (bsp->annot != NULL) { 6224 for (sap = bsp->annot; sap->next != NULL; sap = sap->next) { 6225 continue; 6226 } 6227 sap->next = NewGraphSeqAnnot ("Phrap Graph", sgp); 6228 } else { 6229 bsp->annot = NewGraphSeqAnnot ("Phrap Graph", sgp); 6230 } 6231 } 6232 } 6233 } 6234 SeqIdFree (sip); 6235 } 6236 } 6237 } 6238 FileClose (fp); 6239 } 6240 6241 /* finish processing */ 6242 6243 if (sbp == NULL) { 6244 omdp = ObjMgrGetData (entityID); 6245 if (omdp != NULL && omdp->datatype == OBJ_SEQSUB) { 6246 6247 /* if read a Seq-submit, write out a Seq-submit */ 6248 6249 sub = (SeqSubmitPtr) omdp->dataptr; 6250 if (sub != NULL && sub->datatype == 1) { 6251 sbp = sub->sub; 6252 } 6253 } 6254 } 6255 6256 sep = GetTopSeqEntryForEntityID 
(entityID); 6257 if (sep != NULL) { 6258 6259 if (tbl->gnltonote) { 6260 VisitFeaturesInSep (sep, NULL, GeneralToNote); 6261 } 6262 6263 if (tbl->gpstonps) { 6264 GPStoNPS (sep, entityID); 6265 sep = GetTopSeqEntryForEntityID (entityID); 6266 } 6267 6268 if (! tbl->genprodset) { 6269 VisitFeaturesInSep (sep, NULL, RemoveGBQualIDs); 6270 } 6271 if (sdphead != NULL) { 6272 if (IS_Bioseq (sep)) { 6273 bsp = (BioseqPtr) sep->data.ptrvalue; 6274 AddTemplateDescriptors (&(bsp->descr), sdphead, TRUE); 6275 } else if (IS_Bioseq_set (sep)) { 6276 dssp = (BioseqSetPtr) sep->data.ptrvalue; 6277 AddTemplateDescriptors (&(dssp->descr), sdphead, TRUE); 6278 } 6279 } 6280 dp = DateCurr (); 6281 if (dp != NULL) { 6282 has_create_date = FALSE; 6283 VisitDescriptorsInSep (sep, (Pointer) &has_create_date, FindCreateDate); 6284 if (has_create_date) { 6285 sdp = CreateNewDescriptor (sep, Seq_descr_update_date); 6286 } else { 6287 sdp = CreateNewDescriptor (sep, Seq_descr_create_date); 6288 } 6289 if (sdp != NULL) { 6290 sdp->data.ptrvalue = (Pointer) dp; 6291 } 6292 } 6293 6294 /* read one or more descriptors from .dsc file */ 6295 6296 fp = OpenOneFile (directory, base, ".dsc"); 6297 if (fp != NULL) { 6298 6299 while ((dataptr = ReadAsnFastaOrFlatFile (fp, &datatype, NULL, FALSE, TRUE, TRUE, TRUE)) != NULL) { 6300 if (datatype == OBJ_SEQDESC) { 6301 6302 if (IS_Bioseq (sep)) { 6303 bsp = (BioseqPtr) sep->data.ptrvalue; 6304 AddTemplateDescriptors (&(bsp->descr), (SeqDescrPtr) dataptr, FALSE); 6305 } else if (IS_Bioseq_set (sep)) { 6306 dssp = (BioseqSetPtr) sep->data.ptrvalue; 6307 AddTemplateDescriptors (&(dssp->descr), (SeqDescrPtr) dataptr, FALSE); 6308 } 6309 6310 } else { 6311 ObjMgrFree (datatype, dataptr); 6312 } 6313 } 6314 FileClose (fp); 6315 } 6316 6317 msev = ErrSetMessageLevel (SEV_MAX); 6318 move_cds (sep); 6319 6320 /* if reading nucleotide and protein tables, remove duplicate prot feat */ 6321 VisitBioseqsInSep (sep, NULL, RemoveDupProtFeats); 6322 DeleteMarkedObjects 
(entityID, 0, NULL); 6323 6324 /* need to reindex before extending CDS to stop codon */ 6325 SeqMgrIndexFeatures (entityID, NULL); 6326 CdCheck (sep, NULL); 6327 6328 /* need to reindex before copying genes, instantiating protein titles */ 6329 SeqMgrIndexFeatures (entityID, NULL); 6330 EntryChangeImpFeat (sep); 6331 6332 /* find locus for any gene xrefs that only have locus_tag */ 6333 VisitFeaturesInSep (sep, NULL, FillInPartialGeneXref); 6334 6335 if (tbl->removeunnecxref) { 6336 /* if not removed, xref will prevent locus, maploc, dbxref from being copied */ 6337 VisitFeaturesInSep (sep, NULL, RemoveUnnecGeneXref); 6338 } 6339 6340 if (tbl->genprodset) { 6341 VisitFeaturesInSep (sep, NULL, CopyGene); 6342 } 6343 if (tbl->genprodset) { 6344 /* currently copying ncRNA feature onto product */ 6345 VisitFeaturesInSep (sep, NULL, CopyNcRna); 6346 } 6347 if (! tbl->genprodset) { 6348 VisitFeaturesInSep (sep, NULL, ClearRnaProducts); 6349 } 6350 6351 if (tbl->removeunnecxref) { 6352 /* need to reindex before removing unnecesary gene xrefs in nuc-prot sets */ 6353 SeqMgrIndexFeatures (entityID, NULL); 6354 6355 VisitFeaturesInSep (sep, NULL, RemoveUnnecGeneXref); 6356 } 6357 6358 if (! 
tbl->relaxed) { 6359 list = GetValidCountryList (); 6360 VisitBioSourcesInSep (sep, (Pointer) list, CleanUpLatLonAndCountry); 6361 } 6362 6363 /* need to reindex so hypothetical protein titles pick up locus_tag */ 6364 SeqMgrIndexFeatures (entityID, NULL); 6365 InstantiateProteinTitles (entityID, NULL); 6366 6367 if (tbl->genprodset) { 6368 /* need to reindex before instantiating mRNA titles */ 6369 SeqMgrIndexFeatures (entityID, NULL); 6370 bsp = FindNucBioseq (sep); 6371 6372 if (tbl->smarttitle) { 6373 MakeSmartRnaTitles (bsp, organism); 6374 } else { 6375 sfp = SeqMgrGetNextFeature (bsp, NULL, SEQFEAT_RNA, 0, &context); 6376 while (sfp != NULL) { 6377 AddRnaTitles (sfp, organism); 6378 sfp = SeqMgrGetNextFeature (bsp, sfp, SEQFEAT_RNA, 0, &context); 6379 } 6380 } 6381 } 6382 6383 if (StringDoesHaveText (tbl->center)) { 6384 VisitBioseqsInSep (sep, tbl->center, MakeGenomeCenterID); 6385 } 6386 6387 if (StringDoesHaveText (tbl->accn)) { 6388 bsp = FindNucBioseq (sep); 6389 MakeAccessionID (bsp, tbl->accn); 6390 } 6391 6392 VisitDescriptorsInSep (sep, NULL, ConvertStructuredComment); 6393 6394 SeqMgrClearFeatureIndexes (entityID, NULL); 6395 BasicSeqEntryCleanup (sep); 6396 ErrSetMessageLevel (msev); 6397 /* 6398 SeriousSeqEntryCleanup (sep, NULL, NULL); 6399 */ 6400 ConvertFullLenSourceFeatToDesc (sep); 6401 ConvertFullLenPubFeatToDesc (sep); 6402 if (tbl->linkbyoverlap) { 6403 SeqMgrIndexFeatures (entityID, NULL); 6404 LinkCDSmRNAbyOverlap (sep); 6405 } else if (tbl->linkbyproduct) { 6406 SeqMgrIndexFeatures (entityID, NULL); 6407 LinkCDSmRNAbyProduct (sep); 6408 } 6409 6410 DoTbl2AsnCleanup (sep, &(tbl->cleanup_args)); 6411 NormalizeDescriptorOrder (sep); 6412 6413 if (StringHasNoText (results)) { 6414 results = directory; 6415 } 6416 6417 if (aip != NULL) { 6418 atp_bssse = AsnFind ("Bioseq-set.seq-set.E"); 6419 if (atp_bssse == NULL) { 6420 Message (MSG_POSTERR, "Unable to find ASN.1 type Bioseq-set.seq-set.E"); 6421 } else if (tbl->fastaset && 
tbl->whichclass == 0) { 6422 /* already has genbank wrapper, write individual components */ 6423 tmp = PropagateDescsFromGenBankSet (sep); 6424 SeqMgrClearFeatureIndexes (entityID, NULL); 6425 while (tmp != NULL) { 6426 SeqEntryAsnWrite (tmp, aip, atp_bssse); 6427 tmp = tmp->next; 6428 } 6429 } else { 6430 SeqEntryAsnWrite (sep, aip, atp_bssse); 6431 } 6432 } else { 6433 if (tbl->fastaset && tbl->whichclass == 0) { 6434 PropagateDescsFromGenBankSet (sep); 6435 SeqMgrClearFeatureIndexes (entityID, NULL); 6436 } 6437 WriteOneFile (results, base, ".sqn", outfile, sep, sbp, tbl->save_bioseq_set); 6438 } 6439 6440 if (HasGoTermsInNote (sep, gotags)) { 6441 Message (MSG_OK, "Illegal GO term format detected in note - contact database for instructions"); 6442 } 6443 6444 if (tbl->global_report != NULL) { 6445 AddSeqEntryToGlobalDiscrepReport (sep, tbl->global_report, base); 6446 } 6447 6448 if (tbl->validate || tbl->flatfile || tbl->genereport || tbl->validate_barcode) { 6449 if (pdp != NULL) { 6450 6451 /* copy in citsub as publication for validator and flatfile */ 6452 6453 sdp = CreateNewDescriptor (sep, Seq_descr_pub); 6454 if (sdp != NULL) { 6455 sdp->data.ptrvalue = AsnIoMemCopy ((Pointer) pdp, 6456 (AsnReadFunc) PubdescAsnRead, 6457 (AsnWriteFunc) PubdescAsnWrite); 6458 } 6459 } 6460 SeqMgrIndexFeatures (entityID, 0); 6461 if (tbl->flatfile) { 6462 Message (MSG_POST, "Flatfile %s\n", base); 6463 FlatfileOneFile (results, base, ".gbf", sep); 6464 } 6465 if (tbl->validate || tbl->validate_barcode) { 6466 Message (MSG_POST, "Validating %s\n", base); 6467 ValidateOneFile (results, base, ".val", sep, tbl->validate, tbl->relaxed, tbl->validate_barcode); 6468 } 6469 if (tbl->genereport) { 6470 GeneReportOneFile (results, base, ".t2g", sep); 6471 } 6472 } 6473 } 6474 6475 ObjMgrFreeByEntityID (entityID); 6476 } 6477 6478 6479 6480 static CharPtr overwriteMsg = "Your template with a .sqn suffix will be overwritten. 
Do you wish to continue?"; 6481 6482 static Boolean TemplateOverwriteRisk ( 6483 CharPtr filename, 6484 CharPtr single, 6485 CharPtr directory, 6486 CharPtr suffix 6487 ) 6488 6489 { 6490 Char file [FILENAME_MAX], path [PATH_MAX]; 6491 CharPtr ptr; 6492 6493 6494 if (StringStr (filename, ".sqn") == NULL) return FALSE; 6495 if (StringDoesHaveText (single)) { 6496 StringNCpy_0 (file, filename, sizeof (file)); 6497 ptr = StringStr (file, "."); 6498 if (ptr != NULL) { 6499 *ptr = '\0'; 6500 } 6501 ptr = StringStr (single, "."); 6502 if (ptr != NULL) { 6503 StringCat (file, ptr); 6504 } 6505 if (StringCmp (file, single) == 0) return TRUE; 6506 } else if (StringDoesHaveText (directory)) { 6507 StringNCpy_0 (path, directory, sizeof (path)); 6508 StringNCpy_0 (file, filename, sizeof (file)); 6509 ptr = StringStr (file, "."); 6510 if (ptr != NULL) { 6511 *ptr = '\0'; 6512 } 6513 StringCat (file, suffix); 6514 FileBuildPath (path, NULL, file); 6515 if (FileLength (path) > 0) return TRUE; 6516 } 6517 return FALSE; 6518 } 6519 6520 static void FileRecurse ( 6521 SubmitBlockPtr sbp, 6522 PubdescPtr pdp, 6523 BioSourcePtr src, 6524 CharPtr directory, 6525 CharPtr results, 6526 CharPtr suffix, 6527 Boolean recurse, 6528 SeqDescrPtr sdphead, 6529 TblArgsPtr tbl, 6530 TextFsaPtr gotags, 6531 AsnIoPtr aip, 6532 CharPtr outfile 6533 ) 6534 6535 { 6536 Char path [PATH_MAX]; 6537 CharPtr ptr; 6538 CharPtr str; 6539 ValNodePtr head, vnp; 6540 6541 /* get list of all files in source directory */ 6542 6543 head = DirCatalog (directory); 6544 6545 for (vnp = head; vnp != NULL; vnp = vnp->next) { 6546 if (vnp->choice == 0) { 6547 str = (CharPtr) vnp->data.ptrvalue; 6548 if (StringDoesHaveText (str)) { 6549 6550 /* does filename have desired substring? 
*/ 6551 6552 ptr = StringStr (str, suffix); 6553 6554 if (ptr != NULL) { 6555 6556 /* make sure detected suffix is really at end of filename */ 6557 6558 if (StringCmp (ptr, suffix) == 0) { 6559 *ptr = '\0'; 6560 6561 /* process file that has desired suffix (usually .fsa) */ 6562 6563 ProcessOneRecord (sbp, pdp, src, directory, results, str, suffix, sdphead, tbl, gotags, aip, outfile); 6564 } 6565 } 6566 } 6567 } else if (vnp->choice == 1 && recurse) { 6568 6569 /* recurse into subdirectory */ 6570 6571 StringNCpy_0 (path, directory, sizeof (path)); 6572 str = (CharPtr) vnp->data.ptrvalue; 6573 FileBuildPath (path, str, NULL); 6574 FileRecurse (sbp, pdp, src, path, results, suffix, recurse, sdphead, tbl, gotags, aip, outfile); 6575 } 6576 } 6577 6578 /* clean up file list */ 6579 6580 ValNodeFreeData (head); 6581 } 6582 6583 static AsnTypePtr DoFirstPrefix ( 6584 AsnIoPtr aip, 6585 SubmitBlockPtr sbp 6586 ) 6587 6588 { 6589 AsnTypePtr atp_se, atp_ses, atp_ss, atp_ssd, atp_ssde, atp_ssdee, atp_sss, sep_atp, ssp_atp; 6590 DataVal av; 6591 SeqEntry se; 6592 SeqSubmit ss; 6593 6594 if (aip == NULL || sbp == NULL) return NULL; 6595 6596 atp_ss = AsnFind ("Seq-submit"); 6597 if (atp_ss == NULL) { 6598 Message (MSG_POSTERR, "Unable to find ASN.1 type Seq-submit"); 6599 return NULL; 6600 } 6601 6602 atp_sss = AsnFind ("Seq-submit.sub"); 6603 if (atp_sss == NULL) { 6604 Message (MSG_POSTERR, "Unable to find ASN.1 type Seq-submit.sub"); 6605 return NULL; 6606 } 6607 6608 atp_ssd = AsnFind ("Seq-submit.data"); 6609 if (atp_ssd == NULL) { 6610 Message (MSG_POSTERR, "Unable to find ASN.1 type Seq-submit.data"); 6611 return NULL; 6612 } 6613 6614 atp_ssde = AsnFind ("Seq-submit.data.entrys"); 6615 if (atp_ssde == NULL) { 6616 Message (MSG_POSTERR, "Unable to find ASN.1 type Seq-submit.data.entrys"); 6617 return NULL; 6618 } 6619 6620 atp_se = AsnFind ("Seq-entry"); 6621 if (atp_se == NULL) { 6622 Message (MSG_POSTERR, "Unable to find ASN.1 type Seq-entry"); 6623 return NULL; 
6624 } 6625 6626 atp_ses = AsnFind ("Seq-entry.set"); 6627 if (atp_ses == NULL) { 6628 Message (MSG_POSTERR, "Unable to find ASN.1 type Seq-entry.set"); 6629 return NULL; 6630 } 6631 6632 atp_ssdee = AsnFind ("Seq-submit.data.entrys.E"); 6633 if (atp_ssdee == NULL) { 6634 Message (MSG_POSTERR, "Unable to find ASN.1 type Seq-submit.data.entrys.E"); 6635 return NULL; 6636 } 6637 6638 6639 ssp_atp = AsnLinkType (NULL, atp_ss); 6640 if (ssp_atp == NULL) return NULL; 6641 6642 MemSet ((Pointer) &ss, 0, sizeof (SeqSubmit)); 6643 MemSet ((Pointer) &se, 0, sizeof (SeqEntry)); 6644 se.choice = 2; 6645 6646 if (! AsnOpenStruct (aip, ssp_atp, (Pointer) &ss)) return NULL; 6647 6648 if (! SubmitBlockAsnWrite (sbp, aip, atp_sss)) return NULL; 6649 6650 av.ptrvalue = (Pointer) &se; 6651 if (! AsnWriteChoice (aip, atp_ssd, (Int2) 1, &av)) return NULL; 6652 6653 if (! AsnOpenStruct (aip, atp_ssde, (Pointer) &se)) return NULL; 6654 6655 sep_atp = AsnLinkType (atp_ssdee, atp_se); 6656 if (sep_atp == NULL) return NULL; 6657 6658 av.ptrvalue = (Pointer) &se; 6659 se.choice = 2; 6660 if (! 
AsnWriteChoice (aip, sep_atp, (Int2) 2, &av)) return NULL; 6661 6662 return ssp_atp; 6663 } 6664 6665 static AsnTypePtr DoSecondPrefix ( 6666 AsnIoPtr aip, 6667 TblArgsPtr tbl 6668 ) 6669 6670 { 6671 AsnTypePtr atp_bsc, atp_bss, atp_bsss, atp_ses, bssp_atp; 6672 DataVal av; 6673 BioseqSet bs; 6674 6675 atp_ses = AsnFind ("Seq-entry.set"); 6676 if (atp_ses == NULL) { 6677 Message (MSG_POSTERR, "Unable to find ASN.1 type Seq-entry.set"); 6678 return NULL; 6679 } 6680 6681 atp_bss = AsnFind ("Bioseq-set"); 6682 if (atp_bss == NULL) { 6683 Message (MSG_POSTERR, "Unable to find ASN.1 type Bioseq-set"); 6684 return NULL; 6685 } 6686 6687 atp_bsc = AsnFind ("Bioseq-set.class"); 6688 if (atp_bsc == NULL) { 6689 Message (MSG_POSTERR, "Unable to find ASN.1 type Bioseq-set.class"); 6690 return NULL; 6691 } 6692 6693 atp_bsss = AsnFind ("Bioseq-set.seq-set"); 6694 if (atp_bsss == NULL) { 6695 Message (MSG_POSTERR, "Unable to find ASN.1 type Bioseq-set.seq-set"); 6696 return NULL; 6697 } 6698 6699 6700 bssp_atp = AsnLinkType (atp_ses, atp_bss); 6701 if (bssp_atp == NULL) return NULL; 6702 6703 MemSet ((Pointer) &bs, 0, sizeof (BioseqSet)); 6704 6705 if (! AsnOpenStruct (aip, bssp_atp, (Pointer) &bs)) return NULL; 6706 6707 switch (tbl->whichclass) { 6708 case 1 : 6709 av.intvalue = BioseqseqSet_class_pop_set; 6710 break; 6711 case 2 : 6712 av.intvalue = BioseqseqSet_class_phy_set; 6713 break; 6714 case 3 : 6715 av.intvalue = BioseqseqSet_class_mut_set; 6716 break; 6717 case 4 : 6718 av.intvalue = BioseqseqSet_class_eco_set; 6719 break; 6720 default : 6721 av.intvalue = BioseqseqSet_class_genbank; 6722 break; 6723 } 6724 if (! AsnWrite (aip, atp_bsc, &av)) return NULL; 6725 6726 if (! 
AsnOpenStruct (aip, atp_bsss, (Pointer) &bs.seq_set)) return NULL; 6727 6728 return bssp_atp; 6729 } 6730 6731 static Boolean DoFirstSuffix ( 6732 AsnIoPtr aip, 6733 AsnTypePtr ssp_atp 6734 ) 6735 6736 { 6737 AsnTypePtr atp_bsss, atp_ssde, atp_ssdee; 6738 BioseqSet bs; 6739 SeqEntry se; 6740 SeqSubmit ss; 6741 6742 if (aip == NULL || ssp_atp == NULL) return FALSE; 6743 6744 atp_ssde = AsnFind ("Seq-submit.data.entrys"); 6745 if (atp_ssde == NULL) { 6746 Message (MSG_POSTERR, "Unable to find ASN.1 type Seq-submit.data.entrys"); 6747 return FALSE; 6748 } 6749 6750 atp_ssdee = AsnFind ("Seq-submit.data.entrys.E"); 6751 if (atp_ssdee == NULL) { 6752 Message (MSG_POSTERR, "Unable to find ASN.1 type Seq-submit.data.entrys.E"); 6753 return FALSE; 6754 } 6755 6756 atp_bsss = AsnFind ("Bioseq-set.seq-set"); 6757 if (atp_bsss == NULL) { 6758 Message (MSG_POSTERR, "Unable to find ASN.1 type Bioseq-set.seq-set"); 6759 return FALSE; 6760 } 6761 6762 6763 MemSet ((Pointer) &ss, 0, sizeof (SeqSubmit)); 6764 MemSet ((Pointer) &se, 0, sizeof (SeqEntry)); 6765 MemSet ((Pointer) &bs, 0, sizeof (BioseqSet)); 6766 6767 if (! AsnCloseStruct (aip, atp_ssde, &se)) return FALSE; 6768 6769 if (! AsnCloseStruct (aip, ssp_atp, (Pointer) &ss)) return FALSE; 6770 6771 AsnUnlinkType (atp_ssdee); 6772 6773 return TRUE; 6774 } 6775 6776 static Boolean DoSecondSuffix ( 6777 AsnIoPtr aip, 6778 AsnTypePtr bssp_atp 6779 ) 6780 6781 { 6782 AsnTypePtr atp_bsss, atp_ses; 6783 BioseqSet bs; 6784 6785 if (aip == NULL || bssp_atp == NULL) return FALSE; 6786 6787 atp_ses = AsnFind ("Seq-entry.set"); 6788 if (atp_ses == NULL) { 6789 Message (MSG_POSTERR, "Unable to find ASN.1 type Seq-entry.set"); 6790 return FALSE; 6791 } 6792 6793 atp_bsss = AsnFind ("Bioseq-set.seq-set"); 6794 if (atp_bsss == NULL) { 6795 Message (MSG_POSTERR, "Unable to find ASN.1 type Bioseq-set.seq-set"); 6796 return FALSE; 6797 } 6798 6799 6800 MemSet ((Pointer) &bs, 0, sizeof (BioseqSet)); 6801 6802 if (! 
AsnCloseStruct(aip, atp_bsss, (Pointer) &bs.seq_set)) return FALSE; 6803 6804 if (! AsnCloseStruct (aip, bssp_atp, (Pointer) &bs)) return FALSE; 6805 6806 AsnUnlinkType (atp_ses); 6807 6808 return TRUE; 6809 } 6810 6811 static CharPtr ReadCommentFile ( 6812 CharPtr filename 6813 ) 6814 6815 { 6816 FileCache fc; 6817 FILE *fp; 6818 ValNodePtr head = NULL, last = NULL, vnp; 6819 Int4 len; 6820 Char line [4096]; 6821 Boolean nonewline, notfirst; 6822 CharPtr ptr, str, tmp; 6823 6824 if (StringHasNoText (filename)) return NULL; 6825 fp = FileOpen (filename, "r"); 6826 if (fp == NULL) return NULL; 6827 6828 FileCacheSetup (&fc, fp); 6829 6830 str = FileCacheReadLine (&fc, line, sizeof (line), &nonewline); 6831 while (str != NULL) { 6832 vnp = ValNodeCopyStr (&last, 0, str); 6833 if (head == NULL) { 6834 head = vnp; 6835 } 6836 last = vnp; 6837 6838 str = FileCacheReadLine (&fc, line, sizeof (line), &nonewline); 6839 } 6840 6841 FileClose (fp); 6842 6843 if (head == NULL) return NULL; 6844 6845 len = 0; 6846 for (vnp = head; vnp != NULL; vnp = vnp->next) { 6847 str = (CharPtr) vnp->data.ptrvalue; 6848 len += StringLen (str) + 1; 6849 } 6850 6851 tmp = (CharPtr) MemNew (sizeof (Char) * (len + 5)); 6852 if (tmp == NULL) return NULL; 6853 6854 ptr = tmp; 6855 notfirst = FALSE; 6856 for (vnp = head; vnp != NULL; vnp = vnp->next) { 6857 str = (CharPtr) vnp->data.ptrvalue; 6858 if (str == NULL) continue; 6859 if (*str == '\0' || *str == ' ') { 6860 ptr = StringMove (ptr, "~"); 6861 } else if (notfirst) { 6862 ptr = StringMove (ptr, " "); 6863 } 6864 ptr = StringMove (ptr, str); 6865 notfirst = TRUE; 6866 } 6867 6868 ValNodeFreeData (head); 6869 6870 return tmp; 6871 } 6872 6873 static CharPtr ParseCommaField ( 6874 CharPtr PNTR strP 6875 ) 6876 6877 { 6878 CharPtr ptr; 6879 CharPtr str; 6880 6881 if (strP == NULL) return NULL; 6882 6883 str = *strP; 6884 if (StringHasNoText (str)) { 6885 *strP = NULL; 6886 return NULL; 6887 } 6888 6889 ptr = StringChr (str, ','); 6890 if (ptr 
== NULL) { 6891 *strP = NULL; 6892 return str; 6893 } 6894 6895 *ptr = '\0'; 6896 ptr++; 6897 if (StringHasNoText (ptr)) { 6898 ptr = NULL; 6899 } 6900 *strP = ptr; 6901 6902 if (StringHasNoText (str)) { 6903 str = NULL; 6904 } 6905 return str; 6906 } 6907 6908 static DatePtr DateParse ( 6909 CharPtr str 6910 ) 6911 6912 { 6913 Int4 day = -1, month = -1, year = -1; 6914 DatePtr dp; 6915 CharPtr ptr; 6916 Char tmp [64]; 6917 long int val; 6918 6919 if (StringHasNoText (str)) return NULL; 6920 6921 StringNCpy_0 (tmp, str, sizeof (tmp)); 6922 ptr = StringChr (tmp, '/'); 6923 if (ptr == NULL) { 6924 ptr = StringChr (tmp, '-'); 6925 } 6926 if (ptr != NULL) { 6927 *ptr = '\0'; 6928 ptr++; 6929 if (sscanf (tmp, "%ld", &val) == 1) { 6930 month = (Int4) val; 6931 } 6932 str = StringChr (ptr, '/'); 6933 if (str == NULL) { 6934 str = StringChr (ptr, '-'); 6935 } 6936 if (str != NULL) { 6937 *str = '\0'; 6938 str++; 6939 if (sscanf (ptr, "%ld", &val) == 1) { 6940 day = (Int4) val; 6941 } 6942 if (sscanf (str, "%ld", &val) == 1) { 6943 year = (Int4) val; 6944 } 6945 } 6946 } 6947 6948 if (month < 0 || day < 0 || year < 2000) return NULL; 6949 if (month > 12 || day > 31 || year > 2099) return NULL; 6950 6951 dp = DateNew (); 6952 if (dp == NULL) return NULL; 6953 6954 dp->data [0] = 1; 6955 dp->data [1] = (Uint1) (year - 1900); 6956 dp->data [2] = (Uint1) month; 6957 dp->data [3] = (Uint1) day; 6958 6959 return dp; 6960 } 6961 6962 /* Args structure contains command-line arguments */ 6963 6964 #define p_argInputPath 0 6965 #define r_argOutputPath 1 6966 #define i_argInputFile 2 6967 #define o_argOutputFile 3 6968 #define x_argSuffix 4 6969 #define E_argRecurse 5 6970 #define t_argTemplate 6 6971 #define a_argType 7 6972 #define s_argFastaSet 8 6973 #define g_argGenProdSet 9 6974 #define F_argFeatIdLinks 10 6975 #define A_argAccession 11 6976 #define C_argCenter 12 6977 #define n_argOrgName 13 6978 #define j_argSrcQuals 14 6979 #define y_argComment 15 6980 #define 
Y_argCommentFile 16 6981 #define D_argDescrsFile 17 6982 #define f_argTableFile 18 6983 #define k_argCdsFlags 19 6984 #define V_argVerify 20 6985 #define v_argValidate 21 6986 #define b_argGenBank 22 6987 #define q_argFileID 23 6988 #define u_argUndoGPS 24 6989 #define h_argGnlToNote 25 6990 #define G_argGapFields 26 6991 #define R_argRemote 27 6992 #define S_argSmartFeats 28 6993 #define Q_argSmartTitle 29 6994 #define U_argUnnecXref 30 6995 #define L_argLocalID 31 6996 #define T_argTaxLookup 32 6997 #define P_argPubLookup 33 6998 #define W_argLogProgress 34 6999 #define K_argBioseqSet 35 7000 #define H_argHoldUntilPub 36 7001 #define Z_argDiscRepFile 37 7002 #define c_argCleanupOptions 38 7003 7004 7005 Args myargs [] = { 7006 {"Path to Files", NULL, NULL, NULL, 7007 TRUE, 'p', ARG_STRING, 0.0, 0, NULL}, 7008 {"Path for Results", NULL, NULL, NULL, 7009 TRUE, 'r', ARG_STRING, 0.0, 0, NULL}, 7010 {"Single Input File", NULL, NULL, NULL, 7011 TRUE, 'i', ARG_FILE_IN, 0.0, 0, NULL}, 7012 {"Single Output File", NULL, NULL, NULL, 7013 TRUE, 'o', ARG_FILE_OUT, 0.0, 0, NULL}, 7014 {"Suffix", ".fsa", NULL, NULL, 7015 TRUE, 'x', ARG_STRING, 0.0, 0, NULL}, 7016 {"Recurse", "F", NULL, NULL, 7017 TRUE, 'E', ARG_BOOLEAN, 0.0, 0, NULL}, 7018 {"Template File", NULL, NULL, NULL, 7019 TRUE, 't', ARG_FILE_IN, 0.0, 0, NULL}, 7020 {"File Type\n" 7021 " a Any\n" 7022 " r20u Runs of 20+ Ns are gaps, 100 Ns are unknown length\n" 7023 " r20k Runs of 20+ Ns are gaps, 100 Ns are known length\n" 7024 " s FASTA Set (s Batch, s1 Pop, s2 Phy, s3 Mut, s4 Eco)\n" 7025 " d FASTA Delta, di FASTA Delta with Implicit Gaps\n" 7026 " l FASTA+Gap Alignment\n" 7027 " z FASTA with Gap Lines\n" 7028 " e PHRAP/ACE\n", "a", NULL, NULL, 7029 TRUE, 'a', ARG_STRING, 0.0, 0, NULL}, 7030 {"Read FASTAs as Set", "F", NULL, NULL, 7031 TRUE, 's', ARG_BOOLEAN, 0.0, 0, NULL}, 7032 {"Genomic Product Set", "F", NULL, NULL, 7033 TRUE, 'g', ARG_BOOLEAN, 0.0, 0, NULL}, 7034 {"Feature ID Links (o by Overlap, p by Product)", 
NULL, NULL, NULL, 7035 TRUE, 'F', ARG_STRING, 0.0, 0, NULL}, 7036 {"Accession", NULL, NULL, NULL, 7037 TRUE, 'A', ARG_STRING, 0.0, 0, NULL}, 7038 {"Genome Center Tag", NULL, NULL, NULL, 7039 TRUE, 'C', ARG_STRING, 0.0, 0, NULL}, 7040 {"Organism Name", NULL, NULL, NULL, 7041 TRUE, 'n', ARG_STRING, 0.0, 0, NULL}, 7042 {"Source Qualifiers", NULL, NULL, NULL, 7043 TRUE, 'j', ARG_STRING, 0.0, 0, NULL}, 7044 {"Comment", NULL, NULL, NULL, 7045 TRUE, 'y', ARG_STRING, 0.0, 0, NULL}, 7046 {"Comment File", NULL, NULL, NULL, 7047 TRUE, 'Y', ARG_FILE_IN, 0.0, 0, NULL}, 7048 {"Descriptors File", NULL, NULL, NULL, 7049 TRUE, 'D', ARG_FILE_IN, 0.0, 0, NULL}, 7050 {"Single Table File", NULL, NULL, NULL, 7051 TRUE, 'f', ARG_FILE_IN, 0.0, 0, NULL}, 7052 {"CDS Flags (combine any of the following letters)\n" 7053 " c Annotate Longest ORF\n" 7054 " r Allow Runon ORFs\n" 7055 " m Allow Alternative Starts\n" 7056 " k Set Conflict on Mismatch\n", NULL, NULL, NULL, 7057 TRUE, 'k', ARG_STRING, 0.0, 0, NULL}, 7058 {"Verification (combine any of the following letters)\n" 7059 " v Validate with Normal Stringency\n" 7060 " r Validate without Country Check\n" 7061 " b Generate GenBank Flatfile\n" 7062 " g Generate Gene Report\n", NULL, NULL, NULL, 7063 TRUE, 'V', ARG_STRING, 0.0, 0, NULL}, 7064 {"Validate (obsolete: use -V v)", "F", NULL, NULL, 7065 TRUE, 'v', ARG_BOOLEAN, 0.0, 0, NULL}, 7066 {"Generate GenBank File (obsolete: use -V b)", "F", NULL, NULL, 7067 TRUE, 'b', ARG_BOOLEAN, 0.0, 0, NULL}, 7068 {"Seq ID from File Name", "F", NULL, NULL, 7069 TRUE, 'q', ARG_BOOLEAN, 0.0, 0, NULL}, 7070 {"GenProdSet to NucProtSet", "F", NULL, NULL, 7071 TRUE, 'u', ARG_BOOLEAN, 0.0, 0, NULL}, 7072 {"General ID to Note", "F", NULL, NULL, 7073 TRUE, 'h', ARG_BOOLEAN, 0.0, 0, NULL}, 7074 {"Alignment Gap Flags (comma separated fields, e.g., p,-,-,-,?,. 
)\n" 7075 " n Nucleotide or p Protein,\n" 7076 " Begin, Middle, End Gap Characters,\n" 7077 " Missing Characters, Match Characters\n", NULL, NULL, NULL, 7078 TRUE, 'G', ARG_STRING, 0.0, 0, NULL}, 7079 {"Remote Sequence Record Fetching from ID", "F", NULL, NULL, 7080 TRUE, 'R', ARG_BOOLEAN, 0.0, 0, NULL}, 7081 {"Smart Feature Annotation", "F", NULL, NULL, 7082 TRUE, 'S', ARG_BOOLEAN, 0.0, 0, NULL}, 7083 {"Special mRNA Titles", "F", NULL, NULL, 7084 TRUE, 'Q', ARG_BOOLEAN, 0.0, 0, NULL}, 7085 {"Remove Unnecessary Gene Xref", "F", NULL, NULL, 7086 TRUE, 'U', ARG_BOOLEAN, 0.0, 0, NULL}, 7087 {"Force Local protein_id/transcript_id", "F", NULL, NULL, 7088 TRUE, 'L', ARG_BOOLEAN, 0.0, 0, NULL}, 7089 {"Remote Taxonomy Lookup", "F", NULL, NULL, 7090 TRUE, 'T', ARG_BOOLEAN, 0.0, 0, NULL}, 7091 {"Remote Publication Lookup", "F", NULL, NULL, 7092 TRUE, 'P', ARG_BOOLEAN, 0.0, 0, NULL}, 7093 {"Log Progress", "F", NULL, NULL, 7094 TRUE, 'W', ARG_BOOLEAN, 0.0, 0, NULL}, 7095 {"Save Bioseq-set", "F", NULL, NULL, 7096 TRUE, 'K', ARG_BOOLEAN, 0.0, 0, NULL}, 7097 {"Hold Until Publish\n" 7098 " y Hold for One Year\n" 7099 " mm/dd/yyyy\n", NULL, NULL, NULL, 7100 TRUE, 'H', ARG_STRING, 0.0, 0, NULL}, 7101 {"Discrepancy Report Output File", NULL, NULL, NULL, 7102 TRUE, 'Z', ARG_FILE_OUT, 0.0, 0, NULL}, 7103 {"Cleanup (combine any of the following letters)\n" 7104 " d Correct Collection Dates (assume month first)\n" 7105 " D Correct Collection Dates (assume day first)\n" 7106 " b Append note to coding regions that overlap other coding regions with similar product names and do not contain 'ABC'", 7107 NULL, NULL, NULL, 7108 TRUE, 'c', ARG_STRING, 0.0, 0, NULL}, 7109 }; 7110 7111 Int2 Main (void) 7112 7113 { 7114 AsnIoPtr aip = NULL; 7115 Char app [64]; 7116 CharPtr base; 7117 AsnTypePtr bssp_atp = NULL; 7118 CitSubPtr csp; 7119 Pointer dataptr; 7120 Uint2 datatype; 7121 CharPtr descrs; 7122 CharPtr directory; 7123 DatePtr dp; 7124 FILE *fp; 7125 Char gapstring [128]; 7126 TextFsaPtr gotags; 
7127 CharPtr hold; 7128 CharPtr os; 7129 CharPtr outfile; 7130 Pubdesc pd; 7131 PubdescPtr pdp = NULL; 7132 ValNode pb; 7133 CharPtr ptr; 7134 Boolean recurse; 7135 Boolean remote; 7136 CharPtr results; 7137 SubmitBlockPtr sbp = NULL; 7138 SeqDescrPtr sdphead = NULL; 7139 SeqEntryPtr sep; 7140 Char sfx [32]; 7141 BioSourcePtr src = NULL; 7142 SeqSubmitPtr ssp = NULL; 7143 AsnTypePtr ssp_atp = NULL; 7144 Char str [64]; 7145 CharPtr suffix; 7146 TblArgs tbl; 7147 CharPtr tmp; 7148 CharPtr tmplate; 7149 CharPtr disc_rep_file = NULL; 7150 7151 /* standard setup */ 7152 7153 ErrSetFatalLevel (SEV_MAX); 7154 ErrSetMessageLevel (SEV_MAX); 7155 ErrClearOptFlags (EO_SHOW_USERSTR); 7156 UseLocalAsnloadDataAndErrMsg (); 7157 ErrPathReset (); 7158 7159 /* finish resolving internal connections in ASN.1 parse tables */ 7160 7161 if (! AllObjLoad ()) { 7162 Message (MSG_FATAL, "AllObjLoad failed"); 7163 return 1; 7164 } 7165 if (! SubmitAsnLoad ()) { 7166 Message (MSG_FATAL, "SubmitAsnLoad failed"); 7167 return 1; 7168 } 7169 if (! FeatDefSetLoad ()) { 7170 Message (MSG_FATAL, "FeatDefSetLoad failed"); 7171 return 1; 7172 } 7173 if (! SeqCodeSetLoad ()) { 7174 Message (MSG_FATAL, "SeqCodeSetLoad failed"); 7175 return 1; 7176 } 7177 if (! GeneticCodeTableLoad ()) { 7178 Message (MSG_FATAL, "GeneticCodeTableLoad failed"); 7179 return 1; 7180 } 7181 7182 /* process command line arguments */ 7183 7184 sprintf (app, "tbl2asn %s", TBL2ASN_APPLICATION); 7185 if (! 
GetArgs (app, sizeof (myargs) / sizeof (Args), myargs)) { 7186 return 0; 7187 } 7188 7189 directory = (CharPtr) myargs [p_argInputPath].strvalue; 7190 results = (CharPtr) myargs [r_argOutputPath].strvalue; 7191 if (StringHasNoText (results)) { 7192 results = NULL; 7193 } 7194 suffix = (CharPtr) myargs [x_argSuffix].strvalue; 7195 recurse = (Boolean) myargs [E_argRecurse].intvalue; 7196 base = (CharPtr) myargs [i_argInputFile].strvalue; 7197 outfile = (CharPtr) myargs [o_argOutputFile].strvalue; 7198 if (StringHasNoText (outfile)) { 7199 outfile = NULL; 7200 } 7201 tmplate = (CharPtr) myargs [t_argTemplate].strvalue; 7202 descrs = (CharPtr) myargs [D_argDescrsFile].strvalue; 7203 7204 hold = (CharPtr) myargs [H_argHoldUntilPub].strvalue; 7205 7206 if (StringHasNoText(directory) && StringHasNoText(base)) { 7207 Message (MSG_FATAL, "You must supply either an input file (-i) or an input directory (-p).\nUse -p . to specify the current directory.\n\n"); 7208 return 1; 7209 } 7210 remote = (Boolean) myargs [R_argRemote].intvalue; 7211 7212 MemSet ((Pointer) &tbl, 0, sizeof (TblArgs)); 7213 7214 /* -s is heavily used and will remain as an alternative to -a s */ 7215 7216 tbl.fastaset = (Boolean) myargs [s_argFastaSet].intvalue; 7217 7218 /* process new -a type argument */ 7219 7220 ptr = myargs [a_argType].strvalue; 7221 if (StringICmp (ptr, "r20u") == 0) { 7222 tbl.raw2delt = TRUE; 7223 tbl.r2dmin = 20; 7224 tbl.r2dunk100 = TRUE; 7225 } else if (StringICmp (ptr, "r20k") == 0) { 7226 tbl.raw2delt = TRUE; 7227 tbl.r2dmin = 20; 7228 tbl.r2dunk100 = FALSE; 7229 } else if (StringICmp (ptr, "s") == 0) { 7230 tbl.fastaset = TRUE; 7231 } else if (StringICmp (ptr, "w1") == 0 || StringICmp (ptr, "s1") == 0) { 7232 tbl.fastaset = TRUE; 7233 tbl.whichclass = 1; 7234 } else if (StringICmp (ptr, "w2") == 0 || StringICmp (ptr, "s2") == 0) { 7235 tbl.fastaset = TRUE; 7236 tbl.whichclass = 2; 7237 } else if (StringICmp (ptr, "w3") == 0 || StringICmp (ptr, "s3") == 0) { 7238 tbl.fastaset 
= TRUE; 7239 tbl.whichclass = 3; 7240 } else if (StringICmp (ptr, "w4") == 0 || StringICmp (ptr, "s4") == 0) { 7241 tbl.fastaset = TRUE; 7242 tbl.whichclass = 4; 7243 } else if (StringICmp (ptr, "d") == 0) { 7244 tbl.deltaset = TRUE; 7245 } else if (StringICmp (ptr, "di") == 0) { 7246 tbl.deltaset = TRUE; 7247 tbl.implicitgaps = TRUE; 7248 } else if (StringICmp (ptr, "l") == 0) { 7249 tbl.alignset = TRUE; 7250 } else if (StringICmp (ptr, "z") == 0) { 7251 tbl.gapped = TRUE; 7252 } else if (StringICmp (ptr, "e") == 0) { 7253 tbl.phrapace = TRUE; 7254 } 7255 7256 tbl.genprodset = (Boolean) myargs [g_argGenProdSet].intvalue; 7257 ptr = myargs [F_argFeatIdLinks].strvalue; 7258 if (StringICmp (ptr, "o") == 0) { 7259 tbl.linkbyoverlap = TRUE; 7260 } else if (StringICmp (ptr, "p") == 0) { 7261 tbl.linkbyproduct = TRUE; 7262 } 7263 tbl.forcelocalid = (Boolean) myargs [L_argLocalID].intvalue; 7264 tbl.gpstonps = (Boolean) myargs [u_argUndoGPS].intvalue; 7265 tbl.gnltonote = (Boolean) myargs [h_argGnlToNote].intvalue; 7266 tbl.accn = (CharPtr) myargs [A_argAccession].strvalue; 7267 tbl.center = (CharPtr) myargs [C_argCenter].strvalue; 7268 tbl.organism = (CharPtr) myargs [n_argOrgName].strvalue; 7269 tbl.srcquals = (CharPtr) myargs [j_argSrcQuals].strvalue; 7270 tbl.comment = (CharPtr) myargs [y_argComment].strvalue; 7271 tbl.commentFile = ReadCommentFile ((CharPtr) myargs [Y_argCommentFile].strvalue); 7272 7273 ptr = myargs [k_argCdsFlags].strvalue; 7274 if (StringChr (ptr, 'c') != NULL) { 7275 tbl.findorf = TRUE; 7276 } 7277 if (StringChr (ptr, 'r') != NULL) { 7278 tbl.runonorf = TRUE; 7279 tbl.findorf = TRUE; 7280 } 7281 if (StringChr (ptr, 'm') != NULL) { 7282 tbl.altstart = TRUE; 7283 } 7284 if (StringChr (ptr, 'k') != NULL) { 7285 tbl.conflict = TRUE; 7286 } 7287 /* 7288 if (!tbl.findorf && tbl.runonorf) { 7289 Message (MSG_FATAL, "-k r cannot be used without -k c"); 7290 return 1; 7291 } 7292 */ 7293 7294 /* process obsolete validate/flatfile arguments first, warn if 
used */ 7295 7296 tbl.validate = (Boolean) myargs [v_argValidate].intvalue; 7297 if (tbl.validate) { 7298 Message (MSG_POST, "-v is obsolete, use -V v instead"); 7299 } 7300 tbl.flatfile = (Boolean) myargs [b_argGenBank].intvalue; 7301 if (tbl.flatfile) { 7302 Message (MSG_POST, "-b is obsolete, use -V b instead"); 7303 } 7304 7305 ptr = myargs [V_argVerify].strvalue; 7306 if (StringChr (ptr, 'v') != NULL) { 7307 tbl.validate = TRUE; 7308 } 7309 if (StringChr (ptr, 'r') != NULL) { 7310 tbl.validate = TRUE; 7311 tbl.relaxed = TRUE; 7312 } 7313 if (StringChr (ptr, 'b') != NULL) { 7314 tbl.flatfile = TRUE; 7315 } 7316 if (StringChr (ptr, 'g') != NULL) { 7317 tbl.genereport = TRUE; 7318 } 7319 if (StringChr (ptr, 'c') != NULL) { 7320 tbl.validate_barcode = TRUE; 7321 } 7322 7323 7324 tbl.seqidfromfile = (Boolean) myargs [q_argFileID].intvalue; 7325 tbl.smartfeats = (Boolean) myargs [S_argSmartFeats].intvalue; 7326 tbl.smarttitle = (Boolean) myargs [Q_argSmartTitle].intvalue; 7327 tbl.removeunnecxref = (Boolean) myargs [U_argUnnecXref].intvalue; 7328 tbl.dotaxlookup = (Boolean) myargs [T_argTaxLookup].intvalue; 7329 tbl.dopublookup = (Boolean) myargs [P_argPubLookup].intvalue; 7330 tbl.logtoterminal = (Boolean) myargs [W_argLogProgress].intvalue; 7331 7332 tbl.save_bioseq_set = (Boolean) myargs [K_argBioseqSet].intvalue; 7333 7334 disc_rep_file = (CharPtr) myargs [Z_argDiscRepFile].strvalue; 7335 if (StringHasNoText (disc_rep_file)) { 7336 tbl.global_report = NULL; 7337 } else { 7338 tbl.global_report = GlobalDiscrepReportNew(); 7339 tbl.global_report->test_config = DiscrepancyConfigNew (); 7340 DisableTRNATests (tbl.global_report->test_config); 7341 ConfigureForGenomes (tbl.global_report->test_config); 7342 tbl.global_report->taxlookup = taxlookup; 7343 tbl.global_report->output_config->summary_report = FALSE; 7344 tbl.global_report->output_config->expand_report_categories[DISC_SUPERFLUOUS_GENE] = TRUE; 7345 
tbl.global_report->output_config->expand_report_categories[DISC_RNA_CDS_OVERLAP] = TRUE; 7346 tbl.global_report->output_config->expand_report_categories[DISC_SUSPECT_PRODUCT_NAME] = TRUE; 7347 tbl.global_report->output_config->expand_report_categories[DISC_OVERLAPPING_CDS] = TRUE; 7348 } 7349 7350 7351 /* arguments for alignment reading, e.g., "p,-,-,-,?,." */ 7352 7353 gapstring [0] = '\0'; 7354 ptr = (CharPtr) myargs [G_argGapFields].strvalue; 7355 StringNCpy_0 (gapstring, ptr, sizeof (gapstring)); 7356 7357 ptr = gapstring; 7358 tmp = ParseCommaField (&ptr); 7359 if (tmp != NULL) { 7360 if (StringChr (tmp, 'p') != NULL) { 7361 tbl.aln_is_protein = TRUE; 7362 } else if (StringChr (tmp, 'n') == NULL) { 7363 Message (MSG_FATAL, "-G must start with p for Protein or n for Nucleotide"); 7364 return 1; 7365 } 7366 } 7367 tbl.aln_beginning_gap = ParseCommaField (&ptr); 7368 tbl.aln_middle_gap = ParseCommaField (&ptr); 7369 tbl.aln_end_gap = ParseCommaField (&ptr); 7370 tbl.aln_missing = ParseCommaField (&ptr); 7371 tbl.aln_match = ParseCommaField (&ptr); 7372 7373 if (StringHasNoText (tbl.accn)) { 7374 tbl.accn = NULL; 7375 } 7376 if (StringHasNoText (tbl.organism)) { 7377 tbl.organism = NULL; 7378 } 7379 if (StringHasNoText (tbl.srcquals)) { 7380 tbl.srcquals = NULL; 7381 } 7382 if (StringHasNoText (tbl.comment)) { 7383 tbl.comment = NULL; 7384 } 7385 if (StringHasNoText (tbl.commentFile)) { 7386 tbl.commentFile = NULL; 7387 } 7388 7389 if (tbl.fastaset && 7390 (tbl.deltaset || tbl.phrapace || tbl.genprodset || 7391 tbl.alignset || tbl.gapped)) { 7392 Message (MSG_FATAL, "-s cannot be used with -d, -e, -g, -l or -z"); 7393 return 1; 7394 } 7395 7396 if (! 
tbl.alignset && (StringDoesHaveText (tbl.aln_beginning_gap) 7397 || StringDoesHaveText (tbl.aln_end_gap) 7398 || StringDoesHaveText (tbl.aln_middle_gap) 7399 || StringDoesHaveText (tbl.aln_missing) 7400 || StringDoesHaveText (tbl.aln_match) 7401 || tbl.aln_is_protein)) { 7402 Message (MSG_FATAL, "-G can only be used with -a l"); 7403 return 1; 7404 } 7405 7406 /* arguments for cleanup */ 7407 MemSet (&(tbl.cleanup_args), 0, sizeof (CleanupArgsData)); 7408 ptr = (CharPtr) myargs [c_argCleanupOptions].strvalue; 7409 if (StringChr (ptr, 'd') != NULL) { 7410 if (StringChr (ptr, 'D') != NULL) { 7411 Message (MSG_FATAL, "Cannot use both d and D options for cleanup. Choose one."); 7412 return 1; 7413 } 7414 tbl.cleanup_args.collection_dates = TRUE; 7415 tbl.cleanup_args.collection_dates_month_first = TRUE; 7416 } else if (StringChr (ptr, 'D') != NULL) { 7417 tbl.cleanup_args.collection_dates = TRUE; 7418 tbl.cleanup_args.collection_dates_month_first = FALSE; 7419 } 7420 7421 if (StringChr (ptr, 'b') != NULL) { 7422 tbl.cleanup_args.add_notes_to_overlapping_cds_without_abc = TRUE; 7423 } 7424 7425 if (StringHasNoText (base) && (StringDoesHaveText (tbl.accn))) { 7426 Message (MSG_FATAL, "Accession can be entered only for a single record"); 7427 return 1; 7428 } 7429 7430 /* Seq-submit or Submit-block template is optional */ 7431 7432 if (StringDoesHaveText (tmplate)) { 7433 if (TemplateOverwriteRisk (tmplate, base, directory, suffix)) { 7434 if (Message (MSG_YN, overwriteMsg) == ANS_NO) return 0; 7435 } 7436 fp = FileOpen (tmplate, "r"); 7437 if (fp != NULL) { 7438 while ((dataptr = ReadAsnFastaOrFlatFile (fp, &datatype, NULL, FALSE, FALSE, TRUE, FALSE)) != NULL) { 7439 if (datatype == OBJ_SEQSUB) { 7440 ssp = (SeqSubmitPtr) dataptr; 7441 } else if (datatype == OBJ_SUBMIT_BLOCK) { 7442 sbp = (SubmitBlockPtr) dataptr; 7443 } else if (datatype == OBJ_SEQDESC) { 7444 ValNodeLink (&sdphead, (SeqDescrPtr) dataptr); 7445 } else { 7446 ObjMgrFree (datatype, dataptr); 7447 } 7448 } 
7449 FileClose (fp); 7450 } 7451 7452 if (ssp != NULL && sbp == NULL) { 7453 sbp = ssp->sub; 7454 } 7455 if (sbp == NULL) { 7456 Message (MSG_FATAL, "Unable to read required template file"); 7457 return 1; 7458 } 7459 7460 if (sbp != NULL) { 7461 if (ssp != NULL) { 7462 7463 /* copy submit block, will free SeqSubmit before processing */ 7464 7465 sbp = AsnIoMemCopy ((Pointer) sbp, 7466 (AsnReadFunc) SubmitBlockAsnRead, 7467 (AsnWriteFunc) SubmitBlockAsnWrite); 7468 } 7469 sbp->tool = MemFree (sbp->tool); 7470 os = GetOpSysString (); 7471 if (os != NULL) { 7472 sprintf (str, "tbl2asn %s - %s", TBL2ASN_APPLICATION, os); 7473 } else { 7474 sprintf (str, "tbl2asn %s", TBL2ASN_APPLICATION); 7475 } 7476 sbp->tool = StringSave (str); 7477 MemFree (os); 7478 sbp->hup = FALSE; 7479 sbp->reldate = DateFree (sbp->reldate); 7480 if (StringDoesHaveText (hold)) { 7481 if (StringICmp (hold, "y") == 0) { 7482 sbp->hup = TRUE; 7483 dp = DateCurr (); 7484 sbp->reldate = dp; 7485 if (dp != NULL) { 7486 if (dp->data [0] == 1) { 7487 (dp->data [1])++; 7488 } 7489 } 7490 } else { 7491 dp = DateParse (hold); 7492 if (dp != NULL) { 7493 sbp->hup = TRUE; 7494 sbp->reldate = dp; 7495 } 7496 } 7497 } 7498 csp = sbp->cit; 7499 if (csp != NULL) { 7500 csp->date = DateFree (csp->date); 7501 csp->date = DateCurr (); 7502 MemSet ((Pointer) &pd, 0, sizeof (Pubdesc)); 7503 MemSet ((Pointer) &pb, 0, sizeof (ValNode)); 7504 pb.choice = PUB_Sub; 7505 pb.data.ptrvalue = (Pointer) csp; 7506 pd.pub = &pb; 7507 pdp = &pd; 7508 } 7509 } 7510 if (ssp != NULL && ssp->datatype == 1) { 7511 sep = (SeqEntryPtr) ssp->data; 7512 if (sep != NULL) { 7513 VisitBioSourcesInSep (sep, (Pointer) &src, GetFirstBiop); 7514 if (src != NULL) { 7515 7516 /* copy top biosource */ 7517 7518 src = AsnIoMemCopy ((Pointer) src, 7519 (AsnReadFunc) BioSourceAsnRead, 7520 (AsnWriteFunc) BioSourceAsnWrite); 7521 } 7522 } 7523 7524 /* in case template has colliding ID, free it now */ 7525 7526 SeqSubmitFree (ssp); 7527 } 7528 } 7529 
7530 if (StringDoesHaveText (descrs)) { 7531 if (TemplateOverwriteRisk (descrs, base, directory, suffix)) { 7532 if (Message (MSG_YN, overwriteMsg) == ANS_NO) return 0; 7533 } 7534 fp = FileOpen (descrs, "r"); 7535 if (fp != NULL) { 7536 while ((dataptr = ReadAsnFastaOrFlatFile (fp, &datatype, NULL, FALSE, FALSE, TRUE, FALSE)) != NULL) { 7537 if (datatype == OBJ_SEQDESC) { 7538 ValNodeLink (&sdphead, (SeqDescrPtr) dataptr); 7539 } else { 7540 ObjMgrFree (datatype, dataptr); 7541 } 7542 } 7543 FileClose (fp); 7544 } 7545 } 7546 7547 gotags = TextFsaNew (); 7548 TextFsaAdd (gotags, "go_component"); 7549 TextFsaAdd (gotags, "go_function"); 7550 TextFsaAdd (gotags, "go_process"); 7551 7552 /* register fetch functions */ 7553 7554 if (remote) { 7555 #ifdef INTERNAL_NCBI_TBL2ASN 7556 if (! PUBSEQBioseqFetchEnable ("tbl2asn", FALSE)) { 7557 Message (MSG_POSTERR, "PUBSEQBioseqFetchEnable failed"); 7558 return 1; 7559 } 7560 #else 7561 PubSeqFetchEnable (); 7562 #endif 7563 } 7564 7565 if (remote || tbl.dopublookup) { 7566 PubMedFetchEnable (); 7567 } 7568 7569 /* process one or more records */ 7570 7571 if (StringDoesHaveText (outfile) && StringHasNoText (base)) { 7572 aip = AsnIoOpen (outfile, "w"); 7573 if (aip == NULL) { 7574 Message (MSG_FATAL, "Unable to open single output file"); 7575 return 1; 7576 } 7577 ssp_atp = DoFirstPrefix (aip, sbp); 7578 bssp_atp = DoSecondPrefix (aip, &tbl); 7579 } 7580 7581 if (StringDoesHaveText (base)) { 7582 ptr = StringRChr (base, '.'); 7583 sfx[0] = '\0'; 7584 if (ptr != NULL) { 7585 StringNCpy_0 (sfx, ptr, sizeof (sfx)); 7586 *ptr = '\0'; 7587 } 7588 tbl.tableFile = (CharPtr) myargs [f_argTableFile].strvalue; 7589 ProcessOneRecord (sbp, pdp, src, directory, results, base, sfx, sdphead, &tbl, gotags, aip, outfile); 7590 7591 } else { 7592 7593 FileRecurse (sbp, pdp, src, directory, results, suffix, recurse, sdphead, &tbl, gotags, aip, NULL); 7594 } 7595 7596 if (aip != NULL) { 7597 DoSecondSuffix (aip, bssp_atp); 7598 DoFirstSuffix 
(aip, ssp_atp); 7599 AsnIoClose (aip); 7600 } 7601 7602 if (tbl.global_report != NULL) { 7603 fp = FileOpen (disc_rep_file, "w"); 7604 WriteGlobalDiscrepancyReport (tbl.global_report, fp); 7605 FileClose (fp); 7606 tbl.global_report = GlobalDiscrepReportFree (tbl.global_report); 7607 } 7608 7609 if (sbp != NULL) { 7610 SubmitBlockFree (sbp); 7611 } 7612 if (src != NULL) { 7613 BioSourceFree (src); 7614 } 7615 7616 SeqDescrFree (sdphead); 7617 7618 TransTableFreeAll (); 7619 7620 ECNumberFSAFreeAll (); 7621 7622 TextFsaFree (gotags); 7623 7624 /* close fetch functions */ 7625 7626 if (remote || tbl.dopublookup) { 7627 PubMedFetchDisable (); 7628 } 7629 7630 if (remote) { 7631 #ifdef INTERNAL_NCBI_TBL2ASN 7632 PUBSEQBioseqFetchDisable (); 7633 #else 7634 PubSeqFetchDisable (); 7635 #endif 7636 } 7637 7638 return 0; 7639 } 7640 7641
This page was automatically generated by the
LXR engine.
Visit the LXR main site for more information.