NCBI C Toolkit Cross Reference

C/demo/tbl2asn.c


  1 /*   tbl2asn.c
  2 * ===========================================================================
  3 *
  4 *                            PUBLIC DOMAIN NOTICE
  5 *            National Center for Biotechnology Information (NCBI)
  6 *
  7 *  This software/database is a "United States Government Work" under the
  8 *  terms of the United States Copyright Act.  It was written as part of
  9 *  the author's official duties as a United States Government employee and
 10 *  thus cannot be copyrighted.  This software/database is freely available
 11 *  to the public for use. The National Library of Medicine and the U.S.
 12 *  Government do not place any restriction on its use or reproduction.
 13 *  We would, however, appreciate having the NCBI and the author cited in
 14 *  any work or product based on this material
 15 *
 16 *  Although all reasonable efforts have been taken to ensure the accuracy
 17 *  and reliability of the software and data, the NLM and the U.S.
 18 *  Government do not and cannot warrant the performance or results that
 19 *  may be obtained by using this software or data. The NLM and the U.S.
 20 *  Government disclaim all warranties, express or implied, including
 21 *  warranties of performance, merchantability or fitness for any particular
 22 *  purpose.
 23 *
 24 * ===========================================================================
 25 *
 26 * File Name:  tbl2asn.c
 27 *
 28 * Author:  Jonathan Kans
 29 *
 30 * Version Creation Date:   5/5/00
 31 *
 32 * $Revision: 6.277 $
 33 *
 34 * File Description:
 35 *
 36 * Modifications:
 37 * --------------------------------------------------------------------------
 38 * Date     Name        Description of modification
 39 * -------  ----------  -----------------------------------------------------
 40 *
 41 *
 42 * ==========================================================================
 43 */
 44 
 45 #include <ncbi.h>
 46 #include <objall.h>
 47 #include <objsset.h>
 48 #include <objsub.h>
 49 #include <objfdef.h>
 50 #include <sequtil.h>
 51 #include <edutil.h>
 52 #include <seqport.h>
 53 #include <gather.h>
 54 #include <sqnutils.h>
 55 #include <subutil.h>
 56 #include <toasn3.h>
 57 #include <valid.h>
 58 #include <asn2gnbk.h>
 59 #include <explore.h>
 60 #include <tofasta.h>
 61 #include <simple.h>
 62 #include <suggslp.h>
 63 #include <aliparse.h>
 64 #include <util/creaders/alnread.h>
 65 #include <pmfapi.h>
 66 #include <tax3api.h>
 67 #ifdef INTERNAL_NCBI_TBL2ASN
 68 #include <accpubseq.h>
 69 #endif
 70 #define NLM_GENERATED_CODE_PROTO
 71 #include <asnmacro.h>
 72 #include <objmacro.h>
 73 #include <macroapi.h>
 74 
/* Version string of this tbl2asn build. */
#define TBL2ASN_APP_VER "13.2"

/*
 * Non-static global initialised to the version string above.  It is not
 * referenced in the visible part of this file; presumably other toolkit
 * code reads it -- confirm before renaming or removing.
 */
CharPtr TBL2ASN_APPLICATION = TBL2ASN_APP_VER;
 78 
/*
 * Optional cleanup switches, embedded in TblArgs as cleanup_args.
 * NOTE(review): the code that reads these flags is outside the visible
 * part of the file; the meanings suggested by the field names (date
 * reformatting, notes on overlapping CDS features) should be confirmed
 * against that code.
 */
typedef struct cleanupargs {
  Boolean collection_dates;
  Boolean collection_dates_month_first;
  Boolean add_notes_to_overlapping_cds_without_abc;
} CleanupArgsData, PNTR CleanupArgsPtr;
 84 
/*
 * Collected settings for one tbl2asn run.
 *
 * NOTE(review): only the declaration is visible here; the argument parsing
 * that fills these fields and the processing code that reads them live
 * elsewhere in the file.  The field names suggest groups for input
 * interpretation (raw2delt .. implicitgaps), identifier handling, source
 * and comment text, ORF finding (findorf, runonorf, altstart), output
 * selection (validate, flatfile, genereport), and alignment-reader
 * characters (aln_*) -- confirm against the code that uses them.
 */
typedef struct tblargs {
  Boolean     raw2delt;
  Int2        r2dmin;
  Boolean     r2dunk100;
  Boolean     fastaset;
  Int2        whichclass;
  Boolean     deltaset;
  Boolean     alignset;
  Boolean     gapped;
  Boolean     phrapace;
  Boolean     genprodset;
  Boolean     linkbyoverlap;
  Boolean     linkbyproduct;
  Boolean     implicitgaps;
  Boolean     forcelocalid;
  Boolean     gpstonps;
  Boolean     gnltonote;
  Boolean     removeunnecxref;
  Boolean     dotaxlookup;
  Boolean     dopublookup;
  CharPtr     accn;
  CharPtr     center;
  CharPtr     organism;
  CharPtr     srcquals;
  CharPtr     comment;
  CharPtr     commentFile;
  CharPtr     tableFile;
  Boolean     findorf;
  Boolean     runonorf;
  Boolean     altstart;
  Boolean     conflict;
  Boolean     validate;
  Boolean     relaxed;
  Boolean     validate_barcode;
  Boolean     flatfile;
  Boolean     genereport;
  Boolean     seqidfromfile;
  Boolean     smartfeats;
  Boolean     smarttitle;
  Boolean     logtoterminal;
  CharPtr     aln_beginning_gap;
  CharPtr     aln_end_gap;
  CharPtr     aln_middle_gap;
  CharPtr     aln_missing;
  CharPtr     aln_match;
  Boolean     aln_is_protein;
  Boolean     save_bioseq_set;

  GlobalDiscrepReportPtr global_report;

  /* cleanup switches, see CleanupArgsData */
  CleanupArgsData cleanup_args;
} TblArgs, PNTR TblArgsPtr;
137 
138 static FILE* OpenOneFile (
139   CharPtr directory,
140   CharPtr base,
141   CharPtr suffix
142 )
143 
144 {
145   Char  file [FILENAME_MAX], path [PATH_MAX];
146 
147   if (base == NULL) {
148     base = "";
149   }
150   if (suffix == NULL) {
151     suffix = "";
152   }
153 
154   StringNCpy_0 (path, directory, sizeof (path));
155   sprintf (file, "%s%s", base, suffix);
156   FileBuildPath (path, NULL, file);
157 
158   return FileOpen (path, "r");
159 }
160 
161 static void WriteOneFile (
162   CharPtr results,
163   CharPtr base,
164   CharPtr suffix,
165   CharPtr outfile,
166   SeqEntryPtr sep,
167   SubmitBlockPtr sbp,
168   Boolean save_bioseq_set
169 )
170 
171 {
172   AsnIoPtr      aip;
173   BioseqSetPtr  bssp;
174   Char          file [FILENAME_MAX], path [PATH_MAX];
175   SeqSubmit     ssb;
176 
177   if (sep == NULL || sep->data.ptrvalue == NULL) return;
178 
179   MemSet ((Pointer) &ssb, 0, sizeof (SeqSubmit));
180   ssb.sub = sbp;
181   ssb.datatype = 1;
182   ssb.data = (Pointer) sep;
183 
184   if (StringDoesHaveText (outfile)) {
185     StringNCpy_0 (path, outfile, sizeof (path));
186   } else {
187     StringNCpy_0 (path, results, sizeof (path));
188     sprintf (file, "%s%s", base, suffix);
189     FileBuildPath (path, NULL, file);
190   }
191 
192   aip = AsnIoOpen (path, "w");
193   if (aip == NULL) return;
194 
195   if (sbp != NULL) {
196     SeqSubmitAsnWrite (&ssb, aip, NULL);
197   } else if (save_bioseq_set && IS_Bioseq_set (sep)) {
198     bssp = (BioseqSetPtr) sep->data.ptrvalue;
199     BioseqSetAsnWrite (bssp, aip, NULL);
200   } else {
201     SeqEntryAsnWrite (sep, aip, NULL);
202   }
203 
204   AsnIoFlush (aip);
205   AsnIoClose (aip);
206 }
207 
/*
 * Line prefixes written by ValidCallback, indexed by the ErrSev value it
 * receives (ValidCallback clamps the index to SEV_NONE..SEV_MAX first).
 * The final NULL entry terminates the table.
 */
static CharPtr compatSeverityLabel [] = {
  "NONE", "NOTE: valid", "WARNING: valid", "ERROR: valid", "REJECT: valid", "FATAL: valid", "MAX", NULL
};
211 
212 static void LIBCALLBACK ValidCallback (
213   ErrSev severity,
214   int errcode,
215   int subcode,
216   Uint2 entityID,
217   Uint2 itemtype,
218   Uint4 itemID,
219   CharPtr accession,
220   CharPtr message,
221   CharPtr objtype,
222   CharPtr label,
223   CharPtr context,
224   CharPtr location,
225   CharPtr product,
226   Pointer userdata
227 )
228 
229 {
230   CharPtr  catname, errname;
231   FILE     *fp;
232 
233   fp = (FILE *) userdata;
234   if (fp == NULL) return;
235 
236   if (severity < SEV_NONE || severity > SEV_MAX) {
237     severity = SEV_MAX;
238   }
239 
240   catname = GetValidCategoryName (errcode);
241   errname = GetValidErrorName (errcode, subcode);
242 
243   if (catname == NULL) {
244     catname = "?";
245   }
246   if (errname == NULL) {
247     errname = "?";
248   }
249 
250   if (accession == NULL) {
251     accession = "";
252   }
253   if (message == NULL) {
254     message = "";
255   }
256   if (objtype == NULL) {
257     objtype = "";
258   }
259   if (label == NULL) {
260     label = "";
261   }
262 
263   fprintf (fp, "%s [%s.%s] %s %s: %s",
264            compatSeverityLabel [severity],
265            catname, errname, message, objtype, label);
266   if (location != NULL) {
267     fprintf (fp, " %s", location);
268   }
269   if (context != NULL) {
270     fprintf (fp, " %s", context);
271   }
272   if (product != NULL) {
273     fprintf (fp, " -> %s", product);
274   }
275   fprintf (fp, "\n");
276 }
277 
278 
279 static void ValidateOneFile (
280   CharPtr results,
281   CharPtr base,
282   CharPtr suffix,
283   SeqEntryPtr sep,
284   Boolean standard,
285   Boolean relaxed,
286   Boolean barcode
287 )
288 
289 {
290   Char            file [FILENAME_MAX], path [PATH_MAX];
291   FILE            *ofp;
292   ErrSev          oldErrSev;
293   ValidStructPtr  vsp;
294 
295   StringNCpy_0 (path, results, sizeof (path));
296   sprintf (file, "%s%s", base, suffix);
297   FileBuildPath (path, NULL, file);
298 
299   ofp = FileOpen (path, "w");
300 
301   if (standard) {
302     vsp = ValidStructNew ();
303     if (vsp != NULL) {
304       vsp->useSeqMgrIndexes = TRUE;
305       vsp->suppressContext = TRUE;
306       vsp->seqSubmitParent = TRUE;
307       if (! relaxed) {
308         vsp->testLatLonSubregion = TRUE;
309       }
310       oldErrSev = ErrSetMessageLevel (SEV_NONE);
311       vsp->errfunc = ValidCallback;
312       vsp->userdata = (Pointer) ofp;
313       /* vsp->convertGiToAccn = FALSE; */
314       ValidateSeqEntry (sep, vsp);
315       ValidStructFree (vsp);
316       ErrSetMessageLevel (oldErrSev);
317     }
318   }
319   /* Barcode results if requested */
320   if (barcode) {
321     BarcodeValidateOneSeqEntry (ofp, sep, TRUE, FALSE, TRUE, NULL);
322   }
323 
324   FileClose (ofp);
325 }
326 
327 static void FlatfileOneFile (
328   CharPtr results,
329   CharPtr base,
330   CharPtr suffix,
331   SeqEntryPtr sep
332 )
333 
334 {
335   Char    file [FILENAME_MAX], path [PATH_MAX];
336   FILE    *fp;
337   ErrSev  oldErrSev;
338 
339   StringNCpy_0 (path, results, sizeof (path));
340   sprintf (file, "%s%s", base, suffix);
341   FileBuildPath (path, NULL, file);
342 
343   fp = FileOpen (path, "w");
344   if (fp == NULL) return;
345 
346   oldErrSev = ErrSetMessageLevel (SEV_MAX);
347   SeqEntryToGnbk (sep, NULL, GENBANK_FMT, ENTREZ_MODE, NORMAL_STYLE, 0, 0, 0, NULL, fp);
348   ErrSetMessageLevel (oldErrSev);
349 
350   FileClose (fp);
351 }
352 
353 /* for full-length cDNAs, allow automatic annotation of largest internal ORF */
354 
/*
 * Working state for the ORF search done by LookForOrfs / AnnotateBestOrf.
 * Every array has one slot per reading frame: LookForOrfs uses slots 0-2
 * (index = frame) for the plus strand and slots 3-5 (index = frame + 3)
 * for the other strand.
 *
 *   curlen      length, in callback steps, of the ORF currently being tracked
 *   bestlen     longest length recorded so far for the frame
 *   currstart   start position associated with the current ORF
 *   beststart   start position that goes with bestlen
 *   sublen      bottom-strand only: steps counted since the last stop
 *   inorf       top-strand only: TRUE while inside an open ORF
 *   altstart    accept alternative start codons as ORF starts
 *   runonorf    treat the sequence end like a stop codon (see TreatLikeStop)
 *   bioseq_len  length of the sequence being scanned (set from bsp->length)
 */
typedef struct orfdata {
  Int4     curlen [6], bestlen [6], currstart [6], beststart [6], sublen [6];
  Boolean  inorf [6], altstart, runonorf;
  Int4     bioseq_len;
} OrfData, PNTR OrfDataPtr;
360 
361 static Boolean TreatLikeStop (Int2 frame, Int4 pos, Uint1 strand, Int4 len)
362 {
363   Int4 remainder = len % 3;
364   Boolean like_stop = FALSE;
365 
366   if (strand == Seq_strand_minus) {
367     if (pos < 3) {
368       like_stop = TRUE;
369     }
370   } else {
371     if (pos >= len - remainder - 3) {
372       like_stop = TRUE;
373     }
374   }
375   return like_stop;
376 }
377 
/*
 * LookForOrfs
 *
 * Callback handed to TransTableProcessBioseq by AnnotateBestOrf.  Each call
 * reports, for one position / frame / strand, whether a start codon
 * (atgStart), an alternative start codon (altStart) or a stop codon
 * (orfStop) was seen.  userdata points to the OrfData being filled in.
 * The residue argument is not used.
 *
 * The function keeps, per frame and strand, the longest ORF seen so far
 * (bestlen / beststart).  Alternative starts count only when
 * odp->altstart is set; when odp->runonorf is set, positions accepted by
 * TreatLikeStop are handled as the end of the sequence.
 */
static void LIBCALLBACK LookForOrfs (
  Int4 position,
  Char residue,
  Boolean atgStart,
  Boolean altStart,
  Boolean orfStop,
  Int2 frame,
  Uint1 strand,
  Pointer userdata
)

{
  Int2        idx;
  OrfDataPtr  odp;
  Boolean     start_of_seq = FALSE;

  odp = (OrfDataPtr) userdata;
  if (strand == Seq_strand_plus) {

    /* top strand */

    /* slots 0-2: an ORF opens at a start codon and closes at a stop */
    idx = frame;
    if (odp->inorf [idx]) {
      if (!orfStop && odp->runonorf) {
        /* treat the end of the sequence like a stop codon */
        if (TreatLikeStop(frame, position, strand, odp->bioseq_len)) {
          /* the final step is counted before the ORF is closed */
          (odp->curlen[idx])++;
          orfStop = TRUE;
        }
      }

      if (orfStop) {
        /* close the ORF and remember it if it is the longest in this frame */
        odp->inorf [idx] = FALSE;
        if (odp->curlen [idx] > odp->bestlen [idx]) {
          odp->bestlen [idx] = odp->curlen [idx];
          odp->beststart [idx] = odp->currstart [idx];
        }
      } else {
        (odp->curlen [idx])++;
      }
    } else if (atgStart || (altStart && odp->altstart)) {
      /* open a new ORF; starts seen while already inside an ORF are ignored above */
      odp->inorf [idx] = TRUE;
      odp->curlen [idx] = 1;
      odp->currstart [idx] = position - frame;
    }
  } else {

    /* bottom strand */

    /* slots 3-5; this branch is taken for every strand value other than plus */
    idx = frame + 3;

    if (!orfStop && odp->runonorf) {
      start_of_seq = TreatLikeStop (frame, position, strand, odp->bioseq_len);
    }

    if (orfStop) {
      /* a stop resets the counters and records where counting restarts */
      odp->curlen [idx] = 0;
      odp->sublen [idx] = 0;
      odp->currstart [idx] = position - frame;
    } else if (start_of_seq) {
      /* run-on case near position 0: restart counting at 1 as if a stop had preceded */
      odp->curlen [idx] = 1;
      odp->sublen [idx] = 1;
      odp->currstart [idx] = position - frame - 3;
      if (odp->curlen [idx] > odp->bestlen [idx]) {
        odp->bestlen [idx] = odp->curlen [idx];
        odp->beststart [idx] = odp->currstart [idx];
      }
    } else if (atgStart || (altStart && odp->altstart)) {
      /* every start codon extends the candidate ORF to everything counted since the last stop */
      (odp->sublen [idx])++;
      odp->curlen [idx] = odp->sublen [idx];
      if (odp->curlen [idx] > odp->bestlen [idx]) {
        odp->bestlen [idx] = odp->curlen [idx];
        odp->beststart [idx] = odp->currstart [idx];
      }
    } else {
      /* ordinary step: counted, but not yet part of a confirmed ORF */
      (odp->sublen [idx])++;
    }
  }
}
457 
458 static SeqFeatPtr AnnotateBestOrf (
459   BioseqPtr bsp,
460   Int2 genCode,
461   Boolean altstart,
462   Boolean runonorf,
463   SqnTagPtr stp
464 )
465 
466 {
467   SeqFeatPtr      cds = NULL;
468   CdRegionPtr     crp;
469   GeneRefPtr      grp;
470   Int2            i, best, idx;
471   OrfData         od;
472   ProtRefPtr      prp;
473   SeqFeatPtr      sfp;
474   SeqInt          sint;
475   CharPtr         str;
476   TransTablePtr   ttp;
477   ValNode         vn;
478   SeqFeatXrefPtr  xref;
479   Boolean         partial5 = FALSE, partial3 = FALSE;
480 
481   if (bsp == NULL) return NULL;
482   for (i = 0; i < 6; i++) {
483     od.curlen [i] = INT4_MIN;
484     od.bestlen [i] = 0;
485     od.currstart [i] = 0;
486     od.beststart [i] = 0;
487     od.sublen [i] = INT4_MIN;
488     od.inorf [i] = FALSE;
489   }
490   od.altstart = altstart;
491   od.runonorf = runonorf;
492   od.bioseq_len = bsp->length;
493 
494   /* use simultaneous 6-frame translation finite state machine */
495 
496   ttp = PersistentTransTableByGenCode (genCode);
497   if (ttp != NULL) {
498     TransTableProcessBioseq (ttp, LookForOrfs, (Pointer) &od, bsp);
499   }
500   /* TransTableFree (tbl); - now using persistent tables, free at end */
501   best = -1;
502   idx = -1;
503   for (i = 0; i < 6; i++) {
504     if (od.bestlen [i] > best) {
505       best = od.bestlen [i];
506       idx = i;
507     }
508   }
509   if (idx == -1) return NULL;
510 
511   /* make feature location on largest ORF */
512 
513   if (idx < 3) {
514     MemSet ((Pointer) &sint, 0, sizeof (SeqInt));
515     sint.from = od.beststart [idx] + idx;
516     sint.to = sint.from + (od.bestlen [idx]) * 3 + 2;
517     if (sint.to > od.bioseq_len - 1) {
518       sint.to = od.bioseq_len - 1;
519       partial3 = TRUE;
520     }
521     sint.id = SeqIdFindBest (bsp->id, 0);
522     sint.strand = Seq_strand_plus;
523     vn.choice = SEQLOC_INT;
524     vn.extended = 0;
525     vn.data.ptrvalue = (Pointer) &sint;
526     vn.next = NULL;
527   } else {
528     MemSet ((Pointer) &sint, 0, sizeof (SeqInt));
529     sint.from = od.beststart [idx] + idx - 3;
530     sint.to = sint.from + (od.bestlen [idx]) * 3 + 2;
531     if (sint.from < 0) {
532       sint.from = 0;
533       partial3 = TRUE;
534     }
535     sint.id = SeqIdFindBest (bsp->id, 0);
536     sint.strand = Seq_strand_minus;
537     vn.choice = SEQLOC_INT;
538     vn.extended = 0;
539     vn.data.ptrvalue = (Pointer) &sint;
540     vn.next = NULL;
541   }
542 
543   SetSeqLocPartial (&vn, partial5, partial3);
544 
545   /* make CDS feature with unknown product - now check [protein=...] */
546 
547   cds = CreateNewFeatureOnBioseq (bsp, SEQFEAT_CDREGION, &vn);
548   if (cds == NULL) return NULL;
549   if (partial5 || partial3) {
550     cds->partial = TRUE;
551   }
552   crp = CreateNewCdRgn (1, FALSE, genCode);
553   if (crp == NULL) return NULL;
554   crp->frame = 1;
555   cds->data.value.ptrvalue = (Pointer) crp;
556 
557   prp = ProtRefNew ();
558   if (prp == NULL) return cds;
559   xref = SeqFeatXrefNew ();
560   if (xref == NULL) return cds;
561   xref->data.choice = SEQFEAT_PROT;
562   xref->data.value.ptrvalue = (Pointer) prp;
563   xref->next = cds->xref;
564   cds->xref = xref;
565   prp = ParseTitleIntoProtRef (stp, prp);
566   if (prp->name == NULL && prp->desc == NULL) {
567     prp->name = ValNodeCopyStr (NULL, 0, "unknown");
568   }
569 
570   /* parse CDS comment ("note" goes to biosource) and experimental evidence */
571 
572   str = SqnTagFind (stp, "comment");
573   if (StringDoesHaveText (str)) {
574     cds->comment = StringSave (str);
575   }
576 
577   str = SqnTagFind (stp, "evidence");
578   if (StringICmp (str, "experimental") == 0) {
579     cds->exp_ev = 1;
580   }
581 
582   /* now check [gene=...], make gene feature if locus or synonym present */
583 
584   grp = GeneRefNew ();
585   if (grp == NULL) return cds;
586   grp = ParseTitleIntoGeneRef (stp, grp);
587   if (grp->locus == NULL && grp->syn == NULL) {
588     GeneRefFree (grp);
589     return cds;
590   }
591   sfp = CreateNewFeatureOnBioseq (bsp, SEQFEAT_GENE, NULL);
592   if (sfp == NULL) return cds;
593   sfp->data.value.ptrvalue = (Pointer) grp;
594 
595   return cds;
596 }
597 
598 /* change all feature IDs to entered accession */
599 
600 static void PromoteSeqId (
601   SeqIdPtr sip,
602   Pointer userdata
603 )
604 
605 {
606   SeqIdPtr  bestid, newid, oldid;
607 
608   bestid = (SeqIdPtr) userdata;
609 
610   newid = SeqIdDup (bestid);
611   if (newid == NULL) return;
612 
613   oldid = ValNodeNew (NULL);
614   if (oldid == NULL) return;
615 
616   MemCopy (oldid, sip, sizeof (ValNode));
617   oldid->next = NULL;
618 
619   sip->choice = newid->choice;
620   sip->data.ptrvalue = newid->data.ptrvalue;
621 
622   SeqIdFree (oldid);
623   ValNodeFree (newid);
624 
625   SeqIdStripLocus (sip);
626 }
627 
/*
 * CorrectFeatureSeqIds
 *
 * Feature callback: applies PromoteSeqId to every Seq-id in the location
 * of sfp.  userdata is passed through unchanged and is read by
 * PromoteSeqId as the replacement Seq-id.  sfp is dereferenced without a
 * NULL check.
 */
static void CorrectFeatureSeqIds (
  SeqFeatPtr sfp,
  Pointer userdata
)

{
  VisitSeqIdsInSeqLoc (sfp->location, userdata, PromoteSeqId);
}
636 
/*
 * CorrectGraphSeqIds
 *
 * Graph callback: applies PromoteSeqId to every Seq-id visited in sgp.
 * userdata is passed through unchanged and is read by PromoteSeqId as the
 * replacement Seq-id.
 */
static void CorrectGraphSeqIds (
  SeqGraphPtr sgp,
  Pointer userdata
)

{
  VisitSeqIdsInSeqGraph (sgp, userdata, PromoteSeqId);
}
645 
646 /* source information for several common organisms sequenced by genome centers */
647 
/*
 * One row of the commonOrgStuff table.  AddMissingSourceInfo copies these
 * values into an organism reference whose taxname matches.
 *
 *   taxname   scientific name, used as the lookup key
 *   common    common name, "" when none is given
 *   lineage   taxonomic lineage string
 *   division  three-letter division code
 *   gcode     genetic code, copied to OrgName.gcode
 *   mgcode    second genetic code, copied to OrgName.mgcode
 *   taxID     value stored in the "taxon" db xref
 *
 * Note that the pointer typedef is spelled OrfStuffPtr (not OrgStuffPtr);
 * the name is used as-is elsewhere in this file.
 */
typedef struct orgstuff {
  CharPtr  taxname;
  CharPtr  common;
  CharPtr  lineage;
  CharPtr  division;
  Uint1    gcode;
  Uint1    mgcode;
  Int4     taxID;
} OrgStuff, PNTR OrfStuffPtr;
657 
/*
 * Built-in source data for frequently submitted organisms, searched by
 * AddMissingSourceInfo with a case-insensitive match on taxname.
 * Field order per row: taxname, common, lineage, division, gcode, mgcode,
 * taxID.  The row with a NULL taxname terminates the table and must stay
 * last.
 */
static OrgStuff commonOrgStuff [] = {
  {
    "Saccharomyces cerevisiae", "baker's yeast",
    "Eukaryota; Fungi; Ascomycota; Saccharomycetes; Saccharomycetales; Saccharomycetaceae; Saccharomyces",
    "PLN", 1, 3, 4932
  },
  {
    "Drosophila melanogaster", "fruit fly",
    "Eukaryota; Metazoa; Arthropoda; Tracheata; Hexapoda; Insecta; Pterygota; Neoptera; Endopterygota; Diptera; Brachycera; Muscomorpha; Ephydroidea; Drosophilidae; Drosophila",
    "INV", 1, 5, 7227
  },
  {
    "Homo sapiens", "human",
    "Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; Mammalia; Eutheria; Primates; Catarrhini; Hominidae; Homo",
    "PRI", 1, 2, 9606
  },
  {
    "Escherichia coli", "",
    "Bacteria; Proteobacteria; gamma subdivision; Enterobacteriaceae; Escherichia",
    "BCT", 11, 0, 562
  },
  {
    "Helicobacter pylori", "",
    "Bacteria; Proteobacteria; epsilon subdivision; Helicobacter group; Helicobacter",
    "BCT", 11, 0, 210
  },
  {
    "Arabidopsis thaliana", "thale cress",
    "Eukaryota; Viridiplantae; Embryophyta; Tracheophyta; Spermatophyta; Magnoliophyta; eudicotyledons; core eudicots; Rosidae; eurosids II; Brassicales; Brassicaceae; Arabidopsis",
    "PLN", 1, 1, 3702
  },
  {
    "Mus musculus", "house mouse",
    "Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; Mammalia; Eutheria; Rodentia; Sciurognathi; Muridae; Murinae; Mus",
    "ROD", 1, 2, 10090
  },
  {
    "Rattus norvegicus", "Norway rat",
    "Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; Mammalia; Eutheria; Rodentia; Sciurognathi; Muridae; Murinae; Rattus",
    "ROD", 1, 2, 10116
  },
  {
    "Danio rerio", "zebrafish",
    "Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; Actinopterygii; Neopterygii; Teleostei; Euteleostei; Ostariophysi; Cypriniformes; Cyprinidae; Rasborinae; Danio",
    "VRT", 1, 2, 7955
  },
  {
    "Zea mays", "",
    "Eukaryota; Viridiplantae; Embryophyta; Tracheophyta; Spermatophyta; Magnoliophyta; Liliopsida; Poales; Poaceae; Zea",
    "PLN", 1, 1, 4577
  },
  {
    "Caenorhabditis elegans", "",
    "Eukaryota; Metazoa; Nematoda; Chromadorea; Rhabditida; Rhabditoidea; Rhabditidae; Peloderinae; Caenorhabditis",
    "INV", 1, 5, 6239
  },
  {
    "Caenorhabditis briggsae", "",
    "Eukaryota; Metazoa; Nematoda; Chromadorea; Rhabditida; Rhabditoidea; Rhabditidae; Peloderinae; Caenorhabditis",
    "INV", 1, 5, 6238
  },
  {
    "Anopheles gambiae", "African malaria mosquito",
    "Eukaryota; Metazoa; Arthropoda; Tracheata; Hexapoda; Insecta; Pterygota; Neoptera; Endopterygota; Diptera; Nematocera; Culicoidea; Anopheles",
    "INV", 1, 5, 7165
  },
  {
    "Anopheles gambiae str. PEST", "African malaria mosquito",
    "Eukaryota; Metazoa; Arthropoda; Tracheata; Hexapoda; Insecta; Pterygota; Neoptera; Endopterygota; Diptera; Nematocera; Culicoidea; Anopheles",
    "INV", 1, 5, 180454
  },
  {
    "Tetrahymena thermophila", "",
    "Eukaryota; Alveolata; Ciliophora; Oligohymenophorea; Hymenostomatida; Tetrahymenina; Tetrahymena",
    "INV", 6, 4, 5911
  },
  {
    "Pan troglodytes", "chimpanzee",
    "Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; Mammalia; Eutheria; Primates; Catarrhini; Hominidae; Pan",
    "PRI", 1, 2, 9598
  },
  {
    "Candida albicans", "",
    "Eukaryota; Fungi; Ascomycota; Saccharomycotina; Saccharomycetes; Saccharomycetales; mitosporic Saccharomycetales; Candida",
    "PLN", 12, 4, 5476
  },
  {
    "Candida albicans SC5314", "",
    "Eukaryota; Fungi; Ascomycota; Saccharomycotina; Saccharomycetes; Saccharomycetales; mitosporic Saccharomycetales; Candida",
    "PLN", 12, 4, 237561
  },
  {
    "Trypanosoma brucei", "",
    "Eukaryota; Euglenozoa; Kinetoplastida; Trypanosomatidae; Trypanosoma",
    "INV", 1, 4, 5691
  },
  {
    "Trypanosoma cruzi", "",
    "Eukaryota; Euglenozoa; Kinetoplastida; Trypanosomatidae; Trypanosoma; Schizotrypanum",
    "INV", 1, 4, 5693
  },
  {
    "Oryza sativa", "",
    "Eukaryota; Viridiplantae; Streptophyta; Embryophyta; Tracheophyta; Spermatophyta; Magnoliophyta; Liliopsida; Poales; Poaceae; Ehrhartoideae; Oryzeae; Oryza",
    "PLN", 1, 1, 4530
  },
  {
    "Oryza sativa (indica cultivar-group)", "",
    "Eukaryota; Viridiplantae; Streptophyta; Embryophyta; Tracheophyta; Spermatophyta; Magnoliophyta; Liliopsida; Poales; Poaceae; Ehrhartoideae; Oryzeae; Oryza",
    "PLN", 1, 1, 39946
  },
  {
    "Oryza sativa (japonica cultivar-group)", "",
    "Eukaryota; Viridiplantae; Streptophyta; Embryophyta; Tracheophyta; Spermatophyta; Magnoliophyta; Liliopsida; Poales; Poaceae; Ehrhartoideae; Oryzeae; Oryza",
    "PLN", 1, 1, 39947
  },
  {
    "Aspergillus nidulans FGSC A4", "",
    "Eukaryota; Fungi; Ascomycota; Pezizomycotina; Eurotiomycetes; Eurotiales; Trichocomaceae; Emericella",
    "PLN", 1, 4, 227321
  },
  {
    "environmental sequence", "",
    "unclassified; environmental samples",
    "UNA", 1, 2, 256318
  },
  {
    NULL, NULL, NULL, NULL, 0, 0, 0
  }
};
788 
789 static Boolean HasTaxon (
790   OrgRefPtr orp
791 )
792 
793 {
794   ValNodePtr  db;
795   DbtagPtr    dbt;
796 
797   if (orp == FALSE) return FALSE;
798   for (db = orp->db; db != NULL; db = db->next) {
799     dbt = (DbtagPtr) db->data.ptrvalue;
800     if (dbt != NULL && dbt->db != NULL &&
801         StringICmp (dbt->db, "taxon") == 0) return TRUE;
802   }
803   return FALSE;
804 }
805 
806 static void AddMissingSourceInfo (
807   BioSourcePtr biop
808 )
809 
810 {
811   ValNodePtr   db;
812   DbtagPtr     dbt;
813   Int2         idx;
814   ObjectIdPtr  oip;
815   OrgNamePtr   onp;
816   OrgRefPtr    orp;
817   OrfStuffPtr  osp;
818 
819   if (biop == NULL) return;
820   orp = biop->org;
821   if (orp == NULL) return;
822   onp = orp->orgname;
823   if (onp == NULL) return;
824 
825   /* look for entry of organisms in commonOrgStuff table */
826 
827   for (idx = 0; commonOrgStuff [idx].taxname != NULL; idx++) {
828     osp = &(commonOrgStuff [idx]);
829     if (StringICmp (orp->taxname, osp->taxname) == 0) {
830       if (StringCmp (orp->taxname, osp->taxname) != 0) {
831         /* fix capitalization of supplied name if in common organism list */
832         StringCpy (orp->taxname, osp->taxname);
833       }
834       if (StringHasNoText (orp->common) && StringDoesHaveText (osp->common)) {
835         orp->common = StringSave (osp->common);
836       }
837       if (onp->gcode == 0) {
838         onp->gcode = osp->gcode;
839       }
840       if (onp->mgcode == 0) {
841         onp->mgcode = osp->mgcode;
842       }
843       if (StringHasNoText (onp->div)) {
844         onp->div = StringSave (osp->division);
845       }
846       if (StringHasNoText (onp->lineage)) {
847         onp->lineage = StringSave (osp->lineage);
848       }
849       if (! HasTaxon (orp)) {
850         db = ValNodeNew (NULL);
851         if (db != NULL) {
852           dbt = DbtagNew ();
853           if (dbt != NULL) {
854             oip = ObjectIdNew ();
855             if (oip != NULL) {
856               oip->id = osp->taxID;
857               dbt->db = StringSave ("taxon");
858               dbt->tag = oip;
859               db->data.ptrvalue = (Pointer) dbt;
860               orp->db = db;
861             }
862           }
863         }
864       }
865     }
866   }
867 }
868 
869 static BioseqPtr GetBioseqReferencedByAnnot (
870   SeqAnnotPtr sap,
871   Uint2 entityID
872 )
873 
874 {
875   SeqAlignPtr   align;
876   BioseqPtr     bsp;
877   DenseDiagPtr  ddp;
878   DenseSegPtr   dsp;
879   SeqFeatPtr    feat;
880   SeqGraphPtr   graph;
881   SeqIdPtr      sip;
882   SeqLocPtr     slp;
883   StdSegPtr     ssp;
884   SeqLocPtr     tloc;
885 
886   if (sap == NULL) return NULL;
887   switch (sap->type) {
888     case 1 :
889       feat = (SeqFeatPtr) sap->data;
890       while (feat != NULL) {
891         slp = feat->location;
892         if (slp != NULL) {
893           bsp = BioseqFindFromSeqLoc (slp);
894           if (bsp != NULL) return bsp;
895         }
896         feat = feat->next;
897       }
898       break;
899     case 2 :
900       align = (SeqAlignPtr) sap->data;
901       while (align != NULL) {
902         if (align->segtype == 1) {
903           ddp = (DenseDiagPtr) align->segs;
904           if (ddp != NULL) {
905             for (sip = ddp->id; sip != NULL; sip = sip->next) {
906               bsp = BioseqFind (sip);
907               if (bsp != NULL) return bsp;
908             }
909           }
910         } else if (align->segtype == 2) {
911           dsp = (DenseSegPtr) align->segs;
912           if (dsp != NULL) {
913             for (sip = dsp->ids; sip != NULL; sip = sip->next) {
914               bsp = BioseqFind (sip);
915               if (bsp != NULL) return bsp;
916             }
917           }
918         } else if (align->segtype == 3) {
919           ssp = (StdSegPtr) align->segs;
920           if (ssp != NULL && ssp->loc != NULL) {
921             for (tloc = ssp->loc; tloc != NULL; tloc = tloc->next) {
922               bsp = BioseqFindFromSeqLoc (tloc);
923               if (bsp != NULL) return bsp;
924             }
925           }
926         }
927         align = align->next;
928       }
929       break;
930     case 3 :
931       graph = (SeqGraphPtr) sap->data;
932       while (graph != NULL) {
933         slp = graph->loc;
934         if (slp != NULL) {
935           bsp = BioseqFindFromSeqLoc (slp);
936           if (bsp != NULL) return bsp;
937         }
938         graph = graph->next;
939       }
940       break;
941     default :
942       break;
943   }
944   return NULL;
945 }
946 
947 static Int2 GetGenCodeForBsp (
948   BioseqPtr bsp
949 )
950 
951 {
952   BioSourcePtr  biop;
953   Boolean       mito;
954   OrgNamePtr    onp;
955   OrgRefPtr     orp;
956   SeqDescrPtr   sdp;
957 
958   sdp = GetNextDescriptorUnindexed (bsp, Seq_descr_source, NULL);
959   if (sdp == NULL) return 1;
960   biop = (BioSourcePtr) sdp->data.ptrvalue;
961   if (biop == NULL) return 1;
962   orp = biop->org;
963   if (orp == NULL) return 1;
964   onp = orp->orgname;
965   if (onp == NULL) return 1;
966   mito = (Boolean) (biop->genome == 4 || biop->genome == 5);
967   if (mito) {
968     if (onp->mgcode == 0) {
969       return 1;
970     }
971     return onp->mgcode;
972   }
973   if (onp->gcode == 0) {
974     return 1;
975   }
976   return onp->gcode;
977 }
978 
/*
 * Record pairing a gene feature with another feature and a label;
 * SortByGenePtr orders these records by their gene and feat pointers.
 * Note the spelling: the struct tag is gcmdata but the typedefs are
 * GmcData / GmcDataPtr.
 * NOTE(review): how label is filled and used is outside the visible part
 * of the file.
 */
typedef struct gcmdata {
  SeqFeatPtr  gene;
  SeqFeatPtr  feat;
  CharPtr     label;
} GmcData, PNTR GmcDataPtr;
984 
985 static int LIBCALLBACK SortByGenePtr (
986   VoidPtr vp1,
987   VoidPtr vp2
988 )
989 
990 {
991   GmcDataPtr gdp1, gdp2;
992 
993   if (vp1 == NULL || vp2 == NULL) return 0;
994   gdp1 = (GmcDataPtr) vp1;
995   gdp2 = (GmcDataPtr) vp2;
996   if (gdp1 == NULL || gdp2 == NULL) return 0;
997 
998   if (gdp1->gene > gdp2->gene) return -1;
999   if (gdp1->gene < gdp2->gene) return 1;
1000 
1001   if (gdp1->feat > gdp2->feat) return -1;
1002   if (gdp1->feat < gdp2->feat) return 1;
1003 
1004   return 0;
1005 }
1006 
1007 static void PrintOneGeneLine (
1008   SeqFeatPtr gene,
1009   SeqFeatPtr cds,
1010   SeqFeatPtr rna,
1011   CharPtr cdslabel,
1012   CharPtr rnalabel,
1013   FILE *fp
1014 )
1015 
1016 {
1017   BioseqPtr     bsp;
1018   ValNodePtr    db, old_locus_tag, vnp;
1019   DbtagPtr      dbt;
1020   CharPtr       desc, locus, locus_tag, cdslcl, cdsaccn, cdsgnl,
1021                 rnaaccn, rnagnl, fbgn, gene_type, rna_type, prefix;
1022   GBQualPtr     gbq;
1023   GeneRefPtr    grp;
1024   ObjectIdPtr   oip;
1025   SeqIdPtr      sip;
1026   CharPtr       str;
1027   TextSeqIdPtr  tsip;
1028 
1029   if (fp == NULL) return; 
1030 
1031   locus = NULL;
1032   desc = NULL;
1033   locus_tag = NULL;
1034   old_locus_tag = NULL;
1035 
1036   cdslcl = NULL;
1037   cdsaccn = NULL;
1038   cdsgnl = NULL;
1039   rnaaccn = NULL;
1040   rnagnl = NULL;
1041 
1042   db = NULL;
1043   fbgn = NULL;
1044 
1045   gene_type = NULL;
1046   rna_type = NULL;
1047 
1048   if (gene != NULL) {
1049     gene_type = "gene";
1050     if (gene->pseudo) {
1051       gene_type = "pseudogene";
1052     }
1053     grp = (GeneRefPtr) gene->data.value.ptrvalue;
1054     if (grp != NULL) {
1055       if (grp->pseudo) {
1056         gene_type = "pseudogene";
1057       }
1058       locus = grp->locus;
1059       desc = grp->desc;
1060       locus_tag = grp->locus_tag;
1061       db = grp->db;
1062     }
1063     if (db == NULL) {
1064       db = gene->dbxref;
1065     }
1066     for (gbq = gene->qual; gbq != NULL; gbq = gbq->next) {
1067       if (StringICmp (gbq->qual, "old_locus_tag") != 0) continue;
1068       if (StringHasNoText (gbq->val)) continue;
1069       ValNodeCopyStr(&old_locus_tag, 0, gbq->val);
1070     }
1071     for (vnp = db; vnp != NULL; vnp = vnp->next) {
1072       dbt = (DbtagPtr) vnp->data.ptrvalue;
1073       if (dbt == NULL) continue;
1074       if (StringICmp (dbt->db, "FLYBASE") != 0) continue;
1075       oip = dbt->tag;
1076       if (oip == NULL) continue;
1077       fbgn = oip->str;
1078     }
1079   }
1080 
1081   if (cds != NULL) {
1082     if (cds->product != NULL) {
1083       bsp = BioseqFindFromSeqLoc (cds->product);
1084       if (bsp != NULL) {
1085         for (sip = bsp->id; sip != NULL; sip = sip->next) {
1086           switch (sip->choice) {
1087             case SEQID_LOCAL :
1088               oip = (ObjectIdPtr) sip->data.ptrvalue;
1089               if (oip == NULL) continue;
1090               cdslcl = oip->str;
1091               break;
1092             case SEQID_GENBANK :
1093             case SEQID_TPG :
1094               tsip = (TextSeqIdPtr) sip->data.ptrvalue;
1095               if (tsip == NULL) continue;
1096               cdsaccn = tsip->accession;
1097               break;
1098             case SEQID_GENERAL :
1099               dbt = (DbtagPtr) sip->data.ptrvalue;
1100               if (dbt == NULL) continue;
1101               if (IsSkippableDbtag (dbt)) continue;
1102               oip = dbt->tag;
1103               if (oip == NULL) continue;
1104               cdsgnl = oip->str;
1105               break;
1106             default :
1107               break;
1108           }
1109         }
1110       }
1111     }
1112   }
1113 
1114   if (rna != NULL) {
1115     switch (rna->idx.subtype) {
1116       case FEATDEF_preRNA :
1117         rna_type = "precursor RNA";
1118         break;
1119       case FEATDEF_mRNA :
1120         rna_type = "mRNA";
1121         break;
1122       case FEATDEF_tRNA :
1123         rna_type = "tRNA";
1124         break;
1125       case FEATDEF_rRNA :
1126         rna_type = "rRNA";
1127         break;
1128       case FEATDEF_otherRNA :
1129         rna_type = "misc RNA";
1130         break;
1131       case FEATDEF_ncRNA :
1132         rna_type = "ncRNA";
1133         for (gbq = rna->qual; gbq != NULL; gbq = gbq->next) {
1134           if (StringICmp (gbq->qual, "ncRNA_class") != 0) continue;
1135           if (StringDoesHaveText (gbq->val)) {
1136             rna_type = gbq->val;
1137           }
1138         }
1139         break;
1140       case FEATDEF_tmRNA :
1141         rna_type = "tmRNA";
1142         break;
1143       default :
1144         break;
1145     }
1146     if (rna->pseudo) {
1147       rna_type = "pseudo RNA";
1148     }
1149     if (rna->product != NULL) {
1150       bsp = BioseqFindFromSeqLoc (rna->product);
1151       if (bsp != NULL) {
1152         for (sip = bsp->id; sip != NULL; sip = sip->next) {
1153           switch (sip->choice) {
1154             case SEQID_GENBANK :
1155             case SEQID_TPG :
1156               tsip = (TextSeqIdPtr) sip->data.ptrvalue;
1157               if (tsip == NULL) continue;
1158               rnaaccn = tsip->accession;
1159               break;
1160             case SEQID_GENERAL :
1161               dbt = (DbtagPtr) sip->data.ptrvalue;
1162               if (dbt == NULL) continue;
1163               if (IsSkippableDbtag (dbt)) continue;
1164               oip = dbt->tag;
1165               if (oip == NULL) continue;
1166               rnagnl = oip->str;
1167               break;
1168             default :
1169               break;
1170           }
1171         }
1172       }
1173     }
1174   }
1175 
1176   if (StringDoesHaveText (locus_tag)) {
1177     fprintf (fp, "%s", locus_tag);
1178   } else {
1179     fprintf (fp, "null_gene_ltag");
1180   }
1181 
1182   fprintf (fp, "\t");
1183   if (StringDoesHaveText (locus)) {
1184     fprintf (fp, "%s", locus);
1185   } else {
1186     fprintf (fp, "null_gene_locus");
1187   }
1188 
1189   fprintf (fp, "\t");
1190   if (StringDoesHaveText (desc)) {
1191     fprintf (fp, "%s", desc);
1192   } else {
1193     fprintf (fp, "null_gene_desc");
1194   }
1195 
1196   fprintf (fp, "\t");
1197   if (StringDoesHaveText (fbgn)) {
1198     fprintf (fp, "%s", fbgn);
1199   } else {
1200     fprintf (fp, "null_fbgn");
1201   }
1202 
1203   fprintf (fp, "\t");
1204   if (old_locus_tag != NULL) {
1205     prefix = "";
1206     for (vnp = old_locus_tag; vnp != NULL; vnp = vnp->next) {
1207       str = (CharPtr) vnp->data.ptrvalue;
1208       if (StringHasNoText (str)) continue;
1209       fprintf (fp, "%s%s", prefix, str);
1210       prefix = ",";
1211     }
1212   } else {
1213     fprintf (fp, "null_old_ltag");
1214   }
1215 
1216   fprintf (fp, "\t");
1217   if (StringDoesHaveText (cdslcl)) {
1218     fprintf (fp, "%s", cdslcl);
1219   } else {
1220     fprintf (fp, "null_cds_lcl");
1221   }
1222 
1223   fprintf (fp, "\t");
1224   if (StringDoesHaveText (cdsaccn)) {
1225     fprintf (fp, "%s", cdsaccn);
1226   } else {
1227     fprintf (fp, "null_cds_accn");
1228   }
1229 
1230   fprintf (fp, "\t");
1231   if (StringDoesHaveText (cdsgnl)) {
1232     fprintf (fp, "%s", cdsgnl);
1233   } else {
1234     fprintf (fp, "null_cds_gnl");
1235   }
1236 
1237   fprintf (fp, "\t");
1238   if (StringDoesHaveText (rnaaccn)) {
1239     fprintf (fp, "%s", rnaaccn);
1240   } else {
1241     fprintf (fp, "null_rna_accn");
1242   }
1243 
1244   fprintf (fp, "\t");
1245   if (StringDoesHaveText (rnagnl)) {
1246     fprintf (fp, "%s", rnagnl);
1247   } else {
1248     fprintf (fp, "null_rna_gnl");
1249   }
1250 
1251   fprintf (fp, "\t");
1252   if (StringDoesHaveText (cdslabel)) {
1253     fprintf (fp, "%s", cdslabel);
1254   } else {
1255     fprintf (fp, "null_cds_product");
1256   }
1257 
1258   fprintf (fp, "\t");
1259   if (StringDoesHaveText (rnalabel)) {
1260     fprintf (fp, "%s", rnalabel);
1261   } else {
1262     fprintf (fp, "null_rna_product");
1263   }
1264 
1265   fprintf (fp, "\t");
1266   if (StringDoesHaveText (gene_type)) {
1267     fprintf (fp, "%s", gene_type);
1268   } else {
1269     fprintf (fp, "null_gene_type");
1270   }
1271 
1272   fprintf (fp, "\t");
1273   if (StringDoesHaveText (rna_type)) {
1274     fprintf (fp, "%s", rna_type);
1275   } else {
1276     fprintf (fp, "null_rna_type");
1277   }
1278 
1279   fprintf (fp, "\n");
1280 }
1281 
1282 static void GeneReportOneBsp (
1283   BioseqPtr bsp,
1284   FILE *fp
1285 )
1286 
1287 {
1288   CharPtr            cdslabel, rnalabel;
1289   SeqMgrFeatContext  fcontext;
1290   GmcDataPtr         gdp, head;
1291   GeneRefPtr         grp;
1292   Int2               i, j, k, numgene, numcds, numrna, total;
1293   SeqFeatPtr         matchsfp, sfp, tmp;
1294   SeqFeatXrefPtr     xref;
1295 
1296   if (bsp == NULL || fp == NULL) return;
1297 
1298   numgene = 0;
1299   numcds = 0;
1300   numrna = 0;
1301 
1302   sfp = SeqMgrGetNextFeature (bsp, NULL, 0, 0, &fcontext);
1303   while (sfp != NULL) {
1304     switch (sfp->data.choice) {
1305       case SEQFEAT_GENE :
1306         numgene++;
1307         break;
1308       case SEQFEAT_CDREGION :
1309         numcds++;
1310         break;
1311       case SEQFEAT_RNA :
1312         numrna++;
1313         break;
1314       default :
1315         break;
1316     }
1317     sfp = SeqMgrGetNextFeature (bsp, sfp, 0, 0, &fcontext);
1318   }
1319 
1320   if (numgene == 0) return;
1321   total = numgene + numcds + numrna;
1322   if (total == 0) return;
1323 
1324   head = (GmcDataPtr) MemNew (sizeof (GmcData) * (size_t) (total + 1));
1325   if (head == NULL) return;
1326 
1327   gdp = head;
1328   total = 0;
1329   sfp = SeqMgrGetNextFeature (bsp, NULL, 0, 0, &fcontext);
1330   while (sfp != NULL) {
1331     if (sfp->data.choice == SEQFEAT_CDREGION || sfp->data.choice == SEQFEAT_RNA) {
1332       gdp->feat = sfp;
1333       gdp->label = fcontext.label;
1334       grp = SeqMgrGetGeneXref (sfp);
1335       if (grp == NULL) {
1336         gdp->gene = SeqMgrGetOverlappingGene (sfp->location, NULL);
1337       } else if (! SeqMgrGeneIsSuppressed (grp)) {
1338         if (StringDoesHaveText (grp->locus_tag)) {
1339           gdp->gene = SeqMgrGetGeneByLocusTag (bsp, grp->locus_tag, NULL);
1340         } else if (StringDoesHaveText (grp->locus)) {
1341           gdp->gene = SeqMgrGetFeatureByLabel (bsp, grp->locus, SEQFEAT_GENE, 0, NULL);
1342         }
1343       }
1344       gdp++;
1345       total++;
1346     } else if (sfp->data.choice == SEQFEAT_GENE) {
1347       gdp->gene = sfp;
1348       gdp++;
1349       total++;
1350     }
1351     sfp = SeqMgrGetNextFeature (bsp, sfp, 0, 0, &fcontext);
1352   }
1353 
1354   HeapSort (head, (size_t) total, sizeof (GmcData), SortByGenePtr);
1355 
1356   for (i = 0; i < total; i += j) {
1357     sfp = head [i].gene;
1358     if (sfp == NULL) continue;
1359     numcds = 0;
1360     numrna = 0;
1361     for (j = 0; i + j < total && sfp == head [i + j].gene; j++) {
1362       tmp = head [i + j].feat;
1363       if (tmp == NULL) continue;
1364       if (tmp->data.choice == SEQFEAT_CDREGION) {
1365         numcds++;
1366       } else if (tmp->data.choice == SEQFEAT_RNA) {
1367         numrna++;
1368       }
1369     }
1370     cdslabel = NULL;
1371     rnalabel = NULL;
1372     if (numcds > 0) {
1373       for (k = 0; k < j; k++) {
1374         tmp = head [i + k].feat;
1375         if (tmp == NULL) continue;
1376         if (tmp->data.choice != SEQFEAT_CDREGION) continue;
1377         cdslabel = head [i + k].label;
1378         matchsfp = NULL;
1379         for (xref = tmp->xref; xref != NULL && matchsfp == NULL; xref = xref->next) {
1380           if (xref->id.choice != 0) {
1381             matchsfp = SeqMgrGetFeatureByFeatID (tmp->idx.entityID, NULL, NULL, xref, &fcontext);
1382             rnalabel = fcontext.label;
1383           }
1384         }
1385         PrintOneGeneLine (sfp, tmp, matchsfp, cdslabel, rnalabel, fp);
1386       }
1387     } else if (numrna > 0) {
1388       for (k = 0; k < j; k++) {
1389         tmp = head [i + k].feat;
1390         if (tmp == NULL) continue;
1391         if (tmp->data.choice != SEQFEAT_RNA) continue;
1392         rnalabel = head [i + k].label;
1393         PrintOneGeneLine (sfp, NULL, tmp, NULL, rnalabel, fp);
1394       }
1395     } else {
1396       PrintOneGeneLine (sfp, NULL, NULL, NULL, NULL, fp);
1397     }
1398   }
1399 
1400   MemFree (head);
1401 }
1402 
1403 static void GeneReportGenomicBsp (
1404   BioseqPtr bsp,
1405   Pointer userdata
1406 )
1407 
1408 {
1409   SeqMgrDescContext  dcontext;
1410   MolInfoPtr         mip;
1411   SeqDescrPtr        sdp;
1412 
1413   if (bsp == NULL) return;
1414 
1415   if (ISA_aa (bsp->mol)) return;
1416   sdp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_molinfo, &dcontext);
1417   if (sdp == NULL) return;
1418   mip = (MolInfoPtr) sdp->data.ptrvalue;
1419   if (mip == NULL) return;
1420   if (mip->biomol != MOLECULE_TYPE_GENOMIC) return;
1421 
1422   GeneReportOneBsp (bsp, (FILE *) userdata);
1423 }
1424 
1425 static void GeneReportOneFile (
1426   CharPtr results,
1427   CharPtr base,
1428   CharPtr suffix,
1429   SeqEntryPtr sep
1430 )
1431 
1432 {
1433   Char    file [FILENAME_MAX], path [PATH_MAX];
1434   FILE    *fp;
1435   ErrSev  oldErrSev;
1436 
1437   StringNCpy_0 (path, results, sizeof (path));
1438   sprintf (file, "%s%s", base, suffix);
1439   FileBuildPath (path, NULL, file);
1440 
1441   fp = FileOpen (path, "w");
1442   if (fp == NULL) return;
1443 
1444   oldErrSev = ErrSetMessageLevel (SEV_MAX);
1445   VisitBioseqsInSep (sep, (Pointer) fp, GeneReportGenomicBsp);
1446   ErrSetMessageLevel (oldErrSev);
1447 
1448   FileClose (fp);
1449 }
1450 
1451 static void EnhanceOneCDS (
1452   SeqFeatPtr sfp,
1453   Boolean alt_splice
1454 )
1455 
1456 {
1457   DbtagPtr        dbt;
1458   GBQualPtr       gbq;
1459   Char            id [64];
1460   SeqIdPtr        ids, sip;
1461   size_t          len;
1462   CharPtr         name, nwstr, ptr, str;
1463   ObjectIdPtr     oip;
1464   ProtRefPtr      prp;
1465   Char            tmp [256];
1466   ValNodePtr      vnp;
1467   SeqFeatXrefPtr  xref;
1468 
1469   if (sfp == NULL || sfp->data.choice != SEQFEAT_CDREGION) return;
1470 
1471   name = NULL;
1472   vnp = NULL;
1473   prp = NULL;
1474 
1475   for (xref = sfp->xref; xref != NULL; xref = xref->next) {
1476     if (xref->data.choice == SEQFEAT_PROT) {
1477       prp = (ProtRefPtr) xref->data.value.ptrvalue;
1478     }
1479   }
1480 
1481   id [0] = '\0';
1482   for (gbq = sfp->qual; gbq != NULL; gbq = gbq->next) {
1483     if (StringICmp (gbq->qual, "protein_id") == 0) {
1484       StringNCpy_0 (id, gbq->val, sizeof (id));
1485     }
1486   }
1487   if (StringDoesHaveText (id) && StringChr (id, '|') != NULL) {
1488     str = NULL;
1489     ids = SeqIdParse (id);
1490     for (sip = ids; sip != NULL; sip = sip->next) {
1491       if (sip->choice != SEQID_GENERAL) continue;
1492       dbt = (DbtagPtr) sip->data.ptrvalue;
1493       if (dbt == NULL) continue;
1494       if (IsSkippableDbtag (dbt)) continue;
1495       oip = dbt->tag;
1496       if (oip == NULL) continue;
1497       str = oip->str;
1498     }
1499 
1500     if (StringDoesHaveText (str)) {
1501       if (prp != NULL && prp->name != NULL) {
1502         vnp = prp->name;
1503         name = (CharPtr) vnp->data.ptrvalue;
1504       }
1505       if (StringDoesHaveText (name) && vnp != NULL) {
1506         if (alt_splice) {
1507           ptr = StringChr (str, '-');
1508           if (ptr != NULL && StringLen (ptr) == 3) {
1509             ptr++;
1510             ptr++;
1511             sprintf (tmp, "%s, isoform %s", str, ptr);
1512             len = StringLen (name) + StringLen (", ") + StringLen (tmp);
1513             nwstr = (CharPtr) MemNew (sizeof (Char) * (len + 2));
1514             if (nwstr != NULL) {
1515               StringCpy (nwstr, name);
1516               /*
1517               StringCat (nwstr, ", ");
1518               */
1519               StringCat (nwstr, " ");
1520               StringCat (nwstr, tmp);
1521               vnp->data.ptrvalue = (Pointer) nwstr;
1522               MemFree (name);
1523             }
1524           } else {
1525             AddQualifierToFeature (sfp, "product", str);
1526           }
1527         } else {
1528           len = StringLen (name) + StringLen (", ") + StringLen (str);
1529           nwstr = (CharPtr) MemNew (sizeof (Char) * (len + 2));
1530           if (nwstr != NULL) {
1531             StringCpy (nwstr, name);
1532             /*
1533             StringCat (nwstr, ", ");
1534             */
1535             StringCat (nwstr, " ");
1536             StringCat (nwstr, str);
1537             vnp->data.ptrvalue = (Pointer) nwstr;
1538             MemFree (name);
1539           }
1540         }
1541       } else {
1542         if (alt_splice) {
1543           ptr = StringChr (str, '-');
1544           if (ptr != NULL && StringLen (ptr) == 3) {
1545             ptr++;
1546             ptr++;
1547             sprintf (tmp, "%s, isoform %s", str, ptr);
1548             AddQualifierToFeature (sfp, "product", tmp);
1549           } else {
1550             AddQualifierToFeature (sfp, "product", str);
1551           }
1552         } else {
1553           AddQualifierToFeature (sfp, "product", str);
1554         }
1555       }
1556     }
1557 
1558     SeqIdSetFree (ids);
1559   }
1560 }
1561 
1562 static void EnhanceOneRna (
1563   SeqFeatPtr sfp,
1564   Boolean alt_splice
1565 )
1566 
1567 {
1568   DbtagPtr     dbt;
1569   GBQualPtr    gbq, nm_gbq;
1570   Char         id [64];
1571   SeqIdPtr     ids, sip;
1572   size_t       len;
1573   CharPtr      name, nwstr, ptr, str;
1574   ObjectIdPtr  oip;
1575   RnaRefPtr    rrp;
1576   Char         tmp [256];
1577 
1578   if (sfp == NULL || sfp->data.choice != SEQFEAT_RNA) return;
1579 
1580   name = NULL;
1581   nm_gbq = NULL;
1582 
1583   rrp = (RnaRefPtr) sfp->data.value.ptrvalue;
1584   if (rrp != NULL && rrp->ext.choice == 1) {
1585     switch (rrp->type) {
1586       case 1 :  /* precurrsor_RNA */
1587       case 2 :  /* mRNA */
1588       case 4 :  /* rRNA */
1589         name = rrp->ext.value.ptrvalue;
1590         break;
1591       case 255 :  /* misc_RNA, ncRNA, tmRNA */
1592         for (gbq = sfp->qual; gbq != NULL; gbq = gbq->next) {
1593           if (StringICmp (gbq->qual, "product") == 0) {
1594             nm_gbq = gbq;
1595             name = gbq->val;
1596           }
1597         }
1598         break;
1599       case 3:  /* tRNA */
1600         return;
1601       default :
1602         break;
1603     }
1604   }
1605 
1606   id [0] = '\0';
1607   for (gbq = sfp->qual; gbq != NULL; gbq = gbq->next) {
1608     if (StringICmp (gbq->qual, "transcript_id") == 0) {
1609       StringNCpy_0 (id, gbq->val, sizeof (id));
1610     }
1611   }
1612   if (StringDoesHaveText (id) && StringChr (id, '|') != NULL) {
1613     str = NULL;
1614     ids = SeqIdParse (id);
1615     for (sip = ids; sip != NULL; sip = sip->next) {
1616       if (sip->choice != SEQID_GENERAL) continue;
1617       dbt = (DbtagPtr) sip->data.ptrvalue;
1618       if (dbt == NULL) continue;
1619       if (IsSkippableDbtag(dbt)) continue;
1620       oip = dbt->tag;
1621       if (oip == NULL) continue;
1622       str = oip->str;
1623     }
1624 
1625     if (StringDoesHaveText (str)) {
1626       if (StringDoesHaveText (name) && StringCmp (str, name) != 0) {
1627         if (alt_splice) {
1628           ptr = StringChr (str, '-');
1629           if (ptr != NULL && StringLen (ptr) == 3) {
1630             ptr++;
1631             ptr++;
1632             sprintf (tmp, "%s, transcript variant %s", str, ptr);
1633             len = StringLen (name) + StringLen (", ") + StringLen (tmp);
1634             nwstr = (CharPtr) MemNew (sizeof (Char) * (len + 2));
1635             if (nwstr != NULL) {
1636               StringCpy (nwstr, name);
1637               /*
1638               StringCat (nwstr, ", ");
1639               */
1640               StringCat (nwstr, " ");
1641               StringCat (nwstr, tmp);
1642               if (nm_gbq != NULL) {
1643                 nm_gbq->val = (Pointer) nwstr;
1644               } else {
1645                 rrp->ext.value.ptrvalue = (Pointer) nwstr;
1646               }
1647               MemFree (name);
1648             }
1649           } else {
1650             AddQualifierToFeature (sfp, "product", str);
1651           }
1652         } else {
1653           len = StringLen (name) + StringLen (", ") + StringLen (str);
1654           nwstr = (CharPtr) MemNew (sizeof (Char) * (len + 2));
1655           if (nwstr != NULL) {
1656             StringCpy (nwstr, name);
1657             /*
1658             StringCat (nwstr, ", ");
1659             */
1660             StringCat (nwstr, " ");
1661             StringCat (nwstr, str);
1662             if (nm_gbq != NULL) {
1663               nm_gbq->val = (Pointer) nwstr;
1664             } else {
1665               rrp->ext.value.ptrvalue = (Pointer) nwstr;
1666             }
1667             MemFree (name);
1668           }
1669         }
1670       } else {
1671         if (alt_splice) {
1672           ptr = StringChr (str, '-');
1673           if (ptr != NULL && StringLen (ptr) == 3) {
1674             ptr++;
1675             ptr++;
1676             sprintf (tmp, "%s, transcript variant %s", str, ptr);
1677             AddQualifierToFeature (sfp, "product", tmp);
1678           } else {
1679             AddQualifierToFeature (sfp, "product", str);
1680           }
1681         } else {
1682           AddQualifierToFeature (sfp, "product", str);
1683         }
1684       }
1685     }
1686 
1687     SeqIdSetFree (ids);
1688   }
1689 }
1690 
1691 static void EnhanceFeatureAnnotation (
1692   SeqFeatPtr features,
1693   BioseqPtr bsp
1694 )
1695 
1696 {
1697   GmcDataPtr  gdp, head;
1698   GeneRefPtr  grp;
1699   Int2        i, j, k, numgene, numcds, numrna;
1700   SeqFeatPtr  sfp;
1701 
1702   if (features == NULL || bsp == NULL) return;
1703 
1704   numgene = 0;
1705   numcds = 0;
1706   numrna = 0;
1707 
1708   for (sfp = features; sfp != NULL; sfp = sfp->next) {
1709     switch (sfp->data.choice) {
1710       case SEQFEAT_GENE :
1711         numgene++;
1712         break;
1713       case SEQFEAT_CDREGION :
1714         numcds++;
1715         break;
1716       case SEQFEAT_RNA :
1717         numrna++;
1718         break;
1719       default :
1720         break;
1721     }
1722   }
1723 
1724   if (numgene == 0) return;
1725 
1726   if (numcds > 0) {
1727     head = (GmcDataPtr) MemNew (sizeof (GmcData) * (size_t) (numcds + 1));
1728     if (head != NULL) {
1729       gdp = head;
1730       for (sfp = features; sfp != NULL; sfp = sfp->next) {
1731         if (sfp->idx.subtype == FEATDEF_CDS) {
1732           gdp->feat = sfp;
1733           grp = SeqMgrGetGeneXref (sfp);
1734           if (grp == NULL || (! SeqMgrGeneIsSuppressed (grp))) {
1735             gdp->gene = SeqMgrGetOverlappingGene (sfp->location, NULL);
1736           }
1737           gdp++;
1738         }
1739       }
1740       HeapSort (head, (size_t) numcds, sizeof (GmcData), SortByGenePtr);
1741       for (i = 0; i < numcds; i += j) {
1742         sfp = head [i].gene;
1743         for (j = 1; i + j < numcds && sfp == head [i + j].gene; j++) continue;
1744         if (j == 1) {
1745           /* no alt splicing */
1746           EnhanceOneCDS (head [i].feat, FALSE);
1747         } else {
1748           /* is alt splicing */
1749           for (k = 0; k < j; k++) {
1750             EnhanceOneCDS (head [i + k].feat, TRUE);
1751           }
1752         }
1753       }
1754     }
1755     MemFree (head);
1756   }
1757 
1758   if (numrna > 0) {
1759     head = (GmcDataPtr) MemNew (sizeof (GmcData) * (size_t) (numrna + 1));
1760     if (head != NULL) {
1761       gdp = head;
1762       for (sfp = features; sfp != NULL; sfp = sfp->next) {
1763         if (sfp->data.choice == SEQFEAT_RNA) {
1764           gdp->feat = sfp;
1765           grp = SeqMgrGetGeneXref (sfp);
1766           if (grp == NULL || (! SeqMgrGeneIsSuppressed (grp))) {
1767             gdp->gene = SeqMgrGetOverlappingGene (sfp->location, NULL);
1768           }
1769           gdp++;
1770         }
1771       }
1772       HeapSort (head, (size_t) numrna, sizeof (GmcData), SortByGenePtr);
1773       for (i = 0; i < numrna; i += j) {
1774         sfp = head [i].gene;
1775         for (j = 1; i + j < numrna && sfp == head [i + j].gene; j++) continue;
1776         if (j == 1) {
1777           /* no alt splicing */
1778           EnhanceOneRna (head [i].feat, FALSE);
1779         } else {
1780           /* is alt splicing */
1781           for (k = 0; k < j; k++) {
1782             EnhanceOneRna (head [i + k].feat, TRUE);
1783           }
1784         }
1785       }
1786     }
1787     MemFree (head);
1788   }
1789 }
1790 
1791 static BioseqPtr AttachSeqAnnotEntity (
1792   Uint2 entityID,
1793   SeqAnnotPtr sap,
1794   TblArgsPtr tbl
1795 )
1796 
1797 {
1798   SeqAnnotPtr  anp;
1799   BioseqPtr    bsp;
1800   Char         buf [80];
1801   Int2         genCode;
1802   SeqEntryPtr  oldscope;
1803   SeqEntryPtr  sep;
1804   SeqFeatPtr   sfp = NULL;
1805   SeqIdPtr     sip;
1806   SeqLocPtr    slp;
1807 
1808   if (sap == NULL || tbl == NULL) return NULL;
1809 
1810   bsp = GetBioseqReferencedByAnnot (sap, entityID);
1811   if (bsp == NULL) {
1812     oldscope = SeqEntrySetScope (NULL);
1813     if (oldscope != NULL) {
1814       bsp = GetBioseqReferencedByAnnot (sap, entityID);
1815       SeqEntrySetScope (oldscope);
1816     }
1817   }
1818   if (bsp != NULL) {
1819     sep = SeqMgrGetSeqEntryForData (bsp);
1820     entityID = ObjMgrGetEntityIDForChoice (sep);
1821     if (sap->type == 1) {
1822       sfp = (SeqFeatPtr) sap->data;
1823       genCode = GetGenCodeForBsp (bsp);
1824       SetEmptyGeneticCodes (sap, genCode);
1825     }
1826     if (bsp->annot == NULL) {
1827       bsp->annot = sap;
1828     } else {
1829       anp = bsp->annot;
1830       while (anp->next != NULL) {
1831         anp = anp->next;
1832       }
1833       anp->next = sap;
1834     }
1835     if (sfp != NULL) {
1836       if (tbl->smartfeats) {
1837 
1838         /* indexing needed to find mRNA and CDS within each gene */
1839 
1840         SeqMgrIndexFeatures (entityID, NULL);
1841 
1842         EnhanceFeatureAnnotation (sfp, bsp);
1843       }
1844 
1845       PromoteXrefsExEx (sfp, bsp, entityID, TRUE, FALSE, tbl->genprodset, tbl->forcelocalid);
1846       sep = GetTopSeqEntryForEntityID (entityID);
1847     }
1848   } else {
1849     buf [0] = '\0';
1850     if (sap->type == 1) {
1851       sfp = (SeqFeatPtr) sap->data;
1852       if (sfp != NULL && sfp->location != NULL) {
1853         slp = SeqLocFindNext (sfp->location, NULL);
1854         if (slp != NULL) {
1855           sip = SeqLocId (slp);
1856           if (sip != NULL) {
1857             SeqIdWrite (sip, buf, PRINTID_FASTA_LONG, sizeof (buf) - 1);
1858           }
1859         }
1860       }
1861     }
1862     Message (MSG_POSTERR, "Feature table identifiers %s do not match record", buf);
1863   }
1864   sep = GetTopSeqEntryForEntityID (entityID);
1865   return bsp;
1866 }
1867 
1868 static CharPtr TrimBracketsFromString (
1869   CharPtr str,
1870   SqnTagPtr stp
1871 )
1872 
1873 {
1874   Uchar    ch;    /* to use 8bit characters in multibyte languages */
1875   Int2     count;
1876   CharPtr  dst;
1877   CharPtr  ptr;
1878 
1879   if (StringHasNoText (str) || stp == NULL) return str;
1880 
1881   /* remove bracketed fields */
1882 
1883   count = 0;
1884   dst = str;
1885   ptr = str;
1886   ch = *ptr;
1887   while (ch != '\0') {
1888     if (ch == '[') {
1889       if (count < stp->num_tags && (! stp->used [count])) {
1890         *dst = ch;
1891         dst++;
1892         ptr++;
1893         ch = *ptr;
1894         while (ch != '\0' && ch != ']') {
1895           *dst = ch;
1896           dst++;
1897           ptr++;
1898           ch = *ptr;
1899         }
1900         *dst = ch;
1901         dst++;
1902         ptr++;
1903         ch = *ptr;
1904       } else {
1905         ptr++;
1906         ch = *ptr;
1907         while (ch != '\0' && ch != ']' && ch != '"') {
1908           ptr++;
1909           ch = *ptr;
1910         }
1911         if (ch == '"') {
1912           ptr++;
1913           ch = *ptr;
1914           while (ch != '\0' && ch != '"') {
1915             ptr++;
1916             ch = *ptr;
1917           }
1918         }
1919         while (ch != '\0' && ch != ']') {
1920           ptr++;
1921           ch = *ptr;
1922         }
1923         ptr++;
1924         ch = *ptr;
1925       }
1926       count++;
1927     } else {
1928       *dst = ch;
1929       dst++;
1930       ptr++;
1931       ch = *ptr;
1932     }
1933   }
1934   *dst = '\0';
1935 
1936   /* remove runs of whitespace characters */
1937 
1938   dst = str;
1939   ptr = str;
1940   ch = *ptr;
1941   while (ch != '\0') {
1942     if (IS_WHITESP (ch)) {
1943       *dst = ch;
1944       dst++;
1945       ptr++;
1946       ch = *ptr;
1947       while (IS_WHITESP (ch)) {
1948         ptr++;
1949         ch = *ptr;
1950       }
1951     } else {
1952       *dst = ch;
1953       dst++;
1954       ptr++;
1955       ch = *ptr;
1956     }
1957   }
1958   *dst = '\0';
1959 
1960   return str;
1961 }
1962 
1963 static Boolean HasTpaAccession (
1964   UserObjectPtr uop
1965 )
1966 
1967 {
1968   UserFieldPtr  curr;
1969   ObjectIdPtr   oip;
1970   CharPtr       str;
1971   UserFieldPtr  ufp;
1972 
1973   if (uop == NULL) return FALSE;
1974   if ((oip = uop->type) == NULL) return FALSE;
1975   if (StringCmp (oip->str, "TpaAssembly") != 0) return FALSE;
1976 
1977   for (curr = uop->data; curr != NULL; curr = curr->next) {
1978     if (curr->choice != 11) continue;
1979     for (ufp = curr->data.ptrvalue; ufp != NULL; ufp = ufp->next) {
1980       if (ufp->choice != 1) continue;
1981       oip = ufp->label;
1982       if (oip == NULL || StringICmp (oip->str, "accession") != 0) continue;
1983       str = (CharPtr) ufp->data.ptrvalue;
1984       if (StringDoesHaveText (str)) return TRUE;
1985     }
1986   }
1987 
1988   return FALSE;
1989 }
1990 
1991 static Boolean HasGenomeProjectDB (
1992   UserObjectPtr uop
1993 )
1994 
1995 {
1996   UserFieldPtr  curr;
1997   ObjectIdPtr   oip;
1998   Int4          val;
1999 
2000   if (uop == NULL) return FALSE;
2001   if ((oip = uop->type) == NULL) return FALSE;
2002   if (StringCmp (oip->str, "GenomeProjectsDB") != 0) return FALSE;
2003 
2004   for (curr = uop->data; curr != NULL; curr = curr->next) {
2005     oip = curr->label;
2006     if (oip == NULL || StringICmp (oip->str, "ProjectID") != 0) continue;
2007     if (curr->choice != 2) continue;
2008     val = (Int4) curr->data.intvalue;
2009     if (val > 0) return TRUE;
2010   }
2011 
2012   return FALSE;
2013 }
2014 
2015 static void GetFirstBiop (
2016   BioSourcePtr biop,
2017   Pointer userdata
2018 )
2019 
2020 {
2021   BioSourcePtr PNTR biopp;
2022 
2023   biopp = (BioSourcePtr PNTR) userdata;
2024   if (biop == NULL || biopp == NULL) return;
2025   if (*biopp != NULL) return;
2026   *biopp = biop;
2027 }
2028 
2029 static void ProcessOneNuc (
2030   Uint2 entityID,
2031   BioseqPtr bsp,
2032   BioSourcePtr src,
2033   TblArgsPtr tbl,
2034   MolInfoPtr template_molinfo
2035 )
2036 
2037 {
2038   Boolean        addNewBiop = TRUE;
2039   Boolean        addNewMip = TRUE;
2040   BioSourcePtr   biop = NULL;
2041   SeqFeatPtr     cds;
2042   GBBlockPtr     gbp;
2043   Int2           genCode;
2044   size_t         len;
2045   MolInfoPtr     mip = NULL;
2046   Boolean        mito;
2047   OrgNamePtr     onp;
2048   OrgRefPtr      orp;
2049   SeqDescrPtr    sdp;
2050   SeqHistPtr     shp;
2051   SqnTagPtr      stp = NULL;
2052   CharPtr        str;
2053   CharPtr        tmp;
2054   CharPtr        ttl = NULL;
2055   UserObjectPtr  uop;
2056   ValNodePtr     vnp;
2057   SeqMgrDescContext dcontext;
2058 
2059   if (bsp == NULL) return;
2060 
2061   genCode = GetGenCodeForBsp (bsp);
2062 
2063   if (bsp->mol == Seq_mol_na) {
2064     bsp->mol = Seq_mol_dna;
2065   }
2066 
2067   if (src != NULL) {
2068     src = AsnIoMemCopy ((Pointer) src,
2069                         (AsnReadFunc) BioSourceAsnRead,
2070                         (AsnWriteFunc) BioSourceAsnWrite);
2071   } else {
2072     sdp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_source, &dcontext);
2073     if (sdp != NULL) {
2074       src = sdp->data.ptrvalue;
2075       if (src != NULL) {
2076         addNewBiop = FALSE;
2077       }
2078     }
2079   }
2080 
2081   vnp = ValNodeExtract (&(bsp->descr), Seq_descr_title);
2082   if (vnp != NULL) {
2083     ttl = (CharPtr) vnp->data.ptrvalue;
2084   }
2085 
2086   if (ttl != NULL || tbl->srcquals != NULL) {
2087     len = StringLen (ttl) + StringLen (tbl->srcquals) + 5;
2088     str = (CharPtr) MemNew (len * sizeof (Char));
2089     if (str != NULL) {
2090       StringCpy (str, ttl);
2091       if (ttl != NULL && tbl->srcquals != NULL) {
2092         StringCat (str, "; ");
2093       }
2094       StringCat (str, tbl->srcquals);
2095       stp = SqnTagParse (str);
2096     }
2097     MemFree (str);
2098   }
2099 
2100   if (stp != NULL) {
2101     biop = ParseTitleIntoBioSource (stp, tbl->organism, src);
2102     ParseTitleIntoBioseq (stp, bsp);
2103     str = SqnTagFind (stp, "comment");
2104     if (str != NULL) {
2105       tmp = StringSave (str);
2106       SeqDescrAddPointer (&(bsp->descr), Seq_descr_comment, (Pointer) tmp);
2107     }
2108   }
2109   if (biop == NULL) {
2110     biop = ParseTitleIntoBioSource (NULL, tbl->organism, src);
2111   }
2112   if (biop != NULL && addNewBiop) {
2113     SeqDescrAddPointer (&(bsp->descr), Seq_descr_source, (Pointer) biop);
2114   }
2115   if (biop != NULL) {
2116     AddMissingSourceInfo (biop);
2117   }
2118 
2119   sdp = BioseqGetSeqDescr (bsp, Seq_descr_molinfo, NULL);
2120   if (sdp != NULL && sdp->choice == Seq_descr_molinfo) {
2121     mip = (MolInfoPtr) sdp->data.ptrvalue;
2122     addNewMip = FALSE;
2123   } else {
2124     mip = MolInfoNew ();
2125   }
2126   if (mip != NULL) {
2127     if (stp != NULL) {
2128       mip = ParseTitleIntoMolInfo (stp, mip);
2129     }
2130     if (mip->biomol == 0 && template_molinfo != NULL)
2131     {
2132       mip->biomol = template_molinfo->biomol;
2133     }
2134     if (mip->biomol == 0) {
2135       mip->biomol = MOLECULE_TYPE_GENOMIC;
2136     }
2137     if (addNewMip) {
2138       SeqDescrAddPointer (&(bsp->descr), Seq_descr_molinfo, (Pointer) mip);
2139     }
2140     switch (mip->biomol) {
2141       case MOLECULE_TYPE_PRE_MRNA :
2142       case MOLECULE_TYPE_MRNA :
2143       case MOLECULE_TYPE_RRNA :
2144       case MOLECULE_TYPE_TRNA :
2145       case MOLECULE_TYPE_SNRNA :
2146       case MOLECULE_TYPE_SCRNA :
2147       case MOLECULE_TYPE_CRNA :
2148       case MOLECULE_TYPE_SNORNA :
2149       case MOLECULE_TYPE_TRANSCRIBED_RNA :
2150       case MOLECULE_TYPE_NCRNA :
2151       case MOLECULE_TYPE_TMRNA :
2152         if (bsp->mol == Seq_mol_dna) {
2153           str = SqnTagFind (stp, "molecule");
2154           if (str == NULL) {
2155             str = SqnTagFind (stp, "mol");
2156           }
2157           if (str != NULL) {
2158             if (StringICmp (str, "dna") == 0) break;
2159           }
2160           bsp->mol = Seq_mol_rna;
2161         }
2162         break;
2163       default :
2164         break;
2165     }
2166   }
2167 
2168   if (genCode == 0 && biop != NULL) {
2169     orp = biop->org;
2170     if (orp != NULL) {
2171       onp = orp->orgname;
2172       if (onp != NULL) {
2173         mito = (Boolean) (biop->genome == 4 || biop->genome == 5);
2174         if (mito) {
2175           genCode = onp->mgcode;
2176         } else {
2177           genCode = onp->gcode;
2178         }
2179       }
2180     }
2181   }
2182 
2183   if (StringDoesHaveText (tbl->comment)) {
2184     str = StringSave (tbl->comment);
2185     SeqDescrAddPointer (&(bsp->descr), Seq_descr_comment, (Pointer) str);
2186   }
2187   if (StringDoesHaveText (tbl->commentFile)) {
2188     str = StringSave (tbl->commentFile);
2189     SeqDescrAddPointer (&(bsp->descr), Seq_descr_comment, (Pointer) str);
2190   }
2191 
2192   if (stp != NULL) {
2193     gbp = ParseTitleIntoGenBank (stp, NULL);
2194     if (gbp != NULL && (gbp->extra_accessions != NULL || gbp->keywords != NULL)) {
2195       SeqDescrAddPointer (&(bsp->descr), Seq_descr_genbank, (Pointer) gbp);
2196     } else {
2197       gbp = GBBlockFree (gbp);
2198     }
2199 
2200     shp = ParseTitleIntoSeqHist (stp, NULL);
2201     if (shp != NULL && shp->replace_ids != NULL) {
2202       bsp->hist = SeqHistFree (bsp->hist);
2203       bsp->hist = shp;
2204     } else {
2205       shp = SeqHistFree (shp);
2206     }
2207   }
2208 
2209   if (stp != NULL) {
2210     uop = ParseTitleIntoTpaAssembly (stp, NULL);
2211     if (uop != NULL && HasTpaAccession (uop)) {
2212       SeqDescrAddPointer (&(bsp->descr), Seq_descr_user, (Pointer) uop);
2213     } else {
2214       uop = UserObjectFree (uop);
2215     }
2216   }
2217 
2218   if (stp != NULL) {
2219     uop = ParseTitleIntoGenomeProjectsDB (stp, NULL);
2220     if (uop != NULL && HasGenomeProjectDB (uop)) {
2221       SeqDescrAddPointer (&(bsp->descr), Seq_descr_user, (Pointer) uop);
2222     } else {
2223       uop = UserObjectFree (uop);
2224     }
2225   }
2226 
2227   /* look for pubmed IDs */
2228   if (stp != NULL) {
2229     AddPubsFromTitle (stp, &(bsp->descr)); 
2230   }
2231 
2232   if (tbl->findorf) {
2233     cds = AnnotateBestOrf (bsp, genCode, tbl->altstart, tbl->runonorf, stp);
2234     if (cds != NULL) {
2235       PromoteXrefsExEx (cds, bsp, entityID, TRUE, FALSE, FALSE, tbl->forcelocalid);
2236     }
2237   }
2238 
2239   TrimBracketsFromString (ttl, stp);
2240   if (StringDoesHaveText (ttl)) {
2241     str = StringSave (ttl);
2242     SeqDescrAddPointer (&(bsp->descr), Seq_descr_title, (Pointer) str);
2243   }
2244 
2245   if (stp != NULL) {
2246     SqnTagFree (stp);
2247   }
2248 
2249   ValNodeFreeData (vnp);
2250 }
2251 
2252 static void ProcessNucBioseqs (SeqEntryPtr top_sep, Uint2 entityID, BioSourcePtr src, TblArgsPtr tbl, MolInfoPtr template_molinfo)
2253 {
2254   BioseqPtr bsp;
2255   BioseqSetPtr bssp;
2256   SeqEntryPtr sep;
2257 
2258   if (top_sep == NULL || top_sep->data.ptrvalue == NULL) return;
2259   if (IS_Bioseq (top_sep)) {
2260     bsp = (BioseqPtr) top_sep->data.ptrvalue;
2261     if (!ISA_aa (bsp->mol)) {
2262       ProcessOneNuc (entityID, bsp, src, tbl, template_molinfo);
2263     }
2264   } else if (IS_Bioseq_set (top_sep)) {
2265     bssp = (BioseqSetPtr) top_sep->data.ptrvalue;
2266     for (sep = bssp->seq_set; sep != NULL; sep = sep->next) {
2267       ProcessNucBioseqs (sep, entityID, src, tbl, template_molinfo);
2268     }
2269   }
2270 }
2271 
2272 
2273 static void ProcessOneAnnot (
2274   SeqAnnotPtr sap,
2275   Uint2 entityID,
2276   TblArgsPtr tbl
2277 )
2278 
2279 {
2280   BioseqPtr   bsp;
2281   Int2        genCode;
2282   SeqFeatPtr  sfp;
2283   SeqEntryPtr sep;
2284 
2285   if (sap == NULL || tbl == NULL) return;
2286 
2287   bsp = AttachSeqAnnotEntity (entityID, sap, tbl);
2288   if (bsp == NULL) return;
2289 
2290     sep = GetTopSeqEntryForEntityID (entityID);
2291 
2292   /* correct all idx parent pointers */
2293 
2294   AssignIDsInEntity (entityID, 0, NULL);
2295 
2296   genCode = GetGenCodeForBsp (bsp);
2297 
2298   /* coercion of SeqIds to accession moved to ProcessOneRecord->MakeAccessionID */
2299 
2300   /* for parsed in features or best ORF, promote CDS products to protein bioseq */
2301 
2302   for (sap = bsp->annot; sap != NULL; sap = sap->next) {
2303     if (sap->type == 1) {
2304       SetEmptyGeneticCodes (sap, genCode);
2305       sfp = (SeqFeatPtr) sap->data;
2306       PromoteXrefsExEx (sfp, bsp, entityID, TRUE, FALSE, tbl->genprodset, tbl->forcelocalid);
2307     }
2308   }
2309   sep = GetTopSeqEntryForEntityID (entityID);
2310 }
2311 
2312 static void UpdateException (
2313   SeqFeatPtr sfp,
2314   CharPtr text
2315 )
2316 
2317 {
2318   size_t   len;
2319   CharPtr  str;
2320 
2321   if (sfp == NULL) return;
2322 
2323   sfp->excpt = TRUE;
2324 
2325   if (sfp->except_text == NULL) {
2326     sfp->except_text = StringSave (text);
2327   } else {
2328     len = StringLen (sfp->except_text) + StringLen (text) + 5;
2329     str = MemNew (sizeof (Char) * len);
2330     StringCpy (str, sfp->except_text);
2331     StringCat (str, ",");
2332     StringCat (str, text);
2333     sfp->except_text = MemFree (sfp->except_text);
2334     sfp->except_text = str;
2335   }
2336 }
2337 
2338 static void ReplaceOnePeptide (
2339   SimpleSeqPtr ssp,
2340   Boolean conflict,
2341   Boolean genprodset
2342 )
2343 
2344 {
2345   Uint1              aa;
2346   ByteStorePtr       bs;
2347   BioseqPtr          bsp, gen;
2348   SeqFeatPtr         cds;
2349   CdRegionPtr        crp;
2350   SeqMgrDescContext  dcontext;
2351   MolInfoPtr         mip;
2352   SeqFeatPtr         prt;
2353   SeqDescrPtr        sdp;
2354   SeqIntPtr          sintp;
2355   SeqIdPtr           sip;
2356   SeqLocPtr          slp;
2357   CharPtr            str, str1, str2;
2358   ValNodePtr         vnp;
2359 
2360   if (ssp == NULL || ssp->numid < 1) return;
2361 
2362   str = ssp->id [0];
2363   if (StringHasNoText (str)) {
2364     str = "?";
2365   }
2366   sip = MakeSeqID (str);
2367   bsp = BioseqFind (sip);
2368   SeqIdFree (sip);
2369   if (bsp == NULL) {
2370     Message (MSG_POSTERR, "Unable to find protein sequence %s", str);
2371   }
2372   if (bsp == NULL || bsp->repr != Seq_repr_raw) return;
2373 
2374   if (bsp->seq_data_type == Seq_code_gap) return;
2375 
2376   if (! ISA_aa (bsp->mol)) {
2377     Message (MSG_POSTERR, "Will not replace mRNA sequence %s with protein", str);
2378     return;
2379   }
2380 
2381   /* remove trailing X and * - now just trailing star */
2382 
2383   bs = ssp->seq;
2384   BSSeek (bs, -1, SEEK_END);
2385   aa = (Uint1) BSGetByte (bs);
2386   while (( /* aa == 'X' || */ aa == '*') && ssp->seqlen > 0) {
2387     BSSeek (bs, -1, SEEK_END);
2388     BSDelete (bs, 1);
2389     BSSeek (bs, -1, SEEK_END);
2390     aa = (Uint1) BSGetByte (bs);
2391   }
2392   ssp->seqlen = BSLen (bs);
2393 
2394   str1 = BSMerge (ssp->seq, NULL);
2395   str2 = BSMerge ((ByteStorePtr) bsp->seq_data, NULL);
2396 
2397   if (StringCmp (str1, str2) != 0) {
2398 
2399     /* swap sequence byte stores */
2400 
2401     bs = (ByteStorePtr) bsp->seq_data;
2402     bsp->seq_data = (SeqDataPtr) ssp->seq;
2403     ssp->seq = bs;
2404     bsp->length = BSLen ((ByteStorePtr) bsp->seq_data);
2405     bsp->seq_data_type = Seq_code_ncbieaa;
2406 
2407     if (genprodset) {
2408 
2409       /* SeqMgrGetCDSgivenProduct here would return CDS within nuc-prot set, not genomic */
2410 
2411       for (vnp = SeqMgrGetSfpProductList (bsp); vnp != NULL; vnp = vnp->next) {
2412         cds = (SeqFeatPtr) vnp->data.ptrvalue;
2413         if (cds == NULL || cds->data.choice != SEQFEAT_CDREGION) continue;
2414         gen = BioseqFindFromSeqLoc (cds->location);
2415         if (gen == NULL) continue;
2416 
2417         sdp = SeqMgrGetNextDescriptor (gen, NULL, Seq_descr_molinfo, &dcontext);
2418         if (sdp == NULL) continue;
2419         mip = (MolInfoPtr) sdp->data.ptrvalue;
2420         if (mip == NULL) continue;
2421 
2422         if (mip->biomol == MOLECULE_TYPE_GENOMIC) {
2423 
2424           UpdateException (cds, "translated product replaced");
2425 
2426         } else if (mip->biomol == MOLECULE_TYPE_MRNA) {
2427 
2428           crp = (CdRegionPtr) cds->data.value.ptrvalue;
2429           if (crp != NULL && conflict) {
2430 
2431             /* mark CDS in nuc-prot set for coordinate adjustment */
2432 
2433             crp->conflict = TRUE;
2434           }
2435         }
2436       }
2437 
2438     } else {
2439 
2440       cds = SeqMgrGetCDSgivenProduct (bsp, NULL);
2441       if (cds != NULL) {
2442         UpdateException (cds, "translated product replaced");
2443       }
2444     }
2445 
2446     prt = SeqMgrGetBestProteinFeature (bsp, NULL);
2447     if (prt != NULL) {
2448       slp = prt->location;
2449       if (slp != NULL && slp->choice == SEQLOC_INT) {
2450         sintp = (SeqIntPtr) slp->data.ptrvalue;
2451         if (sintp != NULL) {
2452           sintp->to = bsp->length - 1;
2453         }
2454       }
2455     }
2456   }
2457 
2458   MemFree (str1);
2459   MemFree (str2);
2460 }
2461 
2462 static void ReplaceOneRNA (
2463   SimpleSeqPtr ssp,
2464   Boolean conflict
2465 )
2466 
2467 {
2468   ByteStorePtr       bs;
2469   BioseqPtr          bsp;
2470   SeqMgrFeatContext  ccontext;
2471   SeqFeatPtr         cds, mrna;
2472   SeqIntPtr          sintp;
2473   SeqIdPtr           sip;
2474   SeqLocPtr          slp;
2475   CharPtr            str, str1, str2;
2476 
2477   if (ssp == NULL || ssp->numid < 1) return;
2478 
2479   str = ssp->id [0];
2480   if (StringHasNoText (str)) {
2481     str = "?";
2482   }
2483   sip = MakeSeqID (str);
2484   bsp = BioseqFind (sip);
2485   SeqIdFree (sip);
2486   if (bsp == NULL) {
2487     Message (MSG_POSTERR, "Unable to find mRNA sequence %s", str);
2488   }
2489   if (bsp == NULL || bsp->repr != Seq_repr_raw) return;
2490   if (! ISA_na (bsp->mol)) {
2491     Message (MSG_POSTERR, "Will not replace protein sequence %s with mRNA", str);
2492     return;
2493   }
2494 
2495   /* remove trailing X and * */
2496 
2497   bs = ssp->seq;
2498   ssp->seqlen = BSLen (bs);
2499 
2500   str1 = BSMerge (ssp->seq, NULL);
2501   str2 = GetSequenceByBsp (bsp);
2502 
2503   if (StringCmp (str1, str2) != 0) {
2504 
2505     /* swap sequence byte stores */
2506 
2507     bs = (ByteStorePtr) bsp->seq_data;
2508     bsp->seq_data = (SeqDataPtr) ssp->seq;
2509     ssp->seq = bs;
2510     bsp->length = BSLen ((ByteStorePtr) bsp->seq_data);
2511     bsp->seq_data_type = Seq_code_iupacna;
2512     BioseqPack (bsp);
2513 
2514     mrna = SeqMgrGetRNAgivenProduct (bsp, NULL);
2515     if (mrna != NULL) {
2516       UpdateException (mrna, "transcribed product replaced");
2517 
2518       /*
2519       if (conflict) {
2520         mrna->excpt = TRUE;
2521         if (StringHasNoText (mrna->except_text)) {
2522           mrna->except_text = StringSave ("RNA editing");
2523         }
2524       }
2525       */
2526     }
2527 
2528     /* make sure CDS in nuc-prot set is not longer than just-replaced RNA */
2529 
2530     cds = SeqMgrGetNextFeature (bsp, NULL, SEQFEAT_CDREGION, 0, &ccontext);
2531     if (cds != NULL) {
2532       slp = cds->location;
2533       if (slp != NULL && slp->choice == SEQLOC_INT) {
2534         sintp = (SeqIntPtr) slp->data.ptrvalue;
2535         if (sintp != NULL) {
2536           if (sintp->from == 0 && sintp->to > bsp->length - 1) {
2537             sintp->to = bsp->length - 1;
2538           }
2539         }
2540       }
2541     }
2542   }
2543 
2544   MemFree (str1);
2545   MemFree (str2);
2546 }
2547 
2548 static SeqLocPtr PredictOneCodingRegion (BioseqPtr nucbsp, BioseqPtr protbsp, Int2 genCode)
2549 
2550 {
2551   BioseqPtr    bsp;
2552   SeqLocPtr    oldslp;
2553   SeqAnnotPtr  sap;
2554   SeqFeatPtr   sfp;
2555   SeqIdPtr     sip;
2556   SeqLocPtr    slp;
2557 
2558   slp = NULL;
2559   sap = SuggestCodingRegion (nucbsp, protbsp, genCode);
2560   if (sap != NULL && sap->type == 1) {
2561     sfp = (SeqFeatPtr) sap->data;
2562     if (sfp != NULL && sfp->data.choice == SEQFEAT_CDREGION) {
2563       slp = sfp->location;
2564       sfp->location = NULL;
2565       sip = SeqLocId (slp);
2566       if (sip != NULL) {
2567         bsp = BioseqFind (sip);
2568         if (bsp != NULL) {
2569           if (bsp->repr == Seq_repr_seg) {
2570             oldslp = slp;
2571             slp = SegLocToParts (bsp, oldslp);
2572             FreeAllFuzz (slp);
2573             SeqLocFree (oldslp);
2574           }
2575         }
2576       }
2577     }
2578   }
2579   sap = SeqAnnotFree (sap);
2580   StripLocusFromSeqLoc (slp);
2581   return slp;
2582 }
2583 
2584 static void SuggestOnePeptide (
2585   BioseqPtr nucbsp,
2586   BioseqPtr protbsp,
2587   Int2 genCode
2588 )
2589 
2590 {
2591   SeqFeatPtr   cds;
2592   CdRegionPtr  crp;
2593   SeqFeatPtr   gene;
2594   GeneRefPtr   grp;
2595   Boolean      partial5;
2596   Boolean      partial3;
2597   ProtRefPtr   prp;
2598   SeqFeatPtr   prt;
2599   SeqLocPtr    slp;
2600   SqnTagPtr    stp;
2601   CharPtr      ttl;
2602   ValNodePtr   vnp;
2603 
2604   if (nucbsp == NULL || protbsp == NULL) return;
2605   slp = PredictOneCodingRegion (nucbsp, protbsp, genCode);
2606   if (slp == NULL) return;
2607 
2608   crp = CreateNewCdRgn (0, FALSE, genCode);
2609   if (crp != NULL) {
2610     CheckSeqLocForPartial (slp, &partial5, &partial3);
2611 
2612     cds = CreateNewFeatureOnBioseq (nucbsp, SEQFEAT_CDREGION, slp);
2613     if (cds != NULL) {
2614       cds->data.value.ptrvalue = (Pointer) crp;
2615       cds->partial |= partial5 | partial3;
2616       SetSeqFeatProduct (cds, protbsp);
2617     }
2618 
2619     if (protbsp->descr != NULL) {
2620       vnp = ValNodeExtract (&(protbsp->descr), Seq_descr_title);
2621       if (vnp != NULL) {
2622         ttl = (CharPtr) vnp->data.ptrvalue;
2623         if (ttl != NULL) {
2624           stp = SqnTagParse (ttl);
2625           if (stp != NULL) {
2626 
2627             prp = ProtRefNew ();
2628             prp = ParseTitleIntoProtRef (stp, prp);
2629             if (prp != NULL) {
2630               if (prp->name == NULL && prp->desc == NULL) {
2631                 prp->name = ValNodeCopyStr (NULL, 0, "unknown");
2632               }
2633               prt = CreateNewFeatureOnBioseq (protbsp, SEQFEAT_PROT, NULL);
2634               if (prt != NULL) {
2635                 prt->data.value.ptrvalue = (Pointer) prp;
2636                 prt->partial |= partial5 | partial3;
2637               }
2638             }
2639 
2640             grp = GeneRefNew ();
2641             grp = ParseTitleIntoGeneRef (stp, grp);
2642             if (grp != NULL) {
2643               if (grp->locus == NULL && grp->syn == NULL) {
2644                 GeneRefFree (grp);
2645               } else {
2646                 gene = CreateNewFeatureOnBioseq (nucbsp, SEQFEAT_GENE, NULL);
2647                 if (gene != NULL) {
2648                   gene->data.value.ptrvalue = (Pointer) grp;
2649                   gene->partial |= partial5 | partial3;
2650                   gene->location = SeqLocFree (gene->location);
2651                   gene->location = SeqLocMerge (nucbsp, slp, NULL, TRUE, TRUE, TRUE);
2652                 }
2653               }
2654             }
2655 
2656             SqnTagFree (stp);
2657           }
2658         }
2659 
2660         ValNodeFreeData (vnp);
2661       }
2662     }
2663   }
2664 
2665   SeqLocFree (slp);
2666 }
2667 
2668 static void RnaProtTrailingCommaFix (SeqFeatPtr sfp, Pointer userdata)
2669 
2670 {
2671   Char        ch;
2672   size_t      len;
2673   ProtRefPtr  prp;
2674   RnaRefPtr   rrp;
2675   CharPtr     str;
2676   ValNodePtr  vnp;
2677 
2678   if (sfp == NULL) return;
2679 
2680   if (sfp->data.choice == SEQFEAT_PROT) {
2681     prp = (ProtRefPtr) sfp->data.value.ptrvalue;
2682     /* turn trailing space into trailing underscore for validator */
2683     for (vnp = prp->name; vnp != NULL; vnp = vnp->next) {
2684       str = (CharPtr) vnp->data.ptrvalue;
2685       if (StringHasNoText (str)) continue;
2686       len = StringLen (str);
2687       if (len < 1) continue;
2688       ch = str [len - 1];
2689       while (ch == ' ' && len > 2) {
2690         len--;
2691         ch = str [len - 1];
2692       }
2693       if (ch == ',') {
2694         str [len - 1] = '_';
2695         str [len] = '\0';
2696       }
2697     }
2698   } else if (sfp->data.choice == SEQFEAT_RNA) {
2699     rrp = (RnaRefPtr) sfp->data.value.ptrvalue;
2700     /* turn trailing space into trailing underscore for validator */
2701     if (rrp->ext.choice == 1) {
2702       str = rrp->ext.value.ptrvalue;
2703       if (StringDoesHaveText (str)) {
2704         len = StringLen (str);
2705         if (len > 0) {
2706           ch = str [len - 1];
2707           while (ch == ' ' && len > 2) {
2708             len--;
2709             ch = str [len - 1];
2710           }
2711           if (ch == ',') {
2712             str [len - 1] = '_';
2713             str [len] = '\0';
2714           }
2715         }
2716       }
2717     }
2718   }
2719 }
2720 
2721 static Uint2 ProcessOneAsn (
2722   FILE* fp,
2723   BioSourcePtr src,
2724   TblArgsPtr tbl,
2725   CharPtr localname,
2726   SeqEntryPtr gsep,
2727   MolInfoPtr template_molinfo
2728 )
2729 
2730 {
2731   BioseqPtr      bsp = NULL;
2732   BioseqSetPtr   bssp;
2733   Pointer        dataptr;
2734   Uint2          datatype, entityID;
2735   ObjMgrDataPtr  omdptop;
2736   ObjMgrData     omdata;
2737   Uint2          parenttype;
2738   Pointer        parentptr;
2739   SeqEntryPtr    sep;
2740   SeqIdPtr       sip;
2741 
2742   if (fp == NULL) return 0;
2743 
2744   if (gsep != NULL) {
2745     bssp = (BioseqSetPtr) gsep->data.ptrvalue;
2746     if (bssp == NULL) return 0;
2747 
2748     SaveSeqEntryObjMgrData (gsep, &omdptop, &omdata);
2749     GetSeqEntryParent (gsep, &parentptr, &parenttype);
2750 
2751     dataptr = ReadAsnFastaOrFlatFile (fp, &datatype, NULL, TRUE, FALSE, TRUE, FALSE);
2752     if (datatype == OBJ_BIOSEQ) {
2753       bssp->seq_set = SeqMgrGetSeqEntryForData (dataptr);
2754       SeqMgrSeqEntry (SM_BIOSEQ, (Pointer) dataptr, gsep);
2755     } else if (datatype == OBJ_BIOSEQSET) {
2756       bssp->seq_set = SeqMgrGetSeqEntryForData (dataptr);
2757       SeqMgrSeqEntry (SM_BIOSEQSET, (Pointer) dataptr, gsep);
2758     } else if (datatype == OBJ_SEQENTRY) {
2759       sep = (SeqEntryPtr) dataptr;
2760       bssp->seq_set = sep;
2761       if (IS_Bioseq (sep)) {
2762         SeqMgrSeqEntry (SM_BIOSEQ, (Pointer) sep->data.ptrvalue, gsep);
2763       } else if (IS_Bioseq_set (sep)) {
2764         SeqMgrSeqEntry (SM_BIOSEQSET, (Pointer) sep->data.ptrvalue, gsep);
2765       } else return 0;
2766     } else return 0;
2767 
2768     SeqMgrLinkSeqEntry (gsep, parenttype, parentptr);
2769     RestoreSeqEntryObjMgrData (gsep, omdptop, &omdata);
2770 
2771     entityID = ObjMgrRegister (OBJ_BIOSEQSET, (Pointer) bssp);
2772   } else {
2773     dataptr = ReadAsnFastaOrFlatFile (fp, &datatype, &entityID, TRUE, FALSE, TRUE, FALSE);
2774   }
2775   if (dataptr == NULL) return 0;
2776 
2777   sep = GetTopSeqEntryForEntityID (entityID);
2778   bsp = FindNucBioseq (sep);
2779   if (bsp == NULL) {
2780     ObjMgrFreeByEntityID (entityID);
2781     return 0;
2782   }
2783 
2784   VisitFeaturesInSep (sep, NULL, RnaProtTrailingCommaFix);
2785 
2786   if (StringDoesHaveText (localname)) {
2787     sip = MakeSeqID (localname);
2788     if (sip != NULL) {
2789       bsp->id = SeqIdSetFree (bsp->id);
2790       bsp->id = sip;
2791       SeqMgrReplaceInBioseqIndex (bsp);
2792       VisitFeaturesOnBsp (bsp, (Pointer) bsp->id, CorrectFeatureSeqIds);
2793     }
2794   }
2795 
2796   ProcessNucBioseqs (sep, entityID, src, tbl, template_molinfo);
2797 
2798   return entityID;
2799 }
2800 
2801 static Uint2 ProcessRaw2Delt (
2802   FILE* fp,
2803   BioSourcePtr src,
2804   TblArgsPtr tbl,
2805   CharPtr localname,
2806   SeqEntryPtr gsep,
2807   MolInfoPtr template_molinfo
2808 )
2809 
2810 {
2811   BioseqPtr      bsp = NULL;
2812   BioseqSetPtr   bssp;
2813   Pointer        dataptr;
2814   Uint2          datatype, entityID;
2815   Int4           gap_sizes [2];
2816   ObjMgrDataPtr  omdptop;
2817   ObjMgrData     omdata;
2818   Uint2          parenttype;
2819   Pointer        parentptr;
2820   SeqEntryPtr    sep;
2821   SeqIdPtr       sip;
2822 
2823   if (fp == NULL) return 0;
2824 
2825   if (gsep != NULL) {
2826     bssp = (BioseqSetPtr) gsep->data.ptrvalue;
2827     if (bssp == NULL) return 0;
2828 
2829     SaveSeqEntryObjMgrData (gsep, &omdptop, &omdata);
2830     GetSeqEntryParent (gsep, &parentptr, &parenttype);
2831 
2832     dataptr = ReadAsnFastaOrFlatFile (fp, &datatype, NULL, TRUE, FALSE, TRUE, FALSE);
2833     if (datatype == OBJ_BIOSEQ) {
2834       bssp->seq_set = SeqMgrGetSeqEntryForData (dataptr);
2835       SeqMgrSeqEntry (SM_BIOSEQ, (Pointer) dataptr, gsep);
2836     } else if (datatype == OBJ_BIOSEQSET) {
2837       bssp->seq_set = SeqMgrGetSeqEntryForData (dataptr);
2838       SeqMgrSeqEntry (SM_BIOSEQSET, (Pointer) dataptr, gsep);
2839     } else if (datatype == OBJ_SEQENTRY) {
2840       sep = (SeqEntryPtr) dataptr;
2841       bssp->seq_set = sep;
2842       if (IS_Bioseq (sep)) {
2843         SeqMgrSeqEntry (SM_BIOSEQ, (Pointer) sep->data.ptrvalue, gsep);
2844       } else if (IS_Bioseq_set (sep)) {
2845         SeqMgrSeqEntry (SM_BIOSEQSET, (Pointer) sep->data.ptrvalue, gsep);
2846       } else return 0;
2847     } else return 0;
2848 
2849     SeqMgrLinkSeqEntry (gsep, parenttype, parentptr);
2850     RestoreSeqEntryObjMgrData (gsep, omdptop, &omdata);
2851 
2852     entityID = ObjMgrRegister (OBJ_BIOSEQSET, (Pointer) bssp);
2853   } else {
2854     dataptr = ReadAsnFastaOrFlatFile (fp, &datatype, &entityID, TRUE, FALSE, TRUE, FALSE);
2855   }
2856   if (dataptr == NULL) return 0;
2857 
2858   sep = GetTopSeqEntryForEntityID (entityID);
2859   bsp = FindNucBioseq (sep);
2860   if (bsp == NULL) {
2861     ObjMgrFreeByEntityID (entityID);
2862     return 0;
2863   }
2864 
2865   VisitFeaturesInSep (sep, NULL, RnaProtTrailingCommaFix);
2866 
2867   if (StringDoesHaveText (localname)) {
2868     sip = MakeSeqID (localname);
2869     if (sip != NULL) {
2870       bsp->id = SeqIdSetFree (bsp->id);
2871       bsp->id = sip;
2872       SeqMgrReplaceInBioseqIndex (bsp);
2873       VisitFeaturesOnBsp (bsp, (Pointer) bsp->id, CorrectFeatureSeqIds);
2874     }
2875   }
2876 
2877   if (bsp->repr == Seq_repr_raw) {
2878     if (tbl->r2dunk100) {
2879       gap_sizes [0] = 100;
2880     } else {
2881       gap_sizes [0] = 0;
2882     }
2883     gap_sizes [1] = -(tbl->r2dmin);
2884 
2885     ConvertNsToGaps (bsp, gap_sizes);
2886   }
2887 
2888   ProcessOneNuc (entityID, bsp, src, tbl, template_molinfo);
2889 
2890   return entityID;
2891 }
2892 
2893 static Uint2 ProcessGappedSet (
2894   FILE* fp,
2895   BioSourcePtr src,
2896   TblArgsPtr tbl,
2897   SeqEntryPtr gsep,
2898   MolInfoPtr template_molinfo
2899 )
2900 
2901 {
2902   BioseqPtr      bsp = NULL;
2903   BioseqSetPtr   bssp;
2904   Uint2          entityID;
2905   ObjMgrDataPtr  omdptop;
2906   ObjMgrData     omdata;
2907   Uint2          parenttype;
2908   Pointer        parentptr;
2909   SeqEntryPtr    sep;
2910 
2911   if (fp == NULL) return 0;
2912 
2913   if (gsep != NULL) {
2914     bssp = (BioseqSetPtr) gsep->data.ptrvalue;
2915     if (bssp == NULL) return 0;
2916 
2917     SaveSeqEntryObjMgrData (gsep, &omdptop, &omdata);
2918     GetSeqEntryParent (gsep, &parentptr, &parenttype);
2919 
2920     bsp = ReadDeltaFasta (fp, NULL);
2921     if (bsp != NULL) {
2922       sep = SeqMgrGetSeqEntryForData (bsp);
2923       bssp->seq_set = sep;
2924       SeqMgrSeqEntry (SM_BIOSEQ, (Pointer) bsp, gsep);
2925     } else return 0;
2926 
2927     SeqMgrLinkSeqEntry (gsep, parenttype, parentptr);
2928     RestoreSeqEntryObjMgrData (gsep, omdptop, &omdata);
2929 
2930     entityID = ObjMgrRegister (OBJ_BIOSEQSET, (Pointer) bssp);
2931   } else {
2932     bsp = ReadDeltaFasta (fp, NULL);
2933     if (bsp != NULL) {
2934       entityID = ObjMgrRegister (OBJ_BIOSEQ, (Pointer) bsp);
2935     }
2936   }
2937   if (bsp == NULL) return 0;
2938 
2939   sep = GetTopSeqEntryForEntityID (entityID);
2940   bsp = FindNucBioseq (sep);
2941   if (bsp == NULL) {
2942     ObjMgrFreeByEntityID (entityID);
2943     return 0;
2944   }
2945 
2946   VisitFeaturesInSep (sep, NULL, RnaProtTrailingCommaFix);
2947 
2948   ProcessOneNuc (entityID, bsp, src, tbl, template_molinfo);
2949 
2950   return entityID;
2951 }
2952 
2953 typedef struct resqseqgph {
2954   Int2         index;
2955   SeqGraphPtr  sgp;
2956 } ResqSeqgph, PNTR ResqSeqgphPtr;
2957 
2958 static void RescueSeqGraphs (
2959   BioseqPtr bsp,
2960   Int2 index,
2961   ValNodePtr PNTR vnpp
2962 )
2963 
2964 {
2965   SeqAnnotPtr    nextsap;
2966   SeqGraphPtr    nextsgp;
2967   Pointer PNTR   prevsap;
2968   Pointer PNTR   prevsgp;
2969   ResqSeqgphPtr  rsp;
2970   SeqAnnotPtr    sap;
2971   SeqGraphPtr    sgp;
2972 
2973   if (bsp == NULL || vnpp == NULL) return;
2974   sap = bsp->annot;
2975   prevsap = (Pointer PNTR) &(bsp->annot);
2976   while (sap != NULL) {
2977     nextsap = sap->next;
2978     if (sap->type == 3) {
2979       sgp = (SeqGraphPtr) sap->data;
2980       prevsgp = (Pointer PNTR) &(sap->data);
2981       while (sgp != NULL) {
2982         nextsgp = sgp->next;
2983         *(prevsgp) = sgp->next;
2984         sgp->next = NULL;
2985         rsp = (ResqSeqgphPtr) MemNew (sizeof (ResqSeqgph));
2986         rsp->index = index;
2987         rsp->sgp = sgp;
2988         ValNodeAddPointer (vnpp, 0, (Pointer) rsp);
2989         sgp = nextsgp;
2990       }
2991     }
2992     if (sap->data == NULL) {
2993       *(prevsap) = sap->next;
2994       sap->next = NULL;
2995       SeqAnnotFree (sap);
2996     } else {
2997       prevsap = (Pointer PNTR) &(sap->next);
2998     }
2999     sap = nextsap;
3000   }
3001 }
3002 
3003 static SeqAnnotPtr NewSeqAnnotType3 (
3004   CharPtr name,
3005   SeqGraphPtr sgp
3006 )
3007 
3008 {
3009   SeqAnnotPtr  sap = NULL;
3010 
3011   if (sgp == NULL) return NULL;
3012   sap = SeqAnnotNew ();
3013   if (sap == NULL) return NULL;
3014 
3015   if (StringDoesHaveText (name)) {
3016     SeqDescrAddPointer (&(sap->desc), Annot_descr_name, StringSave (name));
3017   }
3018   sap->type = 3;
3019   sap->data = (Pointer) sgp;
3020 
3021   return sap;
3022 }
3023 
3024 static void OffsetAndLinkSeqGraph (
3025   BioseqPtr bsp,
3026   SeqGraphPtr sgp,
3027   Int2 index
3028 )
3029 
3030 {
3031   DeltaSeqPtr  dsp;
3032   SeqGraphPtr  lastsgp;
3033   Int4         len;
3034   SeqLitPtr    litp;
3035   SeqAnnotPtr  sap;
3036   SeqIntPtr    sintp;
3037   SeqLocPtr    slp;
3038 
3039   if (bsp == NULL || sgp == NULL || index < 1) return;
3040   len = 0;
3041   if (bsp->repr == Seq_repr_delta && bsp->seq_ext_type == 4) {
3042     for (dsp = (DeltaSeqPtr) (bsp->seq_ext);
3043          dsp != NULL && index > 1; dsp = dsp->next, index--) {
3044       if (dsp->choice == 1) {
3045         len += SeqLocLen ((SeqLocPtr) dsp->data.ptrvalue);
3046       } else if (dsp->choice == 2) {
3047         litp = (SeqLitPtr) dsp->data.ptrvalue;
3048         if (litp != NULL) {
3049           len += litp->length;
3050         }
3051       }
3052     }
3053   }
3054   slp = sgp->loc;
3055   if (slp != NULL && slp->choice == SEQLOC_INT) {
3056     sintp = (SeqIntPtr) slp->data.ptrvalue;
3057     if (sintp != NULL) {
3058       sintp->from += len;
3059       sintp->to += len;
3060       sintp->id = SeqIdFree (sintp->id);
3061       sintp->id = SeqIdDup (bsp->id);
3062     }
3063   }
3064   for (sap = bsp->annot; sap != NULL; sap = sap->next) {
3065     if (sap->type == 3) {
3066       for (lastsgp = sap->data; lastsgp->next != NULL; lastsgp = lastsgp->next) {
3067         continue;
3068       }
3069       lastsgp->next = sgp;
3070       break;
3071     }
3072   }
3073   if (sap == NULL) {
3074     if (bsp->annot != NULL) {
3075       for (sap = bsp->annot; sap->next != NULL; sap = sap->next) {
3076         continue;
3077       }
3078       sap->next = NewSeqAnnotType3 ("Phrap Graph", sgp);
3079     } else {
3080       bsp->annot = NewSeqAnnotType3 ("Phrap Graph", sgp);
3081     }
3082   }
3083 }
3084 
3085 static CharPtr BioseqGetLocalIdStr (
3086   BioseqPtr bsp
3087 )
3088 
3089 {
3090   ObjectIdPtr  oip;
3091   SeqIdPtr     sip;
3092 
3093   if (bsp == NULL) return NULL;
3094   for (sip = bsp->id; sip != NULL; sip = sip->next) {
3095     if (sip->choice == SEQID_LOCAL) {
3096       oip = (ObjectIdPtr) sip->data.ptrvalue;
3097       if (oip != NULL && oip->str != NULL) {
3098         return oip->str;
3099       }
3100     }
3101   }
3102   return NULL;
3103 }
3104 
3105 typedef struct reqcontig {
3106   Int2  index;
3107   Char  str [41];
3108 } ResqContig, PNTR ResqContigPtr;
3109 
3110 #define MAX_FIELDS  8
3111 
3112 static CharPtr ReadContigFile (
3113   CharPtr directory,
3114   CharPtr base,
3115   ValNodePtr PNTR fragmentgroupsp,
3116   CharPtr dumsp6,
3117   CharPtr dumt7,
3118   CharPtr PNTR sp6_clonep,
3119   CharPtr PNTR sp6_endp,
3120   CharPtr PNTR t7_clonep,
3121   CharPtr PNTR t7_endp
3122 )
3123 
3124 {
3125   Char        buf [256], instr [120];
3126   FileCache   fc;
3127   CharPtr     field [MAX_FIELDS];
3128   FILE        *fp;
3129   int         frg;
3130   Boolean     left_end, right_end, nonewline;
3131   Int4        len;
3132   Int2        numFields;
3133   CharPtr     pstring = NULL, ptr, str, sp6_end = NULL, t7_end = NULL;
3134   ValNodePtr  rescuedcontigs = NULL, vnp;
3135 
3136   fp = OpenOneFile (directory, base, ".ctg");
3137   if (fp == NULL) return NULL;
3138 
3139   FileCacheSetup (&fc, fp);
3140 
3141   str = FileCacheReadLine (&fc, buf, sizeof (buf), &nonewline);
3142   while (str != NULL) {
3143     MemSet ((Pointer) field, 0, sizeof (field));
3144 
3145    /*
3146    *  parse tab-delimited output line into array of fields, avoiding use of
3147    *  strtok so that empty columns (adjacent tabs) are properly assigned to
3148    *  field array
3149    */
3150 
3151     ptr = buf;
3152     for (numFields = 0; numFields < MAX_FIELDS && ptr != NULL; numFields++) {
3153       field [numFields] = ptr;
3154       ptr = StringChr (ptr, '\t');
3155       if (ptr == NULL) {
3156         ptr = StringChr (ptr, '\n');
3157       }
3158       if (ptr == NULL) {
3159         ptr = StringChr (ptr, '\r');
3160       }
3161       if (ptr != NULL) {
3162         *ptr = '\0';
3163         ptr++;
3164       }
3165     }
3166 
3167     if (StringDoesHaveText (field [0])) {
3168       StringNCpy_0 (instr, field [0], sizeof (instr) - 2);
3169       if (StringDoesHaveText (field [1])) {
3170         if (StringNICmp (field [1], "-", 1) == 0) {
3171           StringCat (instr, "-");
3172         }
3173       }
3174       ValNodeCopyStr (&rescuedcontigs, 0, instr);
3175       if (StringDoesHaveText (field [2])) {
3176         if (sscanf (field [2], "%d", &frg) == 1) {
3177           ValNodeCopyStr (fragmentgroupsp, (Uint1) frg, field [0]);
3178         }
3179       }
3180       left_end = FALSE;
3181       right_end = FALSE;
3182       if (StringDoesHaveText (field [3])) {
3183          if (StringDoesHaveText (field [4])) {
3184            if (StringNICmp (field [4], "l", 1) == 0) {
3185             left_end = TRUE;
3186            } else if (StringNICmp (field [4], "r", 1) == 0) {
3187              right_end = TRUE;
3188            }
3189          }
3190          if (StringICmp (field [3], "sp6") == 0) {
3191            StringCpy (dumsp6, field [0]);
3192            if (left_end) {
3193              StringCat (dumsp6, ",left");
3194            } else if (right_end) {
3195              StringCat (dumsp6, ",right");
3196            }
3197            if (sp6_clonep != NULL && *sp6_clonep == NULL) {
3198              *sp6_clonep = dumsp6;
3199            }
3200          } else if (StringICmp (field [3], "t7") == 0) {
3201            StringCpy (dumt7, field [0]);
3202            if (left_end) {
3203              StringCat (dumt7, ",left");
3204            } else if (right_end) {
3205              StringCat (dumt7, ",right");
3206            }
3207            if (t7_clonep != NULL && *t7_clonep == NULL) {
3208              *t7_clonep = dumt7;
3209            }
3210          }
3211       }
3212     }
3213     str = FileCacheReadLine (&fc, buf, sizeof (buf), &nonewline);
3214   }
3215 
3216   FileClose (fp);
3217 
3218   len = 0;
3219   for (vnp = rescuedcontigs; vnp != NULL; vnp = vnp->next) {
3220     len += StringLen ((CharPtr) vnp->data.ptrvalue) + 1;
3221   }
3222   if (len > 1) {
3223     pstring = MemNew ((size_t) (len + 2));
3224     for (vnp = rescuedcontigs; vnp != NULL; vnp = vnp->next) {
3225       if (vnp != rescuedcontigs) {
3226         StringCat (pstring, ",");
3227       }
3228       StringCat (pstring, (CharPtr) vnp->data.ptrvalue);
3229     }
3230   }
3231 
3232    rescuedcontigs = ValNodeFreeData (rescuedcontigs);
3233 
3234   if (sp6_clonep != NULL && *sp6_clonep != NULL) {
3235     sp6_end = StringChr (*sp6_clonep, ',');
3236     if (sp6_end != NULL) {
3237       *sp6_end = '\0';
3238       sp6_end++;
3239       if (StringICmp (sp6_end, "left") == 0) {
3240         sp6_end = "left";
3241       } else if (StringICmp (sp6_end, "right") == 0) {
3242         sp6_end = "right";
3243       } else {
3244         sp6_end = NULL;
3245       }
3246     }
3247     if (sp6_endp != NULL) {
3248       *sp6_endp = sp6_end;
3249     }
3250   }
3251   if (t7_clonep != NULL && *t7_clonep != NULL) {
3252     t7_end = StringChr (*t7_clonep, ',');
3253     if (t7_end != NULL) {
3254       *t7_end = '\0';
3255       t7_end++;
3256       if (StringICmp (t7_end, "left") == 0) {
3257         t7_end = "left";
3258       } else if (StringICmp (t7_end, "right") == 0) {
3259         t7_end = "right";
3260       } else {
3261         t7_end = NULL;
3262       }
3263     }
3264     if (t7_endp != NULL) {
3265       *t7_endp = t7_end;
3266     }
3267   }
3268 
3269   return pstring;
3270 }
3271 
3272 static void MakeAssemblyFragments (
3273   BioseqPtr bsp,
3274   CharPtr name,
3275   Int2 index,
3276   CharPtr sp6_clone,
3277   CharPtr sp6_end,
3278   CharPtr t7_clone,
3279   CharPtr t7_end,
3280   Uint1 frag
3281 )
3282 
3283 {
3284   DeltaSeqPtr  dsp = NULL;
3285   Int4         from, to;
3286   ImpFeatPtr   ifp;
3287   SeqLitPtr    litp;
3288   SeqFeatPtr   sfp;
3289   SeqInt       sint;
3290   Char         str [128];
3291   Char         tmp [32];
3292   ValNode      vn;
3293 
3294   if (bsp == NULL || name == NULL || index < 1) return;
3295   from = 0;
3296   to = 0;
3297   if (bsp->repr == Seq_repr_delta && bsp->seq_ext_type == 4) {
3298     for (dsp = (DeltaSeqPtr) (bsp->seq_ext);
3299          dsp != NULL && index > 1; dsp = dsp->next, index--) {
3300       if (dsp->choice == 1) {
3301         from += SeqLocLen ((SeqLocPtr) dsp->data.ptrvalue);
3302       } else if (dsp->choice == 2) {
3303         litp = (SeqLitPtr) dsp->data.ptrvalue;
3304         if (litp != NULL) {
3305           from += litp->length;
3306         }
3307       }
3308     }
3309   }
3310   if (dsp != NULL && dsp->choice == 2) {
3311     litp = (SeqLitPtr) dsp->data.ptrvalue;
3312     if (litp != NULL) {
3313       to = litp->length + from - 1;
3314     }
3315   }
3316   MemSet ((Pointer) &vn, 0, sizeof (ValNode));
3317   vn.choice = SEQLOC_INT;
3318   vn.data.ptrvalue = &sint;
3319 
3320   MemSet ((Pointer) &sint, 0, sizeof (SeqInt));
3321   sint.id = SeqIdDup (SeqIdFindBest (bsp->id, 0));
3322 
3323   sint.from = from;
3324   sint.to = to;
3325   sint.strand = Seq_strand_plus;
3326 
3327   ifp = ImpFeatNew ();
3328   if (ifp == NULL) return;
3329   sfp = CreateNewFeatureOnBioseq (bsp, SEQFEAT_IMP, &vn);
3330   if (sfp == NULL) return;
3331   sfp->data.value.ptrvalue = (Pointer) ifp;
3332   ifp->key = StringSave ("misc_feature");
3333 
3334   sprintf (str, "assembly_name:%s", name);
3335   if (frag > 0) {
3336     sprintf (tmp, "~fragment_group:%d", (int) frag);
3337     StringCat (str, tmp);
3338   }
3339   if (StringICmp (name, sp6_clone) == 0) {
3340     StringCat (str, "~clone_end:SP6");
3341     if (sp6_end != NULL) {
3342       StringCat (str, "~vector_side:");
3343       StringCat (str, sp6_end);
3344     }
3345   } else if (StringICmp (name, t7_clone) == 0) {
3346     StringCat (str, "~clone_end:T7");
3347     if (t7_end != NULL) {
3348       StringCat (str, "~vector_side:");
3349       StringCat (str, t7_end);
3350     }
3351   }
3352   sfp->comment = StringSaveNoNull (str);
3353 }
3354 
3355 static Uint2 ProcessPhrapAce (
3356   FILE* fp,
3357   BioSourcePtr src,
3358   TblArgsPtr tbl,
3359   CharPtr localname,
3360   SeqEntryPtr gsep,
3361   MolInfoPtr template_molinfo,
3362   CharPtr directory,
3363   CharPtr base
3364 )
3365 
3366 {
3367   BioseqPtr      bsp, deltabsp;
3368   BioseqSetPtr   bssp;
3369   CharPtr        contigs;
3370   Boolean        do_contig = FALSE;
3371   Char           dumsp6 [64], dumt7 [64];
3372   Uint2          entityID;
3373   SeqEntryPtr    firstsep, nextsep, sep, topsep;
3374   Uint1          frag;
3375   IntFuzzPtr     ifp;
3376   Int2           index = 0;
3377   Boolean        is_unk100, lastwasraw;
3378   ObjMgrDataPtr  omdptop;
3379   ObjMgrData     omdata;
3380   Uint2          parenttype;
3381   Pointer        parentptr;
3382   ResqContigPtr  rcp;
3383   ResqSeqgphPtr  rsp;
3384   CharPtr        seqbuf;
3385   SeqIdPtr       sip;
3386   SeqLitPtr      slp;
3387   CharPtr        sp6_clone = NULL, t7_clone = NULL, sp6_end = NULL, t7_end = NULL;
3388   ValNodePtr     rescuedcontigs = NULL, rescuedsgps = NULL, fragmentgroups = NULL, vnp, vnp2;
3389 
3390   if (fp == NULL) return 0;
3391 
3392   firstsep = ReadPhrapFile (fp);
3393   if (firstsep == NULL) return 0;
3394 
3395   dumsp6 [0] = '\0';
3396   dumt7 [0] = '\0';
3397   contigs = ReadContigFile (directory, base, &fragmentgroups, dumsp6,
3398                             dumt7, &sp6_clone, &sp6_end, &t7_clone, &t7_end);
3399   firstsep = SetPhrapContigOrder (firstsep, contigs);
3400   if (firstsep == NULL) return 0;
3401   if (contigs != NULL) {
3402     do_contig = TRUE;
3403   }
3404 
3405   /* always make delta, even if one component */
3406 
3407   bsp = FindNucBioseq (firstsep);
3408   if (bsp == NULL) return 0;
3409 
3410   sip = SeqIdSetDup (bsp->id);
3411   vnp = ValNodeExtract (&(bsp->descr), Seq_descr_title);
3412 
3413   deltabsp = BioseqNew ();
3414   if (deltabsp == NULL) return 0;
3415   deltabsp->repr = Seq_repr_delta;
3416   deltabsp->seq_ext_type = 4;
3417   deltabsp->mol = Seq_mol_dna;
3418   deltabsp->length = 0;
3419 
3420   topsep = SeqEntryNew ();
3421   if (topsep == NULL) return 0;
3422   topsep->choice = 1;
3423   topsep->data.ptrvalue = (Pointer) deltabsp;
3424 
3425   SeqMgrSeqEntry (SM_BIOSEQ, (Pointer) deltabsp, topsep);
3426 
3427   if (gsep != NULL) {
3428     bssp = (BioseqSetPtr) gsep->data.ptrvalue;
3429     if (bssp == NULL) return 0;
3430 
3431     SaveSeqEntryObjMgrData (gsep, &omdptop, &omdata);
3432     GetSeqEntryParent (gsep, &parentptr, &parenttype);
3433 
3434     bssp->seq_set = topsep;
3435     SeqMgrSeqEntry (SM_BIOSEQ, (Pointer) deltabsp, gsep);
3436 
3437     SeqMgrLinkSeqEntry (gsep, parenttype, parentptr);
3438     RestoreSeqEntryObjMgrData (gsep, omdptop, &omdata);
3439 
3440     entityID = ObjMgrRegister (OBJ_BIOSEQSET, (Pointer) bssp);
3441   } else {
3442     entityID = ObjMgrRegister (OBJ_BIOSEQ, (Pointer) deltabsp);
3443   }
3444 
3445   lastwasraw = FALSE;
3446   for (sep = firstsep; sep != NULL; sep = nextsep) {
3447     nextsep = sep->next;
3448     sep->next = NULL;
3449 
3450     bsp = (BioseqPtr) sep->data.ptrvalue;
3451     if (bsp == NULL) continue;
3452 
3453     if (bsp->repr == Seq_repr_raw) {
3454 
3455       if (lastwasraw) {
3456         slp = (SeqLitPtr) MemNew (sizeof (SeqLit));
3457         if (slp == NULL) break;
3458 
3459         slp->length = 100 ;
3460         is_unk100 = TRUE;
3461 
3462         if (slp->length < 1 || is_unk100) {
3463           if (slp->length < 1) {
3464             slp->length = 0;
3465           }
3466           ifp = IntFuzzNew ();
3467           ifp->choice = 4;
3468           slp->fuzz = ifp;
3469         }
3470 
3471         ValNodeAddPointer ((ValNodePtr PNTR) &(deltabsp->seq_ext), (Int2) 2, (Pointer) slp);
3472 
3473         deltabsp->length += slp->length;
3474         index++;
3475       }
3476 
3477       BioseqRawConvert (bsp, Seq_code_iupacna);
3478       seqbuf = BSMerge ((ByteStorePtr) bsp->seq_data, NULL);
3479       slp = (SeqLitPtr) MemNew (sizeof (SeqLit));
3480       if (slp == NULL) continue;
3481 
3482       slp->length = bsp->length;
3483       ValNodeAddPointer ((ValNodePtr PNTR) &(deltabsp->seq_ext), (Int2) 2, (Pointer) slp);
3484       slp->seq_data = (SeqDataPtr) BSNew (slp->length);
3485       slp->seq_data_type = Seq_code_iupacna;
3486       AddBasesToByteStore ((ByteStorePtr) slp->seq_data, seqbuf);
3487       MemFree (seqbuf);
3488       lastwasraw = TRUE;
3489 
3490       deltabsp->length += slp->length;
3491       index++;
3492 
3493       RescueSeqGraphs (bsp, index, &rescuedsgps);
3494       if (do_contig) {
3495         rcp = (ResqContigPtr) MemNew (sizeof (ResqContig));
3496         if (rcp != NULL) {
3497           rcp->index = index;
3498           StringNCpy_0 (rcp->str, BioseqGetLocalIdStr (bsp), sizeof (rcp->str));
3499           ValNodeAddPointer (&rescuedcontigs, 0, (Pointer) rcp);
3500         }
3501       }
3502     }
3503 
3504     SeqEntryFree (sep);
3505   }
3506 
3507   ValNodeLink (&(deltabsp->descr), vnp);
3508   deltabsp->id = sip;
3509 
3510   if (deltabsp != NULL) {
3511     for (vnp = rescuedsgps; vnp != NULL; vnp = vnp->next) {
3512       rsp = (ResqSeqgphPtr) vnp->data.ptrvalue;
3513       if (rsp != NULL) {
3514         OffsetAndLinkSeqGraph (deltabsp, rsp->sgp, (Int2) rsp->index);
3515       }
3516     }
3517     for (vnp = rescuedcontigs; vnp != NULL; vnp = vnp->next) {
3518       rcp = (ResqContigPtr) vnp->data.ptrvalue;
3519       if (rcp != NULL) {
3520         frag = 0;
3521         for (vnp2 = fragmentgroups; vnp2 != NULL; vnp2 = vnp2->next) {
3522           if (StringICmp ((CharPtr) vnp2->data.ptrvalue, rcp->str) == 0) {
3523             frag = (Uint1) vnp2->choice;
3524           }
3525         }
3526         MakeAssemblyFragments (deltabsp, rcp->str, (Int2) rcp->index,
3527                                sp6_clone, sp6_end, t7_clone, t7_end, frag);
3528       }
3529     }
3530   }
3531   rescuedsgps = ValNodeFreeData (rescuedsgps);
3532   rescuedcontigs = ValNodeFreeData (rescuedcontigs);
3533 
3534 
3535   if (gsep == NULL) {
3536     SeqMgrLinkSeqEntry (topsep, 0, NULL);
3537   }
3538 
3539   if (StringDoesHaveText (localname)) {
3540     sip = MakeSeqID (localname);
3541     if (sip != NULL) {
3542       bsp->id = SeqIdSetFree (bsp->id);
3543       bsp->id = sip;
3544       SeqMgrReplaceInBioseqIndex (bsp);
3545       VisitFeaturesOnBsp (bsp, (Pointer) bsp->id, CorrectFeatureSeqIds);
3546     }
3547   }
3548 
3549   ProcessOneNuc (entityID, deltabsp, src, tbl, template_molinfo);
3550 
3551   return entityID;
3552 }
3553 
3554 static Uint2 ProcessBulkSet (
3555   FILE* fp,
3556   BioSourcePtr src,
3557   TblArgsPtr tbl,
3558   MolInfoPtr template_molinfo
3559 )
3560 
3561 {
3562   BioseqPtr     bsp;
3563   BioseqSetPtr  bssp;
3564   Uint2         entityID;
3565   SeqEntryPtr   lastsep, sep, topsep;
3566   /*
3567   Pointer       dataptr;
3568   Uint2         datatype;
3569   */
3570 
3571   if (fp == NULL || tbl == NULL) return 0;
3572 
3573   bssp = BioseqSetNew ();
3574   if (bssp == NULL) return 0;
3575 
3576   switch (tbl->whichclass) {
3577     case 1 :
3578       bssp->_class = BioseqseqSet_class_pop_set;
3579       break;
3580     case 2 :
3581       bssp->_class = BioseqseqSet_class_phy_set;
3582       break;
3583     case 3 :
3584       bssp->_class = BioseqseqSet_class_mut_set;
3585       break;
3586     case 4 :
3587       bssp->_class = BioseqseqSet_class_eco_set;
3588       break;
3589     default :
3590       bssp->_class = BioseqseqSet_class_genbank;
3591       break;
3592   }
3593 
3594   topsep = SeqEntryNew ();
3595   if (topsep == NULL) return 0;
3596   topsep->choice = 2;
3597   topsep->data.ptrvalue = (Pointer) bssp;
3598 
3599   entityID = ObjMgrRegister (OBJ_BIOSEQSET, (Pointer) bssp);
3600 
3601   lastsep = NULL;
3602 
3603 /*
3604   while ((dataptr = ReadAsnFastaOrFlatFile (fp, &datatype, NULL, TRUE, FALSE, TRUE, FALSE)) != NULL) {
3605     if (datatype == OBJ_BIOSEQ) {
3606 
3607       sep = SeqMgrGetSeqEntryForData (dataptr);
3608       if (lastsep == NULL) {
3609         bssp->seq_set = sep;
3610       } else {
3611         lastsep->next = sep;
3612       }
3613       lastsep = sep;
3614 
3615       bsp = (BioseqPtr) dataptr;
3616       ProcessOneNuc (entityID, bsp, src, tbl);
3617 
3618     } else {
3619       ObjMgrFree (datatype, dataptr);
3620     }
3621   }
3622 */
3623 
3624   while ((bsp = ReadDeltaFasta (fp, NULL)) != NULL) {
3625 
3626     sep = SeqMgrGetSeqEntryForData (bsp);
3627     if (lastsep == NULL) {
3628       bssp->seq_set = sep;
3629     } else {
3630       lastsep->next = sep;
3631     }
3632     lastsep = sep;
3633 
3634     ProcessOneNuc (entityID, bsp, src, tbl, template_molinfo);
3635   }
3636 
3637   SeqMgrLinkSeqEntry (topsep, 0, NULL);
3638 
3639   return entityID;
3640 }
3641 
3642 static SeqEntryPtr FA2SEP (
3643   FILE *fp
3644 )
3645 
3646 {
3647   BioseqPtr    bsp;
3648   Pointer      dataptr;
3649   Uint2        datatype;
3650   SeqEntryPtr  sep;
3651 
3652   if (fp == NULL) return NULL;
3653 
3654   dataptr = ReadAsnFastaOrFlatFile (fp, &datatype, NULL, TRUE, FALSE, TRUE, FALSE);
3655   if (datatype == OBJ_BIOSEQ) {
3656     sep = SeqMgrGetSeqEntryForData (dataptr);
3657     if (sep == NULL) {
3658       sep = SeqEntryNew ();
3659       if (sep != NULL) {
3660         bsp = (BioseqPtr) dataptr;
3661         sep->choice = 1;
3662         sep->data.ptrvalue = bsp;
3663         SeqMgrSeqEntry (SM_BIOSEQ, (Pointer) bsp, sep);
3664       }
3665     }
3666     return sep;
3667   }
3668 
3669   return NULL;
3670 }
3671 
3672 static SeqEntryPtr MakeUnk100GapSep (void)
3673 
3674 {
3675   BioseqPtr    bsp;
3676   SeqEntryPtr  sep;
3677 
3678   sep = SeqEntryNew ();
3679   if (sep == NULL) return NULL;
3680   bsp = BioseqNew ();
3681   if (bsp == NULL) return NULL;
3682   bsp->repr = Seq_repr_virtual;
3683   bsp->mol = Seq_mol_na;
3684   bsp->length = 100;
3685   bsp->id = SeqIdParse ("lcl|unk100");
3686   sep->choice = 1;
3687   sep->data.ptrvalue = bsp;
3688   SeqMgrSeqEntry (SM_BIOSEQ, (Pointer) bsp, sep);
3689   return sep;
3690 }
3691 
3692 static Uint2 ProcessDeltaSet (
3693   FILE* fp,
3694   BioSourcePtr src,
3695   TblArgsPtr tbl,
3696   CharPtr localname,
3697   SeqEntryPtr gsep,
3698   MolInfoPtr template_molinfo
3699 )
3700 
3701 {
3702   BioseqPtr      bsp, deltabsp;
3703   BioseqSetPtr   bssp;
3704   Uint2          entityID;
3705   SeqEntryPtr    firstsep, lastsep, nextsep, sep, tmp, topsep;
3706   IntFuzzPtr     ifp;
3707   Boolean        is_unk100;
3708   ObjectIdPtr    oip;
3709   ObjMgrDataPtr  omdptop;
3710   ObjMgrData     omdata;
3711   Uint2          parenttype;
3712   Pointer        parentptr;
3713   CharPtr        seqbuf;
3714   SeqIdPtr       sip, virtid;
3715   SeqLitPtr      slp;
3716   ValNodePtr     vnp;
3717 
3718   if (fp == NULL) return 0;
3719 
3720   firstsep = NULL;
3721   lastsep = NULL;
3722 
3723   /*
3724   sep = FastaToSeqEntry (fp, TRUE);
3725   */
3726   sep = FA2SEP (fp);
3727   if (sep == NULL) return 0;
3728 
3729   /* loop to collect subsequent entries */
3730 
3731   while (sep != NULL) {
3732     if (firstsep == NULL) {
3733       firstsep = sep;
3734     }
3735     if (tbl->implicitgaps && lastsep != NULL) {
3736       tmp = MakeUnk100GapSep ();
3737       if (tmp != NULL) {
3738         ValNodeLink (&lastsep, tmp);
3739         lastsep = tmp;
3740       }
3741     }
3742     if (lastsep != NULL) {
3743       ValNodeLink (&lastsep, sep);
3744     }
3745     lastsep = sep;
3746     /*
3747     sep = FastaToSeqEntry (fp, TRUE);
3748     */
3749     sep = FA2SEP (fp);
3750   }
3751 
3752   /* if only one FASTA, treat as raw */
3753 
3754   if (firstsep->next == NULL) {
3755     bsp = FindNucBioseq (firstsep);
3756     if (bsp == NULL) return 0;
3757 
3758     if (gsep != NULL) {
3759       bssp = (BioseqSetPtr) gsep->data.ptrvalue;
3760       if (bssp == NULL) return 0;
3761 
3762       SaveSeqEntryObjMgrData (gsep, &omdptop, &omdata);
3763       GetSeqEntryParent (gsep, &parentptr, &parenttype);
3764 
3765       bssp->seq_set = SeqMgrGetSeqEntryForData (bsp);
3766       SeqMgrSeqEntry (SM_BIOSEQ, (Pointer) bsp, gsep);
3767 
3768       SeqMgrLinkSeqEntry (gsep, parenttype, parentptr);
3769       RestoreSeqEntryObjMgrData (gsep, omdptop, &omdata);
3770 
3771       entityID = ObjMgrRegister (OBJ_BIOSEQSET, (Pointer) bssp);
3772     } else {
3773       entityID = ObjMgrRegister (OBJ_BIOSEQ, (Pointer) bsp);
3774     }
3775 
3776     ProcessOneNuc (entityID, bsp, src, tbl, template_molinfo);
3777     return entityID;
3778   }
3779 
3780   /* now process delta */
3781 
3782   bsp = FindNucBioseq (firstsep);
3783   if (bsp == NULL) return 0;
3784 
3785   sip = SeqIdSetDup (bsp->id);
3786   vnp = ValNodeExtract (&(bsp->descr), Seq_descr_title);
3787 
3788   deltabsp = BioseqNew ();
3789   if (deltabsp == NULL) return 0;
3790   deltabsp->repr = Seq_repr_delta;
3791   deltabsp->seq_ext_type = 4;
3792   deltabsp->mol = Seq_mol_dna;
3793   deltabsp->length = 0;
3794 
3795   topsep = SeqEntryNew ();
3796   if (topsep == NULL) return 0;
3797   topsep->choice = 1;
3798   topsep->data.ptrvalue = (Pointer) deltabsp;
3799 
3800   SeqMgrSeqEntry (SM_BIOSEQ, (Pointer) deltabsp, topsep);
3801 
3802   if (gsep != NULL) {
3803     bssp = (BioseqSetPtr) gsep->data.ptrvalue;
3804     if (bssp == NULL) return 0;
3805 
3806     SaveSeqEntryObjMgrData (gsep, &omdptop, &omdata);
3807     GetSeqEntryParent (gsep, &parentptr, &parenttype);
3808 
3809     bssp->seq_set = topsep;
3810     SeqMgrSeqEntry (SM_BIOSEQ, (Pointer) deltabsp, gsep);
3811 
3812     SeqMgrLinkSeqEntry (gsep, parenttype, parentptr);
3813     RestoreSeqEntryObjMgrData (gsep, omdptop, &omdata);
3814 
3815     entityID = ObjMgrRegister (OBJ_BIOSEQSET, (Pointer) bssp);
3816   } else {
3817     entityID = ObjMgrRegister (OBJ_BIOSEQ, (Pointer) deltabsp);
3818   }
3819 
3820   for (sep = firstsep; sep != NULL; sep = nextsep) {
3821     nextsep = sep->next;
3822     sep->next = NULL;
3823 
3824     bsp = (BioseqPtr) sep->data.ptrvalue;
3825     if (bsp == NULL) continue;
3826 
3827     if (bsp->repr == Seq_repr_raw) {
3828       BioseqRawConvert (bsp, Seq_code_iupacna);
3829       seqbuf = BSMerge ((ByteStorePtr) bsp->seq_data, NULL);
3830       slp = (SeqLitPtr) MemNew (sizeof (SeqLit));
3831       if (slp == NULL) continue;
3832 
3833       slp->length = bsp->length;
3834       ValNodeAddPointer ((ValNodePtr PNTR) &(deltabsp->seq_ext), (Int2) 2, (Pointer) slp);
3835       slp->seq_data = (SeqDataPtr) BSNew (slp->length);
3836       slp->seq_data_type = Seq_code_iupacna;
3837       AddBasesToByteStore ((ByteStorePtr) slp->seq_data, seqbuf);
3838       MemFree(seqbuf);
3839 
3840       deltabsp->length += slp->length;
3841 
3842     } else if (bsp->repr == Seq_repr_virtual) {
3843       slp = (SeqLitPtr) MemNew (sizeof (SeqLit));
3844       if (slp == NULL) continue;
3845 
3846       slp->length = bsp->length;
3847       ValNodeAddPointer ((ValNodePtr PNTR) &(deltabsp->seq_ext), (Int2) 2, (Pointer) slp);
3848 
3849       is_unk100 = FALSE;
3850       virtid = bsp->id;
3851       if (virtid != NULL && virtid->choice == SEQID_LOCAL) {
3852         oip = (ObjectIdPtr) virtid->data.ptrvalue;
3853         if (oip != NULL) {
3854           if (StringCmp (oip->str, "unk100") == 0) {
3855             is_unk100 = TRUE;
3856           }
3857         }
3858       }
3859       if (slp->length < 1 || is_unk100) {
3860         if (slp->length < 1) {
3861           slp->length = 0;
3862         }
3863         ifp = IntFuzzNew ();
3864         ifp->choice = 4;
3865         slp->fuzz = ifp;
3866       }
3867 
3868       deltabsp->length += slp->length;
3869     }
3870 
3871     SeqEntryFree (sep);
3872   }
3873 
3874   ValNodeLink (&(deltabsp->descr), vnp);
3875   deltabsp->id = sip;
3876 
3877   if (gsep == NULL) {
3878     SeqMgrLinkSeqEntry (topsep, 0, NULL);
3879   }
3880 
3881   if (StringDoesHaveText (localname)) {
3882     sip = MakeSeqID (localname);
3883     if (sip != NULL) {
3884       bsp->id = SeqIdSetFree (bsp->id);
3885       bsp->id = sip;
3886       SeqMgrReplaceInBioseqIndex (bsp);
3887       VisitFeaturesOnBsp (bsp, (Pointer) bsp->id, CorrectFeatureSeqIds);
3888     }
3889   }
3890 
3891   ProcessOneNuc (entityID, deltabsp, src, tbl, template_molinfo);
3892 
3893   return entityID;
3894 }
3895 
3896 static Boolean DoSequenceLengthsMatch (
3897   TAlignmentFilePtr afp
3898 )
3899 
3900 {
3901   int    seq_index;
3902   Int4   seq_len;
3903 
3904   if (afp == NULL || afp->sequences == NULL || afp->num_sequences == 0) {
3905     return TRUE;
3906   }
3907   seq_len = StringLen (afp->sequences[0]);
3908   for (seq_index = 1; seq_index < afp->num_sequences; seq_index++) {
3909     if (StringLen (afp->sequences[seq_index]) != seq_len) {
3910       return FALSE;
3911     }
3912   }
3913   return TRUE;
3914 }
3915 
3916 static void ShowAlignmentNotes (
3917   TAlignmentFilePtr afp,
3918   TErrorInfoPtr error_list
3919 )
3920 
3921 {
3922   TErrorInfoPtr eip;
3923   Int4         index;
3924 
3925   for (eip = error_list; eip != NULL; eip = eip->next) {
3926     printf ("*****\nError category %d\n", eip->category);
3927     if (eip->line_num > -1) {
3928       printf ("Line number %d\n", eip->line_num);
3929     }
3930     if (eip->id != NULL) {
3931       printf ("Sequence ID %s\n", eip->id);
3932     }
3933     if (eip->message != NULL) {
3934       printf ("%s\n", eip->message);
3935     }
3936   }
3937   if (afp == NULL) {
3938     printf ("Catastrophic failure during reading\n");
3939   } else {
3940     printf ("Found %d sequences\n", afp->num_sequences);
3941     printf ("Found %d organisms\n", afp->num_organisms);
3942     for (index = 0; index < afp->num_sequences; index++)
3943     {
3944       printf ("\t%s\t", afp->ids [index]);
3945       if (index < afp->num_organisms) {
3946         printf ("%s\n", afp->organisms [index]);
3947       } else {
3948         printf ("No organism information\n");
3949       }
3950     }
3951     while (index < afp->num_organisms) {
3952       printf ("Unclaimed organism: %s\n", afp->organisms [index]);
3953       index++;
3954     }
3955   }
3956 }
3957 
3958 static Uint2 ProcessAlignSet (
3959   FILE *fp,
3960   BioSourcePtr src,
3961   TblArgsPtr tbl,
3962   MolInfoPtr template_molinfo
3963 )
3964 
3965 {
3966   TSequenceInfoPtr  sequence_info;
3967   TErrorInfoPtr     error_list;
3968   ReadBufferData    rbd;
3969   TAlignmentFilePtr afp;
3970   SeqEntryPtr       sep = NULL;
3971   BioseqPtr         bsp;
3972   BioseqSetPtr      bssp;
3973   Char              ch;
3974   Uint2             entityID;
3975   SeqEntryPtr       tmp;
3976   Char              nucleotide_alphabet[] = "ABCDGHKMRSTUVWXYabcdghkmrstuvwxy";
3977   Char              protein_alphabet[] = "ABCDEFGHIKLMPQRSTUVWXYZabcdefghiklmpqrstuvwxyz";
3978   Uint1             moltype = Seq_mol_dna;
3979 
3980   if (fp == NULL) return 0;
3981 
3982   sequence_info = SequenceInfoNew ();
3983   if (sequence_info == NULL) return 0;
3984 
3985   /* format sequence options based on commandline arguments */
3986   /* set sequence alphabet */
3987   if (tbl->aln_is_protein) {
3988     moltype = Seq_mol_aa;
3989     sequence_info->alphabet = protein_alphabet;
3990   } else {
3991     moltype = Seq_mol_dna;
3992     sequence_info->alphabet = nucleotide_alphabet;
3993   }
3994 
3995   sequence_info->beginning_gap = MemFree (sequence_info->beginning_gap);
3996   if (StringHasNoText (tbl->aln_beginning_gap)) {
3997     sequence_info->beginning_gap = StringSave (".-?");
3998   } else {
3999     sequence_info->beginning_gap = StringSave (tbl->aln_beginning_gap);
4000   }
4001   sequence_info->middle_gap = MemFree (sequence_info->middle_gap);
4002   if (StringHasNoText (tbl->aln_middle_gap)) {
4003     sequence_info->middle_gap = StringSave ("-");
4004   } else {
4005     sequence_info->middle_gap = StringSave (tbl->aln_middle_gap);
4006   }
4007   sequence_info->end_gap = MemFree (sequence_info->end_gap);
4008   if (StringHasNoText (tbl->aln_end_gap)) {
4009     sequence_info->end_gap = StringSave (".-?");
4010   } else {
4011     sequence_info->end_gap = StringSave (tbl->aln_end_gap);
4012   }
4013   sequence_info->missing = MemFree (sequence_info->missing);
4014   if (StringHasNoText (tbl->aln_missing)) {
4015     sequence_info->missing = StringSave ("Nn?");
4016   } else {
4017     sequence_info->missing = StringSave (tbl->aln_missing);
4018   }
4019   sequence_info->match = MemFree (sequence_info->match);
4020   if (StringHasNoText (tbl->aln_match)) {
4021     sequence_info->match = StringSave (".");
4022   } else {
4023     sequence_info->match = StringSave (tbl->aln_match);
4024   }
4025 
4026   error_list = NULL;
4027   rbd.fp = fp;
4028   rbd.current_data = NULL;
4029   afp = ReadAlignmentFile ( AbstractReadFunction,
4030                             (Pointer) &rbd,
4031                             AbstractReportError,
4032                             (Pointer) &error_list,
4033                             sequence_info);
4034 
4035   ShowAlignmentNotes (afp, error_list);
4036   ErrorInfoFree (error_list);
4037   if (afp != NULL) {
4038     if (afp->num_organisms == 0 && src == NULL) {
4039       printf ("No organisms supplied!\n");
4040     } else if (afp->num_organisms != 0 && afp->num_organisms != afp->num_sequences) {
4041       printf ( "Number of organisms must match number of sequences!");
4042     } else {
4043       ch = 'y';
4044       if (! DoSequenceLengthsMatch (afp)) {
4045         printf ("Sequences are not all the same length - are you sure you want to continue?");
4046         ch = getchar ();
4047       }
4048       if (ch == 'y' || ch == 'Y') {
4049         sep = MakeSequinDataFromAlignment (afp, moltype);
4050       }
4051     }
4052   }
4053   SequenceInfoFree (sequence_info);
4054 
4055   AlignmentFileFree (afp);
4056 
4057   if (sep == NULL || sep->data.ptrvalue == NULL) return 0;
4058 
4059   if (IS_Bioseq (sep)) {
4060     bsp = (BioseqPtr) sep->data.ptrvalue;
4061       entityID = ObjMgrRegister (OBJ_BIOSEQ, (Pointer) bsp);
4062       ProcessOneNuc (entityID, bsp, src, tbl, template_molinfo);
4063   } else if (IS_Bioseq_set (sep)) {
4064     bssp = (BioseqSetPtr) sep->data.ptrvalue;
4065     bssp->_class = BioseqseqSet_class_phy_set;
4066     entityID = ObjMgrRegister (OBJ_BIOSEQSET, (Pointer) bssp);
4067     for (tmp = bssp->seq_set; tmp != NULL; tmp = tmp->next) {
4068       if (IS_Bioseq (tmp)) {
4069         bsp = (BioseqPtr) tmp->data.ptrvalue;
4070         ProcessOneNuc (entityID, bsp, src, tbl, template_molinfo);
4071       }
4072     }
4073   } else return 0;
4074 
4075   SeqMgrLinkSeqEntry (sep, 0, NULL);
4076 
4077   return entityID;
4078 }
4079 
4080 static SeqAnnotPtr NewGraphSeqAnnot (
4081   CharPtr name,
4082   SeqGraphPtr sgp
4083 )
4084 
4085 {
4086   SeqAnnotPtr  sap = NULL;
4087 
4088   if (sgp == NULL) return NULL;
4089   sap = SeqAnnotNew ();
4090   if (sap == NULL) return NULL;
4091 
4092   if (StringDoesHaveText (name)) {
4093     SeqDescrAddPointer (&(sap->desc), Annot_descr_name, StringSave (name));
4094   }
4095   sap->type = 3;
4096   sap->data = (Pointer) sgp;
4097 
4098   return sap;
4099 }
4100 
4101 typedef struct npsseqs {
4102   BioseqPtr  nuc;
4103   BioseqPtr  prot;
4104 } NpsSeqs, PNTR NpsSeqsPtr;
4105 
4106 static void FindNucProtSeqs (
4107   BioseqPtr bsp,
4108   Pointer userdata
4109 )
4110 
4111 {
4112   NpsSeqsPtr  nsp;
4113 
4114   if (bsp == NULL) return;
4115   nsp = (NpsSeqsPtr) userdata;
4116   if (nsp == NULL) return;
4117 
4118   if (ISA_na (bsp->mol)) {
4119     nsp->nuc = bsp;
4120   } else if (ISA_aa (bsp->mol)) {
4121     nsp->prot = bsp;
4122   }
4123 }
4124 
4125 static Boolean InRightNps (
4126   CharPtr gbqval,
4127   SeqIdPtr protids,
4128   Boolean force_local_id
4129 )
4130 
4131 {
4132   Int2      adv;
4133   Char      id [64];
4134   Char      lcl [64];
4135   SeqIdPtr  sip = NULL;
4136   CharPtr   ptr;
4137   Boolean   rsult;
4138   long int  val;
4139   Uint4     version = 0;
4140 
4141   StringNCpy_0 (id, gbqval, sizeof (id));
4142   if (StringDoesHaveText (id)) {
4143     if (StringChr (id, '|') != NULL) {
4144       sip = SeqIdParse (id);
4145     } else if (force_local_id) {
4146       sprintf (lcl, "lcl|%s", id);
4147       sip = SeqIdParse (lcl);
4148     } else {
4149       adv = ValidateAccnDotVer (id);
4150       if (adv == 0 || adv == -5) {
4151         ptr = StringChr (id, '.');
4152         if (ptr != NULL) {
4153           *ptr = '\0';
4154           ptr++;
4155           if (sscanf (ptr, "%ld", &val) == 1) {
4156             version = (Uint4) val;
4157           }
4158         }
4159         sip = SeqIdFromAccession (id, version, NULL);
4160       } else {
4161         sprintf (lcl, "lcl|%s", id);
4162         sip = SeqIdParse (lcl);
4163       }
4164     }
4165   }
4166   if (sip == NULL) return FALSE;
4167   rsult = SeqIdIn (sip, protids);
4168   SeqIdFree (sip);
4169   return rsult;
4170 }
4171 
4172 static void MakeNucProtCDS (
4173   BioseqSetPtr bssp,
4174   Pointer userdata
4175 )
4176 
4177 {
4178   CodeBreakPtr    cbp;
4179   SeqFeatPtr      cds;
4180   CdRegionPtr     crp;
4181   GBQualPtr       gbq;
4182   Char            id [64];
4183   SeqFeatPtr      mrna;
4184   GBQualPtr       nextqual;
4185   NpsSeqs         ns;
4186   Boolean         partial5, partial3;
4187   GBQualPtr PNTR  prevqual;
4188   SeqFeatPtr      sfp;
4189   SeqIdPtr        sip;
4190   SeqLocPtr       slp;
4191   Int4            start, stop;
4192   TblArgsPtr      tbl;
4193   SeqFeatPtr      temp;
4194 
4195   tbl = (TblArgsPtr) userdata;
4196   if (tbl == NULL) return;
4197 
4198   ns.nuc = NULL;
4199   ns.prot = NULL;
4200   if (VisitBioseqsInSet (bssp, (Pointer) &ns, FindNucProtSeqs) != 2) return;
4201   if (ns.nuc == NULL || ns.prot == NULL) return;
4202 
4203   cds = SeqMgrGetCDSgivenProduct (ns.prot, NULL);
4204   mrna = SeqMgrGetRNAgivenProduct (ns.nuc, NULL);
4205   if (cds == NULL || mrna == NULL) return;
4206 
4207   CheckSeqLocForPartial (cds->location, &partial5, &partial3);
4208 
4209   start = GetOffsetInLoc (cds->location, mrna->location, SEQLOC_START);
4210   stop = GetOffsetInLoc (cds->location, mrna->location, SEQLOC_STOP);
4211 
4212   if (start < 0 || start >= ns.nuc->length ||
4213       stop < 0 || stop >= ns.nuc->length) return;
4214 
4215   sip = SeqIdFindBest (ns.nuc->id, 0);
4216   if (sip == NULL) return;
4217 
4218   /* copy cds feature fields to paste into new cds feature */
4219   temp = AsnIoMemCopy (cds,
4220                        (AsnReadFunc) SeqFeatAsnRead,
4221                        (AsnWriteFunc) SeqFeatAsnWrite);
4222   if (temp == NULL) return;
4223 
4224   sfp = CreateNewFeatureOnBioseq (ns.nuc, SEQFEAT_CDREGION, NULL);
4225   if (sfp == NULL) return;
4226 
4227   sfp->location = SeqLocFree (sfp->location);
4228   if (StringISearch (cds->except_text, "ribosomal slippage") == NULL &&
4229       StringISearch (cds->except_text, "ribosome slippage") == NULL &&
4230       StringISearch (cds->except_text, "trans splicing") == NULL &&
4231       StringISearch (cds->except_text, "trans-splicing") == NULL &&
4232       StringISearch (cds->except_text, "artificial frameshift") == NULL) {
4233     sfp->location = AddIntervalToLocation (NULL, sip, start, stop, partial5, partial3);
4234   } else {
4235     slp = SeqLocFindNext (cds->location, NULL);
4236     while (slp != NULL) {
4237       start = GetOffsetInLoc (slp, mrna->location, SEQLOC_START);
4238       stop = GetOffsetInLoc (slp, mrna->location, SEQLOC_STOP);
4239       sfp->location = AddIntervalToLocation (sfp->location, sip, start, stop, partial5, partial3);
4240       slp = SeqLocFindNext (cds->location, slp);
4241     }
4242     sfp->location = SeqLocMergeEx (ns.nuc, sfp->location, NULL, FALSE, TRUE, FALSE, FALSE);
4243   }
4244   SetSeqFeatProduct (sfp, ns.prot);
4245 
4246   /* paste fields from temp copy of original cds */
4247   crp = (CdRegionPtr) temp->data.value.ptrvalue;
4248   sfp->data.value.ptrvalue = (Pointer) crp;
4249 
4250   sfp->partial = temp->partial;
4251   sfp->excpt = temp->excpt;
4252   sfp->comment = temp->comment;
4253   sfp->qual = temp->qual;
4254   sfp->title = temp->title;
4255   sfp->ext = temp->ext;
4256   sfp->cit = temp->cit;
4257   sfp->exp_ev = temp->exp_ev;
4258   sfp->xref = temp->xref;
4259   sfp->dbxref = temp->dbxref;
4260   sfp->pseudo = temp->pseudo;
4261   sfp->except_text = temp->except_text;
4262 
4263   MemFree (temp); /* do not SeqFeatFree */
4264 
4265   /* update code break locations */
4266   for (cbp = crp->code_break; cbp != NULL; cbp = cbp->next) {
4267     CheckSeqLocForPartial (cbp->loc, &partial5, &partial3);
4268     start = GetOffsetInLoc (cbp->loc, mrna->location, SEQLOC_START);
4269     stop = GetOffsetInLoc (cbp->loc, mrna->location, SEQLOC_STOP);
4270     if (start < 0 || start >= ns.nuc->length ||
4271         stop < 0 || stop >= ns.nuc->length) continue;
4272     cbp->loc = SeqLocFree (cbp->loc);
4273     cbp->loc = AddIntervalToLocation (NULL, sip, start, stop, partial5, partial3);;
4274   }
4275 
4276   /* get rid of protein_id in mRNA if it matches protein Seq-id */
4277   gbq = mrna->qual;
4278   prevqual = (GBQualPtr PNTR) &(mrna->qual);
4279   id [0] = '\0';
4280   sip = NULL;
4281   while (gbq != NULL) {
4282     nextqual = gbq->next;
4283     if (StringICmp (gbq->qual, "protein_id") == 0 &&
4284         InRightNps (gbq->val, ns.prot->id, tbl->forcelocalid)) {
4285       *(prevqual) = gbq->next;
4286       gbq->next = NULL;
4287       StringNCpy_0 (id, gbq->val, sizeof (id));
4288       GBQualFree (gbq);
4289     } else {
4290       prevqual = (GBQualPtr PNTR) &(gbq->next);
4291     }
4292     gbq = nextqual;
4293   }
4294 }
4295 
4296 /* copy gene from contig onto nuc-prot, single interval on cdna bioseq */
4297 
4298 static void CopyGene (
4299   SeqFeatPtr sfp,
4300   Pointer userdata
4301 )
4302 
4303 {
4304   BioseqPtr          bsp;
4305   SeqMgrFeatContext  gcontext;
4306   SeqFeatPtr         gene, copy, temp;
4307   GeneRefPtr         grp, xref;
4308   Boolean            partial5, partial3;
4309 
4310   /* input mrna features are multi-interval on contig */
4311 
4312   if (sfp->data.choice != SEQFEAT_RNA) return;
4313 
4314   /* find cdna product of mrna */
4315 
4316   bsp = BioseqFindFromSeqLoc (sfp->product);
4317   if (bsp == NULL) return;
4318 
4319   /* check for gene xref */
4320 
4321   xref = SeqMgrGetGeneXref (sfp);
4322   if (xref != NULL) {
4323     if (SeqMgrGeneIsSuppressed (xref)) return;
4324 
4325     /* copy gene xref for new gene feature */
4326 
4327     grp = AsnIoMemCopy (xref,
4328                         (AsnReadFunc) GeneRefAsnRead,
4329                         (AsnWriteFunc) GeneRefAsnWrite);
4330     if (grp == NULL) return;
4331 
4332     /* make new gene feature on full-length of cdna */
4333 
4334     copy = CreateNewFeatureOnBioseq (bsp, SEQFEAT_GENE, NULL);
4335     if (copy == NULL) return;
4336 
4337     copy->data.value.ptrvalue = grp;
4338     return;
4339   }
4340 
4341   /* overlapping gene should be single interval on contig */
4342 
4343   gene = SeqMgrGetOverlappingGene (sfp->location, &gcontext);
4344   if (gene == NULL) return;
4345 
4346   CheckSeqLocForPartial (gene->location, &partial5, &partial3);
4347 
4348   /* copy gene feature fields to paste into new gene feature */
4349 
4350   temp = AsnIoMemCopy (gene,
4351                        (AsnReadFunc) SeqFeatAsnRead,
4352                        (AsnWriteFunc) SeqFeatAsnWrite);
4353   if (temp == NULL) return;
4354 
4355   /* make new gene feature on full-length of cdna */
4356 
4357   copy = CreateNewFeatureOnBioseq (bsp, SEQFEAT_GENE, NULL);
4358   if (copy == NULL) {
4359     SeqFeatFree (temp);
4360     return;
4361   }
4362 
4363   /* paste fields from temp copy of original gene */
4364 
4365   copy->data.value.ptrvalue = temp->data.value.ptrvalue;
4366   copy->partial = temp->partial;
4367   copy->excpt = temp->excpt;
4368   copy->comment = temp->comment;
4369   copy->qual = temp->qual;
4370   copy->title = temp->title;
4371   copy->ext = temp->ext;
4372   copy->cit = temp->cit;
4373   copy->exp_ev = temp->exp_ev;
4374   copy->xref = temp->xref;
4375   copy->dbxref = temp->dbxref;
4376   copy->pseudo = temp->pseudo;
4377   copy->except_text = temp->except_text;
4378 
4379   SetSeqLocPartial (copy->location, partial5, partial3);
4380 
4381   SeqLocFree (temp->location);
4382   MemFree (temp); /* do not SeqFeatFree */
4383 }
4384 
4385 static void CopyNcRna (
4386   SeqFeatPtr sfp,
4387   Pointer userdata
4388 )
4389 
4390 {
4391   BioseqPtr   bsp;
4392   SeqFeatPtr  copy, temp;
4393   Boolean     partial5, partial3;
4394 
4395   if (sfp->data.choice != SEQFEAT_RNA) return;
4396   if (sfp->idx.subtype != FEATDEF_ncRNA) return;
4397 
4398   /* find instantiated product of ncRNA */
4399 
4400   bsp = BioseqFindFromSeqLoc (sfp->product);
4401   if (bsp == NULL) return;
4402 
4403   CheckSeqLocForPartial (sfp->location, &partial5, &partial3);
4404 
4405   /* copy ncRNA feature fields to paste into new ncRNA feature */
4406 
4407   temp = AsnIoMemCopy (sfp,
4408                        (AsnReadFunc) SeqFeatAsnRead,
4409                        (AsnWriteFunc) SeqFeatAsnWrite);
4410   if (temp == NULL) return;
4411 
4412   /* make new ncRNA feature on full-length of transcript */
4413 
4414   copy = CreateNewFeatureOnBioseq (bsp, SEQFEAT_RNA, NULL);
4415   if (copy == NULL) {
4416     SeqFeatFree (temp);
4417     return;
4418   }
4419 
4420   /* paste fields from temp copy of original ncRNA */
4421 
4422   copy->data.value.ptrvalue = temp->data.value.ptrvalue;
4423   copy->partial = temp->partial;
4424   copy->excpt = temp->excpt;
4425   copy->comment = temp->comment;
4426   copy->qual = temp->qual;
4427   copy->title = temp->title;
4428   copy->ext = temp->ext;
4429   copy->cit = temp->cit;
4430   copy->exp_ev = temp->exp_ev;
4431   copy->xref = temp->xref;
4432   copy->dbxref = temp->dbxref;
4433   copy->pseudo = temp->pseudo;
4434   copy->except_text = temp->except_text;
4435 
4436   SetSeqLocPartial (copy->location, partial5, partial3);
4437 
4438   SeqLocFree (temp->location);
4439   SeqLocFree (temp->product);
4440   MemFree (temp); /* do not SeqFeatFree */
4441 }
4442 
4443 static void ClearRnaProducts (
4444   SeqFeatPtr sfp,
4445   Pointer userdata
4446 )
4447 
4448 {
4449   if (sfp == NULL || sfp->data.choice != SEQFEAT_RNA) return;
4450   if (sfp->product == NULL) return;
4451 
4452   sfp->product = SeqLocFree (sfp->product);
4453 }
4454 
4455 static void RemoveGBQualIDs (
4456   SeqFeatPtr sfp,
4457   Pointer userdata
4458 )
4459 
4460 {
4461   GBQualPtr       gbq;
4462   GBQualPtr       nextqual;
4463   GBQualPtr PNTR  prevqual;
4464 
4465   if (sfp->data.choice != SEQFEAT_CDREGION && sfp->data.choice != SEQFEAT_RNA) return;
4466 
4467   gbq = sfp->qual;
4468   prevqual = (GBQualPtr PNTR) &(sfp->qual);
4469   while (gbq != NULL) {
4470     nextqual = gbq->next;
4471     if (StringICmp (gbq->qual, "transcript_id") == 0 ||
4472         StringICmp (gbq->qual, "protein_id") == 0) {
4473       *(prevqual) = gbq->next;
4474       gbq->next = NULL;
4475       GBQualFree (gbq);
4476     } else {
4477       prevqual = (GBQualPtr PNTR) &(gbq->next);
4478     }
4479     gbq = nextqual;
4480   }
4481 }
4482 
4483 typedef struct dupprot {
4484   SeqFeatPtr  firstprot;
4485   SeqFeatPtr  secondprot;
4486 } DupProt, PNTR DupProtPtr;
4487 
4488 static void FindDupProtFeats (
4489   SeqFeatPtr sfp,
4490   Pointer userdata
4491 )
4492 
4493 {
4494   DupProtPtr  dpp;
4495   ProtRefPtr  prp;
4496 
4497   if (sfp == NULL || sfp->data.choice != SEQFEAT_PROT) return;
4498   dpp = (DupProtPtr) userdata;
4499   prp = (ProtRefPtr) sfp->data.value.ptrvalue;
4500   if (dpp == NULL || prp == NULL) return;
4501   if (prp->processed != 0) return;
4502   if (dpp->firstprot == NULL) {
4503     dpp->firstprot = sfp;
4504   } else if (dpp->secondprot == NULL) {
4505     dpp->secondprot = sfp;
4506   }
4507 }
4508 
4509 static void ClearProtFeatStrand (
4510   SeqFeatPtr sfp,
4511   Pointer userdata
4512 )
4513 
4514 {
4515   SeqIntPtr  sintp;
4516   SeqLocPtr  slp;
4517 
4518   if (sfp == NULL) return;
4519   if (sfp->data.choice != SEQFEAT_REGION &&
4520       sfp->data.choice != SEQFEAT_SITE &&
4521       sfp->data.choice != SEQFEAT_BOND &&
4522       sfp->data.choice != SEQFEAT_PROT) return;
4523 
4524   slp = SeqLocFindNext (sfp->location, NULL);
4525   while (slp != NULL) {
4526     if (slp->choice == SEQLOC_INT) {
4527       sintp = (SeqIntPtr) slp->data.ptrvalue;
4528       if (sintp != NULL) {
4529         if (sintp->strand != Seq_strand_unknown) {
4530           sintp->strand = Seq_strand_unknown;
4531         }
4532       }
4533     }
4534     slp = SeqLocFindNext (sfp->location, slp);
4535   }
4536 }
4537 
4538 static void RemoveDupProtFeats (
4539   BioseqPtr bsp,
4540   Pointer userdata
4541 )
4542 
4543 {
4544   DupProt  dp;
4545 
4546   if (bsp == NULL) return;
4547   if (! ISA_aa (bsp->mol)) return;
4548   VisitFeaturesOnBsp (bsp, NULL, ClearProtFeatStrand);
4549   dp.firstprot = NULL;
4550   dp.secondprot = NULL;
4551   VisitFeaturesOnBsp (bsp, (Pointer) &dp, FindDupProtFeats);
4552   if (dp.firstprot == NULL || dp.secondprot == NULL) return;
4553   if (AsnIoMemComp ((Pointer) dp.firstprot, (Pointer) dp.secondprot, (AsnWriteFunc) SeqFeatAsnWrite)) {
4554     dp.firstprot->idx.deleteme = TRUE;
4555   }
4556 }
4557 
4558 /*
4559 static void RemoveUnnecGeneXref (
4560   SeqFeatPtr sfp,
4561   Pointer userdata
4562 )
4563 
4564 {
4565   SeqFeatXrefPtr  curr, next;
4566   SeqFeatXrefPtr  PNTR last;
4567   GeneRefPtr      grp, grpx;
4568   Boolean         redundantgenexref;
4569   SeqFeatPtr      sfpx;
4570   CharPtr         syn1, syn2;
4571 
4572   if (sfp == NULL || sfp->data.choice == SEQFEAT_GENE) return;
4573   grp = SeqMgrGetGeneXref (sfp);
4574   if (grp == NULL || SeqMgrGeneIsSuppressed (grp)) return;
4575   sfpx = SeqMgrGetOverlappingGene (sfp->location, NULL);
4576   if (sfpx == NULL || sfpx->data.choice != SEQFEAT_GENE) return;
4577   grpx = (GeneRefPtr) sfpx->data.value.ptrvalue;
4578   if (grpx == NULL) return;
4579 
4580   redundantgenexref = FALSE;
4581   if (StringDoesHaveText (grp->locus) && StringDoesHaveText (grpx->locus)) {
4582     if ((StringICmp (grp->locus, grpx->locus) == 0)) {
4583       redundantgenexref = TRUE;
4584     }
4585   } else if (StringDoesHaveText (grp->locus_tag) && StringDoesHaveText (grpx->locus_tag)) {
4586     if ((StringICmp (grp->locus_tag, grpx->locus_tag) == 0)) {
4587       redundantgenexref = TRUE;
4588     }
4589   } else if (grp->syn != NULL && grpx->syn != NULL) {
4590     syn1 = (CharPtr) grp->syn->data.ptrvalue;
4591     syn2 = (CharPtr) grpx->syn->data.ptrvalue;
4592     if (StringDoesHaveText (syn1) && StringDoesHaveText (syn2)) {
4593       if (StringICmp (syn1, syn2) == 0) {
4594         redundantgenexref = TRUE;
4595       }
4596     }
4597   }
4598 
4599   if (redundantgenexref) {
4600     last = (SeqFeatXrefPtr PNTR) &(sfp->xref);
4601     curr = sfp->xref;
4602     while (curr != NULL) {
4603       next = curr->next;
4604       if (curr->data.choice == SEQFEAT_GENE) {
4605         *last = next;
4606         curr->next = NULL;
4607         SeqFeatXrefFree (curr);
4608       } else {
4609         last = &(curr->next);
4610       }
4611       curr = next;
4612     }
4613   }
4614 }
4615 */
4616 
4617 typedef struct dummysmfedata {
4618   Int4  max;
4619   Int4  num_at_max;
4620 } DummySmfeData, PNTR DummySmfePtr;
4621 
4622 static Boolean LIBCALLBACK T2ADummySMFEProc (
4623   SeqFeatPtr sfp,
4624   SeqMgrFeatContextPtr context
4625 )
4626 
4627 
4628 {
4629   DummySmfePtr  dsp;
4630   Int4          len;
4631 
4632   if (sfp == NULL || context == NULL) return TRUE;
4633   dsp = context->userdata;
4634   if (dsp == NULL) return TRUE;
4635 
4636   len = SeqLocLen (sfp->location);
4637   if (len < dsp->max) {
4638     dsp->max = len;
4639     dsp->num_at_max = 1;
4640   } else if (len == dsp->max) {
4641     (dsp->num_at_max)++;
4642   }
4643 
4644   return TRUE;
4645 }
4646 
4647 static void FillInPartialGeneXref (
4648   SeqFeatPtr sfp,
4649   Pointer userdata
4650 )
4651 
4652 {
4653   BioseqPtr          bsp;
4654   SeqMgrFeatContext  context;
4655   SeqFeatPtr         gene;
4656   GeneRefPtr         grp, grpx;
4657 
4658   if (sfp == NULL || sfp->data.choice == SEQFEAT_GENE) return;
4659 
4660   grp = SeqMgrGetGeneXref (sfp);
4661   if (grp == NULL || SeqMgrGeneIsSuppressed (grp)) return;
4662   if (StringDoesHaveText (grp->locus) || StringHasNoText (grp->locus_tag)) return;
4663 
4664   bsp = BioseqFindFromSeqLoc (sfp->location);
4665   if (bsp == NULL) return;
4666   gene = SeqMgrGetGeneByLocusTag (bsp, grp->locus_tag, &context);
4667   if (gene == NULL || gene->data.choice != SEQFEAT_GENE) return;
4668   grpx = (GeneRefPtr) gene->data.value.ptrvalue;
4669   if (grpx == NULL) return;
4670 
4671   if (StringHasNoText (grpx->locus)) return;
4672   grp->locus = StringSave (grpx->locus);
4673 }
4674 
4675 static void RemoveUnnecGeneXref (
4676   SeqFeatPtr sfp,
4677   Pointer userdata
4678 )
4679 
4680 {
4681   Int2                 count;
4682   SeqFeatXrefPtr       curr, next;
4683   DummySmfeData        dsd;
4684   SeqMgrFeatContext    fcontext;
4685   SeqFeatXrefPtr PNTR  last;
4686   GeneRefPtr           grp, grpx;
4687   SeqFeatPtr           sfpx;
4688   CharPtr              syn1, syn2;
4689 
4690   if (sfp == NULL || sfp->data.choice == SEQFEAT_GENE) return;
4691   grp = SeqMgrGetGeneXref (sfp);
4692   if (grp == NULL || SeqMgrGeneIsSuppressed (grp)) return;
4693   sfpx = SeqMgrGetOverlappingGene (sfp->location, &fcontext);
4694   if (sfpx == NULL || sfpx->data.choice != SEQFEAT_GENE) return;
4695   grpx = (GeneRefPtr) sfpx->data.value.ptrvalue;
4696   if (grpx == NULL) return;
4697 
4698   if ((!StringHasNoText (grp->locus)) && (!StringHasNoText (grpx->locus))) {
4699     if ((StringICmp (grp->locus, grpx->locus) != 0)) return;
4700   } else if (StringDoesHaveText (grp->locus_tag) && StringDoesHaveText (grp->locus_tag)) {
4701     if ((StringICmp (grp->locus_tag, grpx->locus_tag) != 0)) return;
4702   } else if (grp->syn != NULL && grpx->syn != NULL) {
4703     syn1 = (CharPtr) grp->syn->data.ptrvalue;
4704     syn2 = (CharPtr) grpx->syn->data.ptrvalue;
4705     if ((!StringHasNoText (syn1)) && (!StringHasNoText (syn2))) {
4706       if ((StringICmp (syn1, syn2) != 0)) return;
4707     }
4708   }
4709 
4710   MemSet ((Pointer) &dsd, 0, sizeof (DummySmfeData));
4711   dsd.max = INT4_MAX;
4712   dsd.num_at_max = 0;
4713   count = SeqMgrGetAllOverlappingFeatures (sfp->location, FEATDEF_GENE, NULL, 0,
4714                                            LOCATION_SUBSET, (Pointer) &dsd, T2ADummySMFEProc);
4715 
4716   if (dsd.num_at_max < 2) {
4717     last = (SeqFeatXrefPtr PNTR) &(sfp->xref);
4718     curr = sfp->xref;
4719     while (curr != NULL) {
4720       next = curr->next;
4721       if (curr->data.choice == SEQFEAT_GENE) {
4722         *last = next;
4723         curr->next = NULL;
4724         SeqFeatXrefFree (curr);
4725       } else {
4726         last = &(curr->next);
4727       }
4728       curr = next;
4729     }
4730   }
4731 }
4732 
4733 static CharPtr RnaTypeLabel (
4734   SeqFeatPtr rna
4735 )
4736 
4737 {
4738   if (rna == NULL) return "RNA";
4739   switch (rna->idx.subtype) {
4740     case FEATDEF_preRNA :
4741       return "preRNA";
4742     case FEATDEF_mRNA :
4743       return "mRNA";
4744     case FEATDEF_tRNA :
4745       return "tRNA";
4746     case FEATDEF_rRNA :
4747       return "rRNA";
4748     case FEATDEF_snRNA :
4749       return "snRNA";
4750     case FEATDEF_scRNA :
4751       return "scRNA";
4752     case FEATDEF_otherRNA :
4753       return "otherRNA";
4754     case FEATDEF_snoRNA :
4755       return "snoRNA";
4756     case FEATDEF_ncRNA :
4757       return "ncRNA";
4758     case FEATDEF_tmRNA :
4759       return "tmRNA";
4760     default :
4761       break;
4762   }
4763   return "RNA";
4764 }
4765 
4766 static void AddRnaTitles (
4767   SeqFeatPtr rna,
4768   CharPtr organism
4769 )
4770 
4771 {
4772   BioseqPtr          bsp;
4773   SeqMgrFeatContext  ccontext;
4774   CharPtr            cdslabel = NULL;
4775   SeqMgrFeatContext  gcontext;
4776   CharPtr            genelabel = NULL;
4777   size_t             len;
4778   SeqFeatPtr         sfp;
4779   CharPtr            str;
4780   CharPtr            typ = NULL;
4781 
4782   if (rna == NULL || rna->product == NULL) return;
4783   bsp = BioseqFindFromSeqLoc (rna->product);
4784   if (bsp == NULL) return;
4785   if (! ISA_na (bsp->mol)) return;
4786   if (BioseqGetTitle (bsp) != NULL) return;
4787   sfp = SeqMgrGetNextFeature (bsp, NULL, SEQFEAT_GENE, 0, &gcontext);
4788   if (sfp != NULL) {
4789     genelabel = gcontext.label;
4790     if (StringHasNoText (genelabel)) {
4791       genelabel = NULL;
4792     }
4793   }
4794   sfp = SeqMgrGetNextFeature (bsp, NULL, SEQFEAT_CDREGION, 0, &ccontext);
4795   if (sfp != NULL) {
4796     cdslabel = ccontext.label;
4797     if (StringHasNoText (cdslabel)) {
4798       cdslabel = NULL;
4799     }
4800   }
4801   typ = RnaTypeLabel (rna); 
4802   len = StringLen (organism) + StringLen (genelabel) + StringLen (cdslabel) +
4803         StringLen (" mRNA, complete cds.") + StringLen (typ) + 10;
4804   str = (CharPtr) MemNew (len * sizeof (Char));
4805   if (str == NULL) return;
4806   str [0] = '\0';
4807 
4808   if (StringDoesHaveText (organism)) {
4809     StringCat (str, organism);
4810   }
4811   if (cdslabel != NULL) {
4812     StringCat (str, " ");
4813     StringCat (str, cdslabel);
4814   }
4815   if (genelabel != NULL) {
4816       StringCat (str, " (");
4817       StringCat (str, genelabel);
4818       StringCat (str, ")");
4819   }
4820   if (cdslabel != NULL && genelabel != NULL) {
4821     StringCat (str, " ");
4822     StringCat (str, typ);
4823     if (ccontext.partialL || ccontext.partialR) {
4824       StringCat (str, ", partial cds.");
4825     } else {
4826       StringCat (str, ", complete cds.");
4827     }
4828   } else if (genelabel != NULL) {
4829     StringCat (str, " ");
4830     StringCat (str, typ);
4831     StringCat (str, ".");
4832   }
4833   SeqDescrAddPointer (&(bsp->descr), Seq_descr_title, (Pointer) str);
4834 }
4835 
4836 static void MakeOneRnaTitle (
4837   SeqFeatPtr rna,
4838   SeqFeatPtr gene,
4839   CharPtr label,
4840   CharPtr organism,
4841   Boolean alt_splice
4842 )
4843 
4844 {
4845   BioseqPtr          bsp;
4846   SeqMgrFeatContext  ccontext;
4847   SeqFeatPtr         cds;
4848   GeneRefPtr         grp;
4849   Char               id [64];
4850   CharPtr            lbl = NULL;
4851   size_t             len;
4852   CharPtr            ptr;
4853   CharPtr            str;
4854   CharPtr            typ = NULL;
4855 
4856   if (rna == NULL || rna->product == NULL) return;
4857 
4858   grp = SeqMgrGetGeneXref (rna);
4859   if (SeqMgrGeneIsSuppressed (grp)) return;
4860   if (grp == NULL && gene != NULL) {
4861     grp = (GeneRefPtr) gene->data.value.ptrvalue;
4862   }
4863   if (grp == NULL) return;
4864 
4865   bsp = BioseqFindFromSeqLoc (rna->product);
4866   if (bsp == NULL) return;
4867   SeqIdWrite (bsp->id, id, PRINTID_TEXTID_ACC_VER, sizeof (id) - 1);
4868 
4869   typ = RnaTypeLabel (rna); 
4870   lbl = StringSaveNoNull (label);
4871 
4872   cds = SeqMgrGetNextFeature (bsp, NULL, SEQFEAT_CDREGION, 0, &ccontext);
4873 
4874   len = StringLen (organism) + StringLen (grp->locus_tag) + StringLen (grp->locus) +
4875         StringLen (id) + StringLen (" transcript variant") + StringLen (lbl) +
4876         StringLen (" mRNA, complete cds.") + StringLen (typ) + 20;
4877   str = (CharPtr) MemNew (len * sizeof (Char));
4878   if (str == NULL) return;
4879   str [0] = '\0';
4880 
4881   if (StringDoesHaveText (organism)) {
4882     StringCat (str, organism);
4883   }
4884   if (lbl != NULL) {
4885     StringCat (str, " ");
4886     ptr = StringStr (lbl, ", transcript variant ");
4887     if (ptr != NULL) {
4888       *ptr = '\0';
4889       ptr += 2;
4890       StringCat (str, lbl);
4891       if (StringDoesHaveText (grp->locus)) {
4892           StringCat (str, " (");
4893           StringCat (str, grp->locus);
4894           StringCat (str, ")");
4895       }
4896       StringCat (str, ", ");
4897       StringCat (str, ptr);
4898     } else {
4899       StringCat (str, lbl);
4900       if (StringDoesHaveText (grp->locus)) {
4901           StringCat (str, " (");
4902           StringCat (str, grp->locus);
4903           StringCat (str, ")");
4904       }
4905     }
4906   }
4907 
4908   StringCat (str, ", ");
4909   StringCat (str, typ);
4910   StringCat (str, ".");
4911 
4912   SeqDescrAddPointer (&(bsp->descr), Seq_descr_title, (Pointer) str);
4913   MemFree (lbl);
4914 }
4915 
4916 static void MakeSmartRnaTitles (
4917   BioseqPtr bsp,
4918   CharPtr organism
4919 )
4920 
4921 {
4922   SeqMgrFeatContext  context;
4923   GmcDataPtr         gdp, head;
4924   GeneRefPtr         grp;
4925   Int2               i, j, k, numgene, numrna;
4926   SeqFeatPtr         sfp;
4927 
4928   if (bsp == NULL) return;
4929 
4930   numgene = 0;
4931   numrna = 0;
4932 
4933   sfp = SeqMgrGetNextFeature (bsp, NULL, 0, 0, &context);
4934   while (sfp != NULL) {
4935     switch (sfp->data.choice) {
4936       case SEQFEAT_GENE :
4937         numgene++;
4938         break;
4939       case SEQFEAT_RNA :
4940         numrna++;
4941         break;
4942       default :
4943         break;
4944     }
4945     sfp = SeqMgrGetNextFeature (bsp, sfp, 0, 0, &context);
4946   }
4947 
4948   /* if (numgene == 0) return; */
4949 
4950   if (numrna > 0) {
4951     head = (GmcDataPtr) MemNew (sizeof (GmcData) * (size_t) (numrna + 1));
4952     if (head != NULL) {
4953       gdp = head;
4954       sfp = SeqMgrGetNextFeature (bsp, NULL, SEQFEAT_RNA, 0, &context);
4955       while (sfp != NULL) {
4956         if (sfp->product != NULL) {
4957           gdp->feat = sfp;
4958           gdp->label = context.label;
4959           grp = SeqMgrGetGeneXref (sfp);
4960           if (grp == NULL || (! SeqMgrGeneIsSuppressed (grp))) {
4961             gdp->gene = SeqMgrGetOverlappingGene (sfp->location, NULL);
4962           }
4963           gdp++;
4964         }
4965         sfp = SeqMgrGetNextFeature (bsp, sfp, SEQFEAT_RNA, 0, &context);
4966       }
4967       HeapSort (head, (size_t) numrna, sizeof (GmcData), SortByGenePtr);
4968       for (i = 0; i < numrna; i += j) {
4969         sfp = head [i].gene;
4970         for (j = 1; i + j < numrna && sfp == head [i + j].gene; j++) continue;
4971         if (j == 1) {
4972           /* no alt splicing */
4973           MakeOneRnaTitle (head [i].feat, head [i].gene, head [i].label, organism, FALSE);
4974         } else {
4975           /* is alt splicing */
4976           for (k = 0; k < j; k++) {
4977             MakeOneRnaTitle (head [i + k].feat, head [i + k].gene, head [i + k].label, organism, TRUE);
4978           }
4979         }
4980       }
4981     }
4982     MemFree (head);
4983   }
4984 }
4985 
4986 typedef struct gosearch {
4987   TextFsaPtr  gotags;
4988   Boolean     isbad;
4989 } GoSearch, PNTR GoSearchPtr;
4990 
4991 static void LookForGo (
4992   SeqFeatPtr sfp,
4993   Pointer userdata
4994 )
4995 
4996 {
4997   Char         ch;
4998   GoSearchPtr  gsp;
4999   CharPtr      ptr;
5000   Int4         state;
5001   ValNodePtr   matches;
5002 
5003   if (sfp == NULL || StringHasNoText (sfp->comment)) return;
5004   gsp = (GoSearchPtr) userdata;
5005 
5006   state = 0;
5007   ptr = sfp->comment;
5008   ch = *ptr;
5009   while (ch != '\0') {
5010     matches = NULL;
5011     state = TextFsaNext (gsp->gotags, state, ch, &matches);
5012     if (matches != NULL) {
5013       gsp->isbad = TRUE;
5014     }
5015     ptr++;
5016     ch = *ptr;
5017   }
5018 }
5019 
5020 static Boolean HasGoTermsInNote (
5021   SeqEntryPtr sep,
5022   TextFsaPtr gotags
5023 )
5024 
5025 {
5026   GoSearch  gs;
5027 
5028   gs.gotags = gotags;
5029   gs.isbad = FALSE;
5030   VisitFeaturesInSep (sep, (Pointer) &gs, LookForGo);
5031   return gs.isbad;
5032 }
5033 
5034 static void TakeProteinsFromGPS (
5035   BioseqPtr bsp,
5036   Pointer userdata
5037 )
5038 
5039 {
5040   SeqEntryPtr PNTR  lastp;
5041   SeqEntryPtr       sep;
5042 
5043   if (bsp == NULL || (! ISA_aa (bsp->mol))) return;
5044   lastp = (SeqEntryPtr PNTR) userdata;
5045   if (lastp == NULL) return;
5046 
5047   /* link copy after genomic sequence */
5048 
5049   bsp = (BioseqPtr) AsnIoMemCopy ((Pointer) bsp,
5050                                   (AsnReadFunc) BioseqAsnRead,
5051                                   (AsnWriteFunc) BioseqAsnWrite);
5052   sep = ValNodeAddPointer (lastp, 1, (Pointer) bsp);
5053   *lastp = sep;
5054 }
5055 
5056 static void GPStoNPS (
5057   SeqEntryPtr top,
5058   Uint2 entityID
5059 )
5060 
5061 {
5062   BioseqSetPtr  bssp;
5063   BioseqSetPtr  dum;
5064   SeqEntryPtr   last, sep;
5065   Uint2         parenttype;
5066   Pointer       parentptr;
5067 
5068   if (top == NULL || top->choice != 2) {
5069     Message (MSG_POSTERR, "GPStoNPS failed at top || top->choice");
5070     return;
5071   }
5072   bssp = (BioseqSetPtr) top->data.ptrvalue;
5073   if (bssp == NULL || bssp->_class != BioseqseqSet_class_gen_prod_set) {
5074     Message (MSG_POSTERR, "GPStoNPS failed at bssp || bssp->_class");
5075     return;
5076   }
5077 
5078   GetSeqEntryParent (top, &parentptr, &parenttype);
5079 
5080   /* point to genomic Bioseq component of gps */
5081 
5082   sep = bssp->seq_set;
5083   if (sep == NULL || sep->choice != 1) {
5084     Message (MSG_POSTERR, "GPStoNPS failed at sep || sep->choice");
5085     return;
5086   }
5087 
5088   /* unlink nuc-prot sets, etc., from genomic Bioseq */
5089 
5090   dum = BioseqSetNew ();
5091   if (dum == NULL) {
5092     Message (MSG_POSTERR, "GPStoNPS failed at BioseqSetNew");
5093     return;
5094   }
5095   dum->_class = 1;
5096   dum->seq_set = sep->next;
5097   sep->next = NULL;
5098 
5099   last = sep;
5100   VisitBioseqsInSet (dum, (Pointer) &last, TakeProteinsFromGPS);
5101 
5102   bssp->_class = BioseqseqSet_class_nuc_prot;
5103 
5104   SeqMgrLinkSeqEntry (top, parenttype, parentptr);
5105 
5106   SeqMgrClearFeatureIndexes (bssp->idx.entityID, NULL);
5107 
5108   VisitFeaturesInSet (bssp, NULL, ClearRnaProducts);
5109 
5110   move_cds (top);
5111 
5112   /* in case result has no proteins, demote to bioseq */
5113 
5114   RenormalizeNucProtSets (top, TRUE);
5115 
5116   /* cleanup original nuc-prot sets */
5117 
5118   BioseqSetFree (dum);
5119 }
5120 
5121 static void GeneralToNote (
5122   SeqFeatPtr sfp,
5123   Pointer userdata
5124 )
5125 
5126 {
5127   BioseqPtr  bsp;
5128   Char       buf [41];
5129   DbtagPtr   dbt;
5130   size_t     len;
5131   SeqIdPtr   sip;
5132   CharPtr    str;
5133 
5134   if (sfp == NULL || sfp->product == NULL) return;
5135   if (sfp->data.choice != SEQFEAT_RNA) return;
5136 
5137   bsp = BioseqFindFromSeqLoc (sfp->product);
5138   if (bsp == NULL) return;
5139 
5140   for (sip = bsp->id; sip != NULL; sip = sip->next) {
5141     if (sip->choice != SEQID_GENERAL) continue;
5142     dbt = (DbtagPtr) sip->data.ptrvalue;
5143     if (dbt == NULL) continue;
5144     if (StringICmp (dbt->db, "TMSMART") == 0 || StringICmp (dbt->db, "NCBIFILE") == 0) continue;
5145 
5146     SeqIdWrite (sip, buf, PRINTID_REPORT, sizeof (buf) - 1);
5147 
5148     if (sfp->comment == NULL) {
5149       sfp->comment = StringSave (buf);
5150     } else {
5151       len = StringLen (sfp->comment) + StringLen (buf) + 5;
5152       str = MemNew (sizeof (Char) * len);
5153       StringCpy (str, sfp->comment);
5154       StringCat (str, "; ");
5155       StringCat (str, buf);
5156       sfp->comment = MemFree (sfp->comment);
5157       sfp->comment = str;
5158     }
5159   }
5160 }
5161 
5162 static SeqEntryPtr PropagateDescsFromGenBankSet (
5163   SeqEntryPtr sep
5164 )
5165 
5166 {
5167   BioseqPtr     bsp;
5168   BioseqSetPtr  bssp;
5169   SeqEntryPtr   firstsep = NULL;
5170   SeqEntryPtr   seqentry;
5171   ValNodePtr    sourcedescr;
5172 
5173   if (sep == NULL) return NULL;
5174   if (! IS_Bioseq_set (sep)) return sep;
5175   bssp = (BioseqSetPtr) sep->data.ptrvalue;
5176   if (bssp == NULL) return sep;
5177   sourcedescr = bssp->descr;
5178   if (sourcedescr == NULL) return sep;
5179   firstsep = bssp->seq_set;
5180   seqentry = firstsep;
5181   while (seqentry != NULL) {
5182     if (seqentry->data.ptrvalue != NULL) {
5183       if (seqentry->choice == 1) {
5184         bsp = (BioseqPtr) seqentry->data.ptrvalue;
5185         ValNodeLink (&(bsp->descr),
5186                      AsnIoMemCopy ((Pointer) sourcedescr,
5187                                    (AsnReadFunc) SeqDescrAsnRead,
5188                                    (AsnWriteFunc) SeqDescrAsnWrite));
5189       } else if (seqentry->choice == 2) {
5190         bssp = (BioseqSetPtr) seqentry->data.ptrvalue;
5191         ValNodeLink (&(bssp->descr),
5192                      AsnIoMemCopy ((Pointer) sourcedescr,
5193                                    (AsnReadFunc) SeqDescrAsnRead,
5194                                    (AsnWriteFunc) SeqDescrAsnWrite));
5195       }
5196     }
5197     seqentry = seqentry->next;
5198   }
5199   bssp = (BioseqSetPtr) sep->data.ptrvalue;
5200   bssp->descr = SeqDescrFree (bssp->descr);
5201   NormalizeDescriptorOrder (sep);
5202   return firstsep;
5203 }
5204 
5205 typedef struct srcdata {
5206   Boolean  isSeqId;
5207   Boolean  isOrganism;
5208   Uint1    orgmodType;
5209   Uint1    subsourceType;
5210 } SrcData, PNTR SrcDataPtr;
5211 
5212 static void ParseOneOrgLabel (
5213   SrcDataPtr field,
5214   CharPtr label
5215 )
5216 
5217 {
5218   Int2  i;
5219 
5220   if (field == NULL || StringHasNoText (label)) return;
5221 
5222   if (StringICmp (label, "local_id") == 0 ||
5223       StringICmp (label, "local id") == 0 ||
5224       StringICmp (label, "SequenceID") == 0 ||
5225       StringICmp (label, "Sequence_ID") == 0 ||
5226       StringICmp (label, "Sequence ID") == 0 ||
5227       StringICmp (label, "SeqID") == 0 ||
5228       StringICmp (label, "Seq_ID") == 0 ||
5229       StringICmp (label, "Seq ID") == 0) {
5230     field->isSeqId = TRUE;
5231     return;
5232   }
5233   if (StringICmp (label, "organism") == 0) {
5234     field->isOrganism = TRUE;
5235     return;
5236   }
5237 
5238   i = EquivalentOrgMod (label);
5239   if (i != 0) {
5240     field->orgmodType = (Uint1) i;
5241     return;
5242   }
5243   i = EquivalentSubSource (label);
5244   if (i != 0) {
5245     field->subsourceType = (Uint1) i;
5246     return;
5247   }
5248   if (StringICmp (label, "note") == 0) {
5249     field->subsourceType = (Uint1) SUBSRC_other;
5250   }
5251 }
5252 
5253 static void ProcessSourceTable (
5254   FILE *fp
5255 )
5256 
5257 {
5258   BioSourcePtr  biop;
5259   BioseqPtr     bsp;
5260   CharPtr       columns [80];
5261   FileCache     fc;
5262   SrcData       fields [80];
5263   Int2          i, numfields;
5264   Char          line [4095];
5265   OrgModPtr     omp;
5266   OrgNamePtr    onp;
5267   OrgRefPtr     orp;
5268   CharPtr       ptr, str;
5269   SeqDescrPtr   sdp;
5270   SeqIdPtr      sip;
5271   SubSourcePtr  ssp;
5272 
5273   if (fp == NULL) return;
5274 
5275   MemSet ((Pointer) fields, 0, sizeof (fields));
5276   numfields = 0;
5277 
5278   FileCacheSetup (&fc, fp);
5279 
5280   /* read first line with field names */
5281 
5282   str = FileCacheReadLine (&fc, line, sizeof (line), NULL);
5283   if (str == NULL) return;
5284 
5285   TrimSpacesAroundString (str);
5286   while (StringDoesHaveText (str) && numfields < 78) {
5287     ptr = StringChr (str, '\t');
5288     if (ptr != NULL) {
5289       *ptr = '\0';
5290       ptr++;
5291     }
5292     TrimSpacesAroundString (str);
5293     ParseOneOrgLabel (&(fields [numfields]), str);
5294     numfields++;
5295     str = ptr;
5296   }
5297 
5298   if (! fields [0].isSeqId) return;
5299 
5300   /* read remaining lines with source data */
5301 
5302   str = FileCacheReadLine (&fc, line, sizeof (line), NULL);
5303   while (str != NULL) {
5304 
5305     MemSet ((Pointer) columns, 0, sizeof (columns));
5306 
5307     TrimSpacesAroundString (str);
5308     i = 0;
5309     while (StringDoesHaveText (str) && i < numfields) {
5310       ptr = StringChr (str, '\t');
5311       if (ptr != NULL) {
5312         *ptr = '\0';
5313         ptr++;
5314       }
5315       TrimSpacesAroundString (str);
5316       columns [i] = str;
5317       i++;
5318       str = ptr;
5319     }
5320 
5321     if (StringDoesHaveText (columns [0])) {
5322       sip = MakeSeqID (columns [0]);
5323       if (sip != NULL) {
5324         bsp = BioseqFind (sip);
5325         if (bsp != NULL) {
5326           biop = NULL;
5327           sdp = GetNextDescriptorUnindexed (bsp, Seq_descr_source, NULL);
5328           if (sdp != NULL) {
5329             biop = (BioSourcePtr) sdp->data.ptrvalue;
5330           }
5331           if (biop == NULL) {
5332             biop = BioSourceNew ();
5333             if (biop != NULL) {
5334               SeqDescrAddPointer (&(bsp->descr), Seq_descr_source, (Pointer) biop);
5335             }
5336           }
5337           if (biop != NULL) {
5338             for (i = 1; i < numfields; i++) {
5339               if (StringHasNoText (columns [i])) continue;
5340               if (fields [i].isOrganism) {
5341                 if (biop->org == NULL) {
5342                   biop->org = OrgRefNew ();
5343                 }
5344                 orp = biop->org;
5345                 if (orp != NULL) {
5346                   orp->taxname = MemFree (orp->taxname);
5347                   orp->taxname = StringSave (columns [i]);
5348                 }
5349               } else if (fields [i].orgmodType > 0) {
5350                 if (biop->org == NULL) {
5351                   biop->org = OrgRefNew ();
5352                 }
5353                 orp = biop->org;
5354                 if (orp != NULL) {
5355                   if (orp->orgname == NULL) {
5356                     orp->orgname = OrgNameNew ();
5357                   }
5358                   onp = orp->orgname;
5359                   if (onp != NULL) {
5360                     omp = OrgModNew ();
5361                     if (omp != NULL) {
5362                       omp->subtype = (Uint1) fields [i].orgmodType;
5363                       omp->subname = StringSave (columns [i]);
5364                       omp->next = onp->mod;
5365                       onp->mod = omp;
5366                     }
5367                   }
5368                 }
5369               } else if (fields [i].subsourceType > 0) {
5370                 ssp = SubSourceNew ();
5371                 if (ssp != NULL) {
5372                   ssp->subtype = (Uint1) fields [i].subsourceType;
5373                   ssp->name = StringSave (columns [i]);
5374                   ssp->next = biop->subtype;
5375                   biop->subtype = ssp;
5376                 }
5377               }
5378             }
5379           }
5380         }
5381         sip = SeqIdFree (sip);
5382       }
5383     }
5384 
5385     str = FileCacheReadLine (&fc, line, sizeof (line), NULL);
5386   }
5387 }
5388 
5389 static SeqDescrPtr GetDescriptorTypeAlreadyInList (
5390   Uint1 descr_choice,
5391   SeqDescrPtr list
5392 )
5393 
5394 {
5395   while (list != NULL && list->choice != descr_choice) {
5396     list = list->next;
5397   }
5398   return list;
5399 }
5400 
5401 static void AddTemplateDescriptors (
5402   SeqDescrPtr PNTR current_list,
5403   SeqDescrPtr new_list,
5404   Boolean copy
5405 )
5406 
5407 {
5408   SeqDescrPtr  dsc, sdp_next, sdp;
5409 
5410   if (current_list == NULL || new_list == NULL) return;
5411 
5412   for (sdp = new_list; sdp != NULL; sdp = sdp_next) {
5413     sdp_next = sdp->next;
5414     if (sdp->choice == Seq_descr_molinfo) continue;
5415     if (sdp->choice == Seq_descr_source &&
5416         GetDescriptorTypeAlreadyInList (Seq_descr_source, *current_list) != NULL) continue;
5417     sdp->next = NULL;
5418     if (copy) {
5419       dsc = AsnIoMemCopy ((Pointer) sdp,
5420                           (AsnReadFunc) SeqDescrAsnRead,
5421                           (AsnWriteFunc) SeqDescrAsnWrite);
5422     } else {
5423       dsc = sdp;
5424     }
5425     ValNodeLink (current_list, (Pointer) dsc);
5426     sdp->next = sdp_next;
5427   }
5428 }
5429 
5430 static void GenomizeSeqId (
5431   SeqIdPtr sip,
5432   Pointer userdata
5433 )
5434 
5435 {
5436   CharPtr      accn = NULL;
5437   CharPtr      center;
5438   DbtagPtr     dbt;
5439   ObjectIdPtr  oip;
5440 
5441   if (sip == NULL || sip->choice != SEQID_LOCAL) return;
5442   center = (CharPtr) userdata;
5443   if (StringHasNoText (center)) return;
5444 
5445   oip = (ObjectIdPtr) sip->data.ptrvalue;
5446   if (oip == NULL) return;
5447   accn = oip->str;
5448   if (StringHasNoText (accn)) return;
5449 
5450   dbt = DbtagNew ();
5451   if (dbt == NULL) return;
5452   oip = ObjectIdNew ();
5453   if (oip == NULL) return;
5454   oip->str = StringSave (accn);
5455   dbt->db = StringSave (center);
5456   dbt->tag = oip;
5457 
5458   sip->data.ptrvalue = ObjectIdFree ((ObjectIdPtr) sip->data.ptrvalue);
5459   sip->data.ptrvalue = (Pointer) dbt;
5460   sip->choice = SEQID_GENERAL;
5461 }
5462 
5463 static void GenomizeFeatureSeqIds (
5464   SeqFeatPtr sfp,
5465   Pointer userdata
5466 )
5467 
5468 {
5469   VisitSeqIdsInSeqLoc (sfp->location, userdata, GenomizeSeqId);
5470 }
5471 
5472 static void GenomizeGraphSeqIds (
5473   SeqGraphPtr sgp,
5474   Pointer userdata
5475 )
5476 
5477 {
5478   VisitSeqIdsInSeqGraph (sgp, userdata, GenomizeSeqId);
5479 }
5480 
5481 static void MakeGenomeCenterID (
5482   BioseqPtr bsp,
5483   Pointer userdata
5484 )
5485 
5486 {
5487   CharPtr  center;
5488 
5489   if (bsp == NULL) return;
5490   center = (CharPtr) userdata;
5491   if (StringHasNoText (center)) return;
5492 
5493   VisitSeqIdsInBioseq (bsp, userdata, GenomizeSeqId);
5494   SeqMgrReplaceInBioseqIndex (bsp);
5495   VisitFeaturesOnBsp (bsp, userdata, GenomizeFeatureSeqIds);
5496   VisitGraphsOnBsp (bsp, userdata, GenomizeGraphSeqIds);
5497 }
5498 
5499 static void MakeAccessionID (
5500   BioseqPtr bsp,
5501   Pointer userdata
5502 )
5503 
5504 {
5505   CharPtr     accn;
5506   ValNodePtr  generalIDs;
5507   SeqIdPtr    sip;
5508 
5509   if (bsp == NULL) return;
5510   if (! ISA_na (bsp->mol)) return;
5511   accn = (CharPtr) userdata;
5512   if (StringHasNoText (accn)) return;
5513 
5514   /* if existing accession, coerce all SeqIds */
5515 
5516   sip = SeqIdFromAccession (accn, INT2_MIN, NULL);
5517   if (sip == NULL) return;
5518   generalIDs = ValNodeExtractList (&(bsp->id), SEQID_GENERAL);
5519   bsp->id = SeqIdSetFree (bsp->id);
5520   bsp->id = sip;
5521   if (generalIDs != NULL) {
5522     ValNodeLink (&(bsp->id), generalIDs);
5523   }
5524   SeqMgrReplaceInBioseqIndex (bsp);
5525   VisitFeaturesOnBsp (bsp, (Pointer) bsp->id, CorrectFeatureSeqIds);
5526   VisitGraphsOnBsp (bsp, (Pointer) bsp->id, CorrectGraphSeqIds);
5527 }
5528 
5529 static void FindCreateDate (
5530   SeqDescrPtr sdp,
5531   Pointer userdata
5532 )
5533 
5534 {
5535   BoolPtr  has_create_dateP;
5536 
5537   if (sdp == NULL || sdp->choice != Seq_descr_create_date || userdata == NULL) return;
5538   has_create_dateP = (BoolPtr) userdata;
5539   *has_create_dateP = TRUE;
5540 }
5541 
5542 static void ConvertStructuredComment (
5543   SeqDescrPtr sdp,
5544   Pointer userdata
5545 )
5546 
5547 {
5548   SeqDescrPtr    com;
5549   CharPtr        prefix = NULL;
5550   CharPtr        str;
5551   UserObjectPtr  uop = NULL;
5552 
5553   if (sdp == NULL || sdp->choice != Seq_descr_comment) return;
5554   str = (CharPtr) sdp->data.ptrvalue;
5555   if (StringHasNoText (str)) return;
5556 
5557   if (StringStr (str, "##HIVData-START##") != NULL &&
5558       StringStr (str, "##HIVData-END##") != NULL) {
5559     prefix = StringStr (str, "##HIVData-START##");
5560     uop = ParseStringIntoStructuredComment (NULL, str, "##HIVData-START##",
5561                                             "##HIVData-END##");
5562   } else if (StringStr (str, "##FluData-START##") != NULL &&
5563              StringStr (str, "##FluData-END##") != NULL) {
5564     prefix = StringStr (str, "##FluData-START##");
5565     uop = ParseStringIntoStructuredComment (NULL, str, "##FluData-START##",
5566                                             "##FluData-END##");
5567   }
5568   if (uop == NULL) return;
5569 
5570   /* if there is text before prefix, truncate existing comment and append user object */
5571 
5572   if (prefix != NULL) {
5573     *prefix = '\0';
5574     TrimSpacesAroundString (str);
5575     if (StringDoesHaveText (str)) {
5576       com = SeqDescrNew (NULL);
5577       if (com != NULL) {
5578         com->choice = Seq_descr_user;
5579         com->data.ptrvalue = uop;
5580         com->next = sdp->next;
5581         sdp->next = com;
5582         return;
5583       }
5584     }
5585   }
5586 
5587   /* if entire comment was structured, replace existing descriptor with user object */
5588 
5589   MemFree (sdp->data.ptrvalue);
5590   sdp->choice = Seq_descr_user;
5591   sdp->data.ptrvalue = uop;
5592 }
5593 
5594 static void CleanUpLatLonAndCountry (
5595   BioSourcePtr biop,
5596   Pointer userdata
5597 )
5598 
5599 {
5600   CharPtr       fix_lat_lon;
5601   Boolean       format_ok = FALSE;
5602   CharPtr       lat_lon = NULL;
5603   Boolean       lat_in_range = FALSE;
5604   Boolean       lon_in_range = FALSE;
5605   CharPtr PNTR  list;
5606   CharPtr       new_country;
5607   SubSourcePtr  ssp;
5608 
5609   if (biop == NULL) return;
5610   list = (CharPtr PNTR) userdata;
5611   if (list == NULL) return;
5612 
5613   for (ssp = biop->subtype; ssp != NULL; ssp = ssp->next) {
5614     if (ssp->subtype == SUBSRC_country && StringDoesHaveText (ssp->name)) {
5615       new_country = GetCountryFix (ssp->name, list);
5616       if (new_country != NULL) {
5617         ssp->name = MemFree (ssp->name);
5618         ssp->name = new_country;
5619       }
5620     } else if (ssp->subtype == SUBSRC_lat_lon && StringDoesHaveText (ssp->name)) {
5621       lat_lon = ssp->name;
5622       IsCorrectLatLonFormat (lat_lon, &format_ok, &lat_in_range, &lon_in_range);
5623       if (! format_ok) {
5624         fix_lat_lon = FixLatLonFormat (lat_lon);
5625         if (fix_lat_lon != NULL) {
5626           ssp->name = MemFree (ssp->name);
5627           ssp->name = fix_lat_lon;
5628         }
5629       }
5630     }
5631   }
5632 }
5633 
5634 static void LookupPubdesc (
5635   PubdescPtr pdp,
5636   Pointer userdata
5637 )
5638 
5639 {
5640   CitArtPtr        cap;
5641   MedlineEntryPtr  mep;
5642   PubmedEntryPtr   pep;
5643   Int4             pmid = 0;
5644   ValNodePtr       vnp;
5645 
5646   if (pdp == NULL) return;
5647 
5648   for (vnp = pdp->pub; vnp != NULL; vnp = vnp->next) {
5649     switch (vnp->choice) {
5650       case PUB_Muid :
5651         /* ignore obsolete muids */
5652         break;
5653       case PUB_PMid :
5654         pmid = vnp->data.intvalue;
5655         break;
5656       default :
5657         /* return on real pub */
5658         return;
5659         break;
5660     }
5661   }
5662 
5663   if (pmid == 0) return;
5664 
5665   pep = GetPubMedForUid (pmid);
5666   if (pep == NULL) return;
5667   mep = (MedlineEntryPtr) pep->medent;
5668   if (mep != NULL && mep->cit != NULL) {
5669     cap = AsnIoMemCopy ((Pointer) mep->cit,
5670                         (AsnReadFunc) CitArtAsnRead,
5671                         (AsnWriteFunc) CitArtAsnWrite);
5672     ValNodeAddPointer (&(pdp->pub), PUB_Article, (Pointer) cap);
5673   }
5674 
5675   PubmedEntryFree (pep);
5676 }
5677 
5678 
5679 
5680 #ifdef INTERNAL_NCBI_ASNDISC
5681 const PerformDiscrepancyTest taxlookup = CheckTaxNamesAgainstTaxDatabase;
5682 #else
5683 const PerformDiscrepancyTest taxlookup = NULL;
5684 #endif
5685 
5686 
5687 static void CleanupCollectionDatesMonthFirst (BioSourcePtr biop, Pointer data)
5688 {
5689   SubSourcePtr ssp;
5690   CharPtr      reformatted_date = NULL;
5691 
5692   if (biop == NULL) return;
5693 
5694   ssp = biop->subtype;
5695   while (ssp != NULL)
5696   {
5697     if (ssp->subtype == SUBSRC_collection_date)
5698     {
5699       reformatted_date = ReformatDateStringEx (ssp->name, TRUE, NULL);
5700       if (reformatted_date != NULL)
5701       {
5702         ssp->name = MemFree (ssp->name);
5703         ssp->name = reformatted_date;
5704       }
5705     }
5706     ssp = ssp->next;
5707   }
5708 }
5709 
5710 
5711 static void CleanupCollectionDatesDayFirst (BioSourcePtr biop, Pointer data)
5712 {
5713   SubSourcePtr ssp;
5714   CharPtr      reformatted_date = NULL;
5715 
5716   if (biop == NULL) return;
5717 
5718   ssp = biop->subtype;
5719   while (ssp != NULL)
5720   {
5721     if (ssp->subtype == SUBSRC_collection_date)
5722     {
5723       reformatted_date = ReformatDateStringEx (ssp->name, FALSE, NULL);
5724       if (reformatted_date != NULL)
5725       {
5726         ssp->name = MemFree (ssp->name);
5727         ssp->name = reformatted_date;
5728       }
5729     }
5730     ssp = ssp->next;
5731   }
5732 }
5733 
5734 
5735 static void ValNodeLinkCopy (ValNodePtr PNTR list1, ValNodePtr list2)
5736 {
5737   if (list1 == NULL) return;
5738   while (list2 != NULL)
5739   {
5740     ValNodeAddPointer (list1, list2->choice, list2->data.ptrvalue);
5741     list2 = list2->next;
5742   }
5743 }
5744 
5745 static ValNodePtr FindItemListForClickableItemCategory (ValNodePtr list, CharPtr category_fmt)
5746 {
5747   ClickableItemPtr cip;
5748   ValNodePtr       vnp;
5749   ValNodePtr       item_list = NULL;
5750   CharPtr          cp;
5751 
5752   if (StringLen (category_fmt) < 2) {
5753     return NULL;
5754   }
5755   for (vnp = list; vnp != NULL; vnp = vnp->next) {
5756     cip = (ClickableItemPtr) vnp->data.ptrvalue;
5757     if (cip != NULL) {
5758       if (cip->description != NULL) {
5759         /* skip number at beginning of category title */
5760         cp = cip->description;
5761         while (isdigit (*cp)) {
5762           cp++;
5763         }
5764         if (StringCmp (cp, category_fmt + 2) == 0) {
5765           ValNodeLinkCopy (&item_list, cip->item_list);
5766         }
5767       }
5768       ValNodeLink (&item_list, FindItemListForClickableItemCategory (cip->subcategories, category_fmt));
5769     }
5770   }
5771   return item_list;
5772 }
5773 
5774 
5775 static void DoTbl2AsnCleanup (SeqEntryPtr sep, CleanupArgsPtr c)
5776 {
5777   ValNodePtr sep_list = NULL;
5778   ValNodePtr discrepancy_list = NULL, item_list = NULL, vnp;
5779   SeqFeatPtr sfp;
5780 
5781   if (sep == NULL || c == NULL) {
5782     return;
5783   }
5784 
5785   if (c->collection_dates) {
5786     if (c->collection_dates_month_first) {
5787       VisitBioSourcesInSep (sep, NULL, CleanupCollectionDatesMonthFirst);
5788     } else {
5789       VisitBioSourcesInSep (sep, NULL, CleanupCollectionDatesDayFirst);
5790     }
5791   }
5792   if (c->add_notes_to_overlapping_cds_without_abc) {
5793     ValNodeAddPointer (&sep_list, 0, sep);
5794     SeqMgrIndexFeatures (ObjMgrGetEntityIDForChoice (sep), NULL);
5795     AddOverlappingCodingRegionDiscrepancies (&discrepancy_list, sep_list);
5796     sep_list = ValNodeFree (sep_list);
5797     item_list = FindItemListForClickableItemCategory (discrepancy_list, kOverlappingCDSNeedsNoteFmt);
5798     discrepancy_list = FreeClickableList (discrepancy_list);
5799     for (vnp = item_list; vnp != NULL; vnp = vnp->next) {
5800       if (vnp->choice == OBJ_SEQFEAT) {
5801         sfp = (SeqFeatPtr) vnp->data.ptrvalue;
5802         if (sfp != NULL) {
5803           SetStringValue (&(sfp->comment), kOverlappingCDSNoteText, ExistingTextOption_append_semi);
5804         }
5805       }
5806     }
5807     item_list = ValNodeFree (item_list);
5808   }
5809 }
5810 
5811 
5812 static void SeqEntryHasConflictingIDsCallback (BioseqPtr bsp, Pointer data)
5813 {
5814   CharPtr msg, fmt = "SeqID %s is present on multiple Bioseqs in record";
5815   BioseqPtr bsp2;
5816   SeqIdPtr sip;
5817   DbtagPtr dbt;
5818   Char     buf[100];
5819 
5820   if (bsp == NULL || data == NULL) {
5821     return;
5822   }
5823 
5824   for (sip = bsp->id; sip != NULL; sip = sip->next) {
5825     if (sip->choice == SEQID_GENERAL 
5826         && (dbt = (DbtagPtr) sip->data.ptrvalue) != NULL
5827         && StringICmp (dbt->db, "NCBIFILE") == 0) {
5828       continue;
5829         }
5830     bsp2 = BioseqFindSpecial (sip);
5831     if (bsp2 != NULL && bsp2 != bsp) {
5832       SeqIdWrite (sip, buf, PRINTID_FASTA_SHORT, sizeof (buf) - 1);
5833       msg = (CharPtr) MemNew (sizeof (Char) * (StringLen (fmt) + StringLen (buf)));
5834       sprintf (msg, fmt, buf);
5835       ValNodeAddPointer ((ValNodePtr PNTR) data, 0, msg);
5836     }
5837   }
5838 }
5839 
5840 
5841 static Boolean SeqEntryHasConflictingIDs (SeqEntryPtr sep)
5842 {
5843   ValNodePtr errs = NULL, vnp;
5844 
5845   VisitBioseqsInSep (sep, &errs, SeqEntryHasConflictingIDsCallback);
5846   if (errs == NULL) {
5847     return FALSE;
5848   } else {
5849     ValNodeUnique (&errs, SortVnpByString, ValNodeFreeData);
5850     for (vnp = errs; vnp != NULL; vnp = vnp->next) {
5851       Message (MSG_POSTERR, vnp->data.ptrvalue);
5852     }
5853     errs = ValNodeFreeData (errs);
5854     return TRUE;
5855   }
5856 }
5857 
5858 
5859 static void ProcessOneRecord (
5860   SubmitBlockPtr sbp,
5861   PubdescPtr pdp,
5862   BioSourcePtr src,
5863   CharPtr directory,
5864   CharPtr results,
5865   CharPtr base,
5866   CharPtr suffix,
5867   SeqDescrPtr sdphead,
5868   TblArgsPtr tbl,
5869   TextFsaPtr gotags,
5870   AsnIoPtr aip,
5871   CharPtr outfile
5872 )
5873 
5874 {
5875   AsnTypePtr         atp_bssse;
5876   BioSourcePtr       biop;
5877   BioseqPtr          bsp;
5878   BioseqSetPtr       bssp = NULL;
5879   Char               buf [256];
5880   SeqMgrFeatContext  context;
5881   Pointer            dataptr;
5882   Uint2              datatype, entityID;
5883   SeqDescrPtr        descr;
5884   DatePtr            dp;
5885   BioseqSetPtr       dssp;
5886   Boolean            failure = FALSE;
5887   FileCache          fc;
5888   FILE               *fp;
5889   Int2               genCode;
5890   Boolean            goOn;
5891   SeqEntryPtr        gsep = NULL;
5892   Boolean            has_create_date;
5893   SeqGraphPtr        lastsgp;
5894   Int4               linenum = 0;
5895   CharPtr PNTR       list;
5896   CharPtr            localname = NULL;
5897   MolInfoPtr         mip;
5898   ErrSev             msev;
5899   Boolean            nonewline;
5900   BioseqPtr          nucbsp;
5901   ObjMgrDataPtr      omdp;
5902   CharPtr            organism;
5903   OrgRefPtr          orp;
5904   BioseqPtr          protbsp;
5905   SeqEntryPtr        protsep;
5906   CharPtr            ptr;
5907   SeqAnnotPtr        sap;
5908   SeqDescrPtr        sdp;
5909   SeqEntryPtr        sep;
5910   SeqFeatPtr         sfp;
5911   CharPtr            sfx = NULL;
5912   SeqGraphPtr        sgp;
5913   SeqIdPtr           sip;
5914   SeqSubmitPtr       sub;
5915   SimpleSeqPtr       ssp;
5916   CharPtr            str;
5917   CharPtr            tblfile = NULL;
5918   SeqEntryPtr        tmp;
5919   MolInfoPtr         template_molinfo = NULL;
5920   ValNodePtr         cmt_errors, vnp;
5921 
5922   fp = OpenOneFile (directory, base, suffix);
5923   if (fp == NULL) return;
5924 
5925   if (tbl->logtoterminal) {
5926     Message (MSG_POSTERR, "File %s", base);
5927   }
5928 
5929   /* if genomic product set, make parent set */
5930 
5931   if (tbl->genprodset) {
5932     bssp = BioseqSetNew ();
5933     if (bssp == NULL) return;
5934     bssp->_class = BioseqseqSet_class_gen_prod_set;
5935 
5936     gsep = SeqEntryNew ();
5937     if (gsep == NULL) return;
5938     gsep->choice = 2;
5939     gsep->data.ptrvalue = (Pointer) bssp;
5940     SeqMgrSeqEntry (SM_BIOSEQSET, (Pointer) bssp, gsep);
5941   }
5942 
5943   if (tbl->seqidfromfile) {
5944     localname = base;
5945   }
5946 
5947   /* find MolInfo from template, if there is any */
5948   sdp = sdphead;
5949   while (sdp != NULL && sdp->choice != Seq_descr_molinfo) {
5950     sdp = sdp->next;
5951   }
5952   if (sdp != NULL) {
5953     template_molinfo = (MolInfoPtr) sdp->data.ptrvalue;
5954   }
5955 
5956   /* read one or more ASN.1 or FASTA sequence files */
5957 
5958   if (tbl->fastaset) {
5959     entityID = ProcessBulkSet (fp, src, tbl, template_molinfo);
5960   } else if (tbl->deltaset) {
5961     entityID = ProcessDeltaSet (fp, src, tbl, localname, gsep, template_molinfo);
5962   } else if (tbl->alignset) {
5963     entityID = ProcessAlignSet (fp, src, tbl, template_molinfo);
5964   } else if (tbl->gapped) {
5965     entityID = ProcessGappedSet (fp, src, tbl, gsep, template_molinfo);
5966   } else if (tbl->phrapace) {
5967     entityID = ProcessPhrapAce (fp, src, tbl, localname, gsep, template_molinfo, directory, base);
5968   } else if (tbl->raw2delt) {
5969     entityID = ProcessRaw2Delt (fp, src, tbl, localname, gsep, template_molinfo);
5970   } else {
5971     entityID = ProcessOneAsn (fp, src, tbl, localname, gsep, template_molinfo);
5972   }
5973   FileClose (fp);
5974 
5975   if (entityID == 0) return;
5976 
5977   sep = GetTopSeqEntryForEntityID (entityID);
5978   if (SeqEntryHasConflictingIDs (sep)) {
5979     return;
5980   }
5981 
5982   if (tbl->dotaxlookup) {
5983     sep = GetTopSeqEntryForEntityID (entityID);
5984     if (sep != NULL) {
5985 
5986       /* optionally do network taxonomy lookup - prior to instantiating mRNA and protein titles */
5987 
5988       Taxon3ReplaceOrgInSeqEntry (sep, FALSE);
5989     }
5990   }
5991 
5992   if (tbl->dopublookup) {
5993     sep = GetTopSeqEntryForEntityID (entityID);
5994     if (sep != NULL) {
5995 
5996       /* optionally do network publication lookup of just PMID references */
5997 
5998       VisitPubdescsInSep (sep, NULL, LookupPubdesc);
5999     }
6000   }
6001 
6002   organism = NULL;
6003   if (tbl->genprodset) {
6004     descr = ExtractBioSourceAndPubs (bssp->seq_set);
6005     for (sdp = descr; sdp != NULL; sdp = sdp->next) {
6006       if (sdp->choice != Seq_descr_source) continue;
6007       biop = (BioSourcePtr) sdp->data.ptrvalue;
6008       if (biop == NULL) continue;
6009       orp = biop->org;
6010       if (orp == NULL) continue;
6011       if (StringDoesHaveText (orp->taxname)) {
6012         organism = orp->taxname;
6013       }
6014     }
6015     ReplaceBioSourceAndPubs (gsep, descr);
6016   }
6017 
6018   /* read one or more feature tables from .tbl file */
6019 
6020   if (StringDoesHaveText (tbl->tableFile)) {
6021     fp = FileOpen (tbl->tableFile, "r");
6022     tblfile = tbl->tableFile;
6023   } else {
6024     fp = OpenOneFile (directory, base, ".tbl");
6025     tblfile = base;
6026     sfx = ".tbl";
6027   }
6028   if (fp != NULL) {
6029 
6030     /* indexing needed to find segmented bsp if location is on part */
6031 
6032     sep = GetTopSeqEntryForEntityID (entityID);
6033 
6034     SeqMgrIndexFeatures (entityID, NULL);
6035 
6036     while ((! failure) && (dataptr = ReadFeatureTableFile (fp, &datatype, NULL, &linenum, &failure)) != NULL) {
6037       if (datatype == OBJ_SEQANNOT) {
6038 
6039         sap = (SeqAnnotPtr) dataptr;
6040         ProcessOneAnnot (sap, entityID, tbl);
6041 
6042       } else {
6043         ObjMgrFree (datatype, dataptr);
6044       }
6045     }
6046     FileClose (fp);
6047     sep = GetTopSeqEntryForEntityID (entityID);
6048 
6049 
6050     if (failure) {
6051       if (StringHasNoText (tblfile)) {
6052         tblfile = "?";
6053       }
6054       ptr = StringRChr (tblfile, DIRDELIMCHR);
6055       if (ptr != NULL) {
6056         ptr++;
6057         tblfile = ptr;
6058       }
6059       Message (MSG_POSTERR, "Bad feature table at line %ld of file %s%s", (long) linenum, tblfile, sfx);
6060     }
6061   }
6062 
6063   /* if genomic product set, copy CDS into nucprot sets */
6064 
6065   if (tbl->genprodset) {
6066     /* need to reindex to get mRNA and CDS features from cDNA and protein */
6067     SeqMgrIndexFeatures (entityID, NULL);
6068     VisitSetsInSet (bssp, (Pointer) tbl, MakeNucProtCDS);
6069   }
6070 
6071   /* read source qualifiers for set of sequences from .src file */
6072 
6073   fp = OpenOneFile (directory, base, ".src");
6074   if (fp != NULL) {
6075 
6076     ProcessSourceTable (fp);
6077 
6078     FileClose (fp);
6079   }
6080 
6081   /* read structured comments from .cmt file */
6082   fp = OpenOneFile (directory, base, ".cmt");
6083   if (fp != NULL) {
6084     sep = GetTopSeqEntryForEntityID (entityID);
6085     cmt_errors = CreateStructuredCommentsFromFile (fp, sep);
6086     FileClose (fp);
6087     if (cmt_errors != NULL) {
6088       for (vnp = cmt_errors; vnp != NULL; vnp = vnp->next) {
6089         Message (MSG_POSTERR, "Error processing structured comment (.cmt) file: %s", vnp->data.ptrvalue);
6090       }
6091       cmt_errors = ValNodeFreeData (cmt_errors);
6092     }
6093   }
6094 
6095   /* read one or more protein sequences from .pep file */
6096 
6097   fp = OpenOneFile (directory, base, ".pep");
6098   if (fp != NULL) {
6099 
6100     /* indexing needed to find CDS from protein product to set conflict flag */
6101 
6102     SeqMgrIndexFeatures (entityID, NULL);
6103 
6104     while ((dataptr = ReadAsnFastaOrFlatFile (fp, &datatype, NULL, FALSE, TRUE, TRUE, TRUE)) != NULL) {
6105       if (datatype == OBJ_FASTA) {
6106 
6107         ssp = (SimpleSeqPtr) dataptr;
6108         ReplaceOnePeptide (ssp, tbl->conflict, tbl->genprodset);
6109         SimpleSeqFree (ssp);
6110 
6111       } else {
6112         ObjMgrFree (datatype, dataptr);
6113       }
6114     }
6115     FileClose (fp);
6116   }
6117 
6118   /* read one or more RNA sequences from .rna file */
6119 
6120   fp = OpenOneFile (directory, base, ".rna");
6121   if (fp != NULL) {
6122 
6123     /* indexing needed to find mRNA from transcript product to set RNA editing exception */
6124 
6125     SeqMgrIndexFeatures (entityID, NULL);
6126 
6127     while ((dataptr = ReadAsnFastaOrFlatFile (fp, &datatype, NULL, FALSE, TRUE, TRUE, TRUE)) != NULL) {
6128       if (datatype == OBJ_FASTA) {
6129 
6130         ssp = (SimpleSeqPtr) dataptr;
6131         ReplaceOneRNA (ssp, tbl->conflict);
6132         SimpleSeqFree (ssp);
6133 
6134       } else {
6135         ObjMgrFree (datatype, dataptr);
6136       }
6137     }
6138     FileClose (fp);
6139   }
6140 
6141   /* read one or more protein sequences from .prt file */
6142 
6143   fp = OpenOneFile (directory, base, ".prt");
6144   if (fp != NULL) {
6145 
6146     SeqMgrIndexFeatures (entityID, NULL);
6147 
6148     sep = GetTopSeqEntryForEntityID (entityID);
6149     nucbsp = FindNucBioseq (sep);
6150     if (nucbsp != NULL) {
6151       BioseqToGeneticCode (nucbsp, &genCode, NULL, NULL, NULL, 0, NULL);
6152       SetBatchSuggestNucleotide (nucbsp, genCode);
6153 
6154       descr = ExtractBioSourceAndPubs (sep);
6155 
6156       while ((dataptr = ReadAsnFastaOrFlatFile (fp, &datatype, NULL, FALSE, TRUE, TRUE, FALSE)) != NULL) {
6157         if (datatype == OBJ_BIOSEQ) {
6158 
6159           protbsp = (BioseqPtr) dataptr;
6160           protsep = SeqMgrGetSeqEntryForData (protbsp);
6161           mip = MolInfoNew ();
6162           if (mip != NULL) {
6163             mip->biomol = 8;
6164             mip->tech = 13;
6165             sdp = CreateNewDescriptor (protsep, Seq_descr_molinfo);
6166             if (sdp != NULL) {
6167               sdp->data.ptrvalue = (Pointer) mip;
6168             }
6169           }
6170           AddSeqEntryToSeqEntry (sep, protsep, TRUE);
6171           SuggestOnePeptide (nucbsp, protbsp, genCode);
6172 
6173         } else {
6174           ObjMgrFree (datatype, dataptr);
6175         }
6176       }
6177 
6178       ClearBatchSuggestNucleotide ();
6179 
6180       ReplaceBioSourceAndPubs (sep, descr);
6181     }
6182     FileClose (fp);
6183 
6184     SeqMgrIndexFeatures (entityID, NULL);
6185   }
6186 
6187   /* read one or more quality score blocks from .qvl file */
6188 
6189   fp = OpenOneFile (directory, base, ".qvl");
6190   if (fp != NULL) {
6191 
6192     FileCacheSetup (&fc, fp);
6193 
6194     goOn = TRUE;
6195     while (goOn) {
6196       str = FileCacheReadLine (&fc, buf, sizeof (buf), &nonewline);
6197       if (str == NULL) {
6198         goOn = FALSE;
6199       } else if (StringDoesHaveText (str)) {
6200         if (str [0] == '>') {
6201           ptr = StringChr (str, ' ');
6202           if (ptr == NULL) {
6203             ptr = StringChr (str, '\t');
6204           }
6205           if (ptr != NULL) {
6206             *ptr = '\0';
6207           }
6208           sip = MakeSeqID (str + 1);
6209           bsp = BioseqFind (sip);
6210           if (bsp != NULL) {
6211             sgp = ReadPhrapQualityFC (&fc, bsp);
6212             if (sgp != NULL) {
6213               for (sap = bsp->annot; sap != NULL; sap = sap->next) {
6214                 if (sap->type == 3) {
6215                   for (lastsgp = sap->data; lastsgp->next != NULL; lastsgp = lastsgp->next) {
6216                     continue;
6217                   }
6218                   lastsgp->next = sgp;
6219                   break;
6220                 }
6221               }
6222               if (sap == NULL) {
6223                 if (bsp->annot != NULL) {
6224                   for (sap = bsp->annot; sap->next != NULL; sap = sap->next) {
6225                     continue;
6226                   }
6227                   sap->next = NewGraphSeqAnnot ("Phrap Graph", sgp);
6228                 } else {
6229                   bsp->annot = NewGraphSeqAnnot ("Phrap Graph", sgp);
6230                 }
6231               }
6232             }
6233           }
6234           SeqIdFree (sip);
6235         }
6236       }
6237     }
6238     FileClose (fp);
6239   }
6240 
6241   /* finish processing */
6242 
6243   if (sbp == NULL) {
6244     omdp = ObjMgrGetData (entityID);
6245     if (omdp != NULL && omdp->datatype == OBJ_SEQSUB) {
6246 
6247       /* if read a Seq-submit, write out a Seq-submit */
6248 
6249       sub = (SeqSubmitPtr) omdp->dataptr;
6250       if (sub != NULL && sub->datatype == 1) {
6251         sbp = sub->sub;
6252       }
6253     }
6254   }
6255 
6256   sep = GetTopSeqEntryForEntityID (entityID);
6257   if (sep != NULL) {
6258 
6259     if (tbl->gnltonote) {
6260       VisitFeaturesInSep (sep, NULL, GeneralToNote);
6261     }
6262 
6263     if (tbl->gpstonps) {
6264       GPStoNPS (sep, entityID);
6265       sep = GetTopSeqEntryForEntityID (entityID);
6266     }
6267 
6268     if (! tbl->genprodset) {
6269       VisitFeaturesInSep (sep, NULL, RemoveGBQualIDs);
6270     }
6271     if (sdphead != NULL) {
6272       if (IS_Bioseq (sep)) {
6273         bsp = (BioseqPtr) sep->data.ptrvalue;
6274         AddTemplateDescriptors (&(bsp->descr), sdphead, TRUE);
6275       } else if (IS_Bioseq_set (sep)) {
6276         dssp = (BioseqSetPtr) sep->data.ptrvalue;
6277         AddTemplateDescriptors (&(dssp->descr), sdphead, TRUE);
6278       }
6279     }
6280     dp = DateCurr ();
6281     if (dp != NULL) {
6282       has_create_date = FALSE;
6283       VisitDescriptorsInSep (sep, (Pointer) &has_create_date, FindCreateDate);
6284       if (has_create_date) {
6285         sdp = CreateNewDescriptor (sep, Seq_descr_update_date);
6286       } else {
6287         sdp = CreateNewDescriptor (sep, Seq_descr_create_date);
6288       }
6289       if (sdp != NULL) {
6290         sdp->data.ptrvalue = (Pointer) dp;
6291       }
6292     }
6293 
6294     /* read one or more descriptors from .dsc file */
6295 
6296     fp = OpenOneFile (directory, base, ".dsc");
6297     if (fp != NULL) {
6298 
6299       while ((dataptr = ReadAsnFastaOrFlatFile (fp, &datatype, NULL, FALSE, TRUE, TRUE, TRUE)) != NULL) {
6300         if (datatype == OBJ_SEQDESC) {
6301 
6302           if (IS_Bioseq (sep)) {
6303             bsp = (BioseqPtr) sep->data.ptrvalue;
6304             AddTemplateDescriptors (&(bsp->descr), (SeqDescrPtr) dataptr, FALSE);
6305           } else if (IS_Bioseq_set (sep)) {
6306             dssp = (BioseqSetPtr) sep->data.ptrvalue;
6307             AddTemplateDescriptors (&(dssp->descr), (SeqDescrPtr) dataptr, FALSE);
6308           }
6309 
6310         } else {
6311           ObjMgrFree (datatype, dataptr);
6312         }
6313       }
6314       FileClose (fp);
6315     }
6316 
6317     msev = ErrSetMessageLevel (SEV_MAX);
6318     move_cds (sep);
6319 
6320     /* if reading nucleotide and protein tables, remove duplicate prot feat */
6321     VisitBioseqsInSep (sep, NULL, RemoveDupProtFeats);
6322     DeleteMarkedObjects (entityID, 0, NULL);
6323 
6324     /* need to reindex before extending CDS to stop codon */
6325     SeqMgrIndexFeatures (entityID, NULL);
6326     CdCheck (sep, NULL);
6327 
6328     /* need to reindex before copying genes, instantiating protein titles */
6329     SeqMgrIndexFeatures (entityID, NULL);
6330     EntryChangeImpFeat (sep);
6331 
6332     /* find locus for any gene xrefs that only have locus_tag */
6333     VisitFeaturesInSep (sep, NULL, FillInPartialGeneXref);
6334 
6335     if (tbl->removeunnecxref) {
6336       /* if not removed, xref will prevent locus, maploc, dbxref from being copied */
6337       VisitFeaturesInSep (sep, NULL, RemoveUnnecGeneXref);
6338     }
6339 
6340     if (tbl->genprodset) {
6341       VisitFeaturesInSep (sep, NULL, CopyGene);
6342     }
6343     if (tbl->genprodset) {
6344       /* currently copying ncRNA feature onto product */
6345       VisitFeaturesInSep (sep, NULL, CopyNcRna);
6346     }
6347     if (! tbl->genprodset) {
6348     VisitFeaturesInSep (sep, NULL, ClearRnaProducts);
6349     }
6350 
6351     if (tbl->removeunnecxref) {
6352       /* need to reindex before removing unnecessary gene xrefs in nuc-prot sets */
6353       SeqMgrIndexFeatures (entityID, NULL);
6354 
6355       VisitFeaturesInSep (sep, NULL, RemoveUnnecGeneXref);
6356     }
6357 
6358     if (! tbl->relaxed) {
6359       list = GetValidCountryList ();
6360       VisitBioSourcesInSep (sep, (Pointer) list, CleanUpLatLonAndCountry);
6361     }
6362 
6363     /* need to reindex so hypothetical protein titles pick up locus_tag */
6364     SeqMgrIndexFeatures (entityID, NULL);
6365     InstantiateProteinTitles (entityID, NULL);
6366 
6367     if (tbl->genprodset) {
6368       /* need to reindex before instantiating mRNA titles */
6369       SeqMgrIndexFeatures (entityID, NULL);
6370       bsp = FindNucBioseq (sep);
6371 
6372       if (tbl->smarttitle) {
6373         MakeSmartRnaTitles (bsp, organism);
6374       } else {
6375         sfp = SeqMgrGetNextFeature (bsp, NULL, SEQFEAT_RNA, 0, &context);
6376         while (sfp != NULL) {
6377           AddRnaTitles (sfp, organism);
6378           sfp = SeqMgrGetNextFeature (bsp, sfp, SEQFEAT_RNA, 0, &context);
6379         }
6380       }
6381     }
6382 
6383     if (StringDoesHaveText (tbl->center)) {
6384       VisitBioseqsInSep (sep, tbl->center, MakeGenomeCenterID);
6385     }
6386 
6387     if (StringDoesHaveText (tbl->accn)) {
6388       bsp = FindNucBioseq (sep);
6389       MakeAccessionID (bsp, tbl->accn);
6390     }
6391 
6392     VisitDescriptorsInSep (sep, NULL, ConvertStructuredComment);
6393 
6394     SeqMgrClearFeatureIndexes (entityID, NULL);
6395     BasicSeqEntryCleanup (sep);
6396     ErrSetMessageLevel (msev);
6397     /*
6398     SeriousSeqEntryCleanup (sep, NULL, NULL);
6399     */
6400     ConvertFullLenSourceFeatToDesc (sep);
6401     ConvertFullLenPubFeatToDesc (sep);
6402     if (tbl->linkbyoverlap) {
6403       SeqMgrIndexFeatures (entityID, NULL);
6404       LinkCDSmRNAbyOverlap (sep);
6405     } else if (tbl->linkbyproduct) {
6406       SeqMgrIndexFeatures (entityID, NULL);
6407       LinkCDSmRNAbyProduct (sep);
6408     }
6409 
6410     DoTbl2AsnCleanup (sep, &(tbl->cleanup_args));
6411     NormalizeDescriptorOrder (sep);
6412 
6413     if (StringHasNoText (results)) {
6414       results = directory;
6415     }
6416 
6417     if (aip != NULL) {
6418       atp_bssse = AsnFind ("Bioseq-set.seq-set.E");
6419       if (atp_bssse == NULL) {
6420         Message (MSG_POSTERR, "Unable to find ASN.1 type Bioseq-set.seq-set.E");
6421       } else if (tbl->fastaset && tbl->whichclass == 0) {
6422         /* already has genbank wrapper, write individual components */
6423         tmp = PropagateDescsFromGenBankSet (sep);
6424         SeqMgrClearFeatureIndexes (entityID, NULL);
6425         while (tmp != NULL) {
6426           SeqEntryAsnWrite (tmp, aip, atp_bssse);
6427           tmp = tmp->next;
6428         }
6429       } else {
6430         SeqEntryAsnWrite (sep, aip, atp_bssse);
6431       }
6432     } else {
6433       if (tbl->fastaset && tbl->whichclass == 0) {
6434         PropagateDescsFromGenBankSet (sep);
6435         SeqMgrClearFeatureIndexes (entityID, NULL);
6436       }
6437       WriteOneFile (results, base, ".sqn", outfile, sep, sbp, tbl->save_bioseq_set);
6438     }
6439 
6440     if (HasGoTermsInNote (sep, gotags)) {
6441       Message (MSG_OK, "Illegal GO term format detected in note - contact database for instructions");
6442     }
6443 
6444     if (tbl->global_report != NULL) {
6445       AddSeqEntryToGlobalDiscrepReport (sep, tbl->global_report, base);
6446     }
6447 
6448     if (tbl->validate || tbl->flatfile || tbl->genereport || tbl->validate_barcode) {
6449       if (pdp != NULL) {
6450 
6451         /* copy in citsub as publication for validator and flatfile */
6452 
6453         sdp = CreateNewDescriptor (sep, Seq_descr_pub);
6454         if (sdp != NULL) {
6455           sdp->data.ptrvalue = AsnIoMemCopy ((Pointer) pdp,
6456                                              (AsnReadFunc) PubdescAsnRead,
6457                                              (AsnWriteFunc) PubdescAsnWrite);
6458         }
6459       }
6460       SeqMgrIndexFeatures (entityID, 0);
6461       if (tbl->flatfile) {
6462         Message (MSG_POST, "Flatfile %s\n", base);
6463         FlatfileOneFile (results, base, ".gbf", sep);
6464       }
6465       if (tbl->validate || tbl->validate_barcode) {
6466         Message (MSG_POST, "Validating %s\n", base);
6467         ValidateOneFile (results, base, ".val", sep, tbl->validate, tbl->relaxed, tbl->validate_barcode);
6468       }
6469       if (tbl->genereport) {
6470         GeneReportOneFile (results, base, ".t2g", sep);
6471       }
6472     }
6473   }
6474 
6475   ObjMgrFreeByEntityID (entityID);
6476 }
6477 
6478 
6479 
/* Question text warning that a template whose name has a .sqn suffix will be overwritten.
   NOTE(review): presumably shown when TemplateOverwriteRisk returns TRUE - confirm at the call site. */
static CharPtr overwriteMsg = "Your template with a .sqn suffix will be overwritten.  Do you wish to continue?";
6481 
6482 static Boolean TemplateOverwriteRisk (
6483   CharPtr filename,
6484   CharPtr single,
6485   CharPtr directory,
6486   CharPtr suffix
6487 )
6488 
6489 {
6490   Char     file [FILENAME_MAX], path [PATH_MAX];
6491   CharPtr  ptr;
6492 
6493 
6494   if (StringStr (filename, ".sqn") == NULL) return FALSE;
6495   if (StringDoesHaveText (single)) {
6496     StringNCpy_0 (file, filename, sizeof (file));
6497     ptr = StringStr (file, ".");
6498     if (ptr != NULL) {
6499       *ptr = '\0';
6500     }
6501     ptr = StringStr (single, ".");
6502     if (ptr != NULL) {
6503       StringCat (file, ptr);
6504     }
6505     if (StringCmp (file, single) == 0) return TRUE;
6506   } else if (StringDoesHaveText (directory)) {
6507     StringNCpy_0 (path, directory, sizeof (path));
6508     StringNCpy_0 (file, filename, sizeof (file));
6509     ptr = StringStr (file, ".");
6510     if (ptr != NULL) {
6511       *ptr = '\0';
6512     }
6513     StringCat (file, suffix);
6514     FileBuildPath (path, NULL, file);
6515     if (FileLength (path) > 0) return TRUE;
6516   }
6517   return FALSE;
6518 }
6519 
6520 static void FileRecurse (
6521   SubmitBlockPtr sbp,
6522   PubdescPtr pdp,
6523   BioSourcePtr src,
6524   CharPtr directory,
6525   CharPtr results,
6526   CharPtr suffix,
6527   Boolean recurse,
6528   SeqDescrPtr sdphead,
6529   TblArgsPtr tbl,
6530   TextFsaPtr gotags,
6531   AsnIoPtr aip,
6532   CharPtr outfile
6533 )
6534 
6535 {
6536   Char        path [PATH_MAX];
6537   CharPtr     ptr;
6538   CharPtr     str;
6539   ValNodePtr  head, vnp;
6540 
6541   /* get list of all files in source directory */
6542 
6543   head = DirCatalog (directory);
6544 
6545   for (vnp = head; vnp != NULL; vnp = vnp->next) {
6546     if (vnp->choice == 0) {
6547       str = (CharPtr) vnp->data.ptrvalue;
6548       if (StringDoesHaveText (str)) {
6549 
6550         /* does filename have desired substring? */
6551 
6552         ptr = StringStr (str, suffix);
6553 
6554         if (ptr != NULL) {
6555 
6556           /* make sure detected suffix is really at end of filename */
6557 
6558           if (StringCmp (ptr, suffix) == 0) {
6559             *ptr = '\0';
6560 
6561             /* process file that has desired suffix (usually .fsa) */
6562 
6563             ProcessOneRecord (sbp, pdp, src, directory, results, str, suffix, sdphead, tbl, gotags, aip, outfile);
6564           }
6565         }
6566       }
6567     } else if (vnp->choice == 1 && recurse) {
6568 
6569       /* recurse into subdirectory */
6570 
6571       StringNCpy_0 (path, directory, sizeof (path));
6572       str = (CharPtr) vnp->data.ptrvalue;
6573       FileBuildPath (path, str, NULL);
6574       FileRecurse (sbp, pdp, src, path, results, suffix, recurse, sdphead, tbl, gotags, aip, outfile);
6575     }
6576   }
6577 
6578   /* clean up file list */
6579 
6580   ValNodeFreeData (head);
6581 }
6582 
6583 static AsnTypePtr DoFirstPrefix (
6584   AsnIoPtr aip,
6585   SubmitBlockPtr sbp
6586 )
6587 
6588 {
6589   AsnTypePtr  atp_se, atp_ses, atp_ss, atp_ssd, atp_ssde, atp_ssdee, atp_sss, sep_atp, ssp_atp;
6590   DataVal     av;
6591   SeqEntry    se;
6592   SeqSubmit   ss;
6593 
6594   if (aip == NULL || sbp == NULL) return NULL;
6595 
6596   atp_ss = AsnFind ("Seq-submit");
6597   if (atp_ss == NULL) {
6598     Message (MSG_POSTERR, "Unable to find ASN.1 type Seq-submit");
6599     return NULL;
6600   }
6601 
6602   atp_sss = AsnFind ("Seq-submit.sub");
6603   if (atp_sss == NULL) {
6604     Message (MSG_POSTERR, "Unable to find ASN.1 type Seq-submit.sub");
6605     return NULL;
6606   }
6607 
6608   atp_ssd = AsnFind ("Seq-submit.data");
6609   if (atp_ssd == NULL) {
6610     Message (MSG_POSTERR, "Unable to find ASN.1 type Seq-submit.data");
6611     return NULL;
6612   }
6613 
6614   atp_ssde = AsnFind ("Seq-submit.data.entrys");
6615   if (atp_ssde == NULL) {
6616     Message (MSG_POSTERR, "Unable to find ASN.1 type Seq-submit.data.entrys");
6617     return NULL;
6618   }
6619 
6620   atp_se = AsnFind ("Seq-entry");
6621   if (atp_se == NULL) {
6622     Message (MSG_POSTERR, "Unable to find ASN.1 type Seq-entry");
6623     return NULL;
6624   }
6625 
6626   atp_ses = AsnFind ("Seq-entry.set");
6627   if (atp_ses == NULL) {
6628     Message (MSG_POSTERR, "Unable to find ASN.1 type Seq-entry.set");
6629     return NULL;
6630   }
6631 
6632   atp_ssdee = AsnFind ("Seq-submit.data.entrys.E");
6633   if (atp_ssdee == NULL) {
6634     Message (MSG_POSTERR, "Unable to find ASN.1 type Seq-submit.data.entrys.E");
6635     return NULL;
6636   }
6637 
6638 
6639   ssp_atp = AsnLinkType (NULL, atp_ss);
6640   if (ssp_atp == NULL) return NULL;
6641 
6642   MemSet ((Pointer) &ss, 0, sizeof (SeqSubmit));
6643   MemSet ((Pointer) &se, 0, sizeof (SeqEntry));
6644   se.choice = 2;
6645 
6646   if (! AsnOpenStruct (aip, ssp_atp, (Pointer) &ss)) return NULL;
6647 
6648   if (! SubmitBlockAsnWrite (sbp, aip, atp_sss)) return NULL;
6649 
6650   av.ptrvalue = (Pointer) &se;
6651   if (! AsnWriteChoice (aip, atp_ssd, (Int2) 1, &av)) return NULL;
6652 
6653   if (! AsnOpenStruct (aip, atp_ssde, (Pointer) &se)) return NULL;
6654 
6655   sep_atp = AsnLinkType (atp_ssdee, atp_se);
6656   if (sep_atp == NULL) return NULL;
6657 
6658   av.ptrvalue = (Pointer) &se;
6659   se.choice = 2;
6660   if (! AsnWriteChoice (aip, sep_atp, (Int2) 2, &av)) return NULL;
6661 
6662   return ssp_atp;
6663 }
6664 
6665 static AsnTypePtr DoSecondPrefix (
6666   AsnIoPtr aip,
6667   TblArgsPtr tbl
6668 )
6669 
6670 {
6671   AsnTypePtr  atp_bsc, atp_bss, atp_bsss, atp_ses, bssp_atp;
6672   DataVal     av;
6673   BioseqSet   bs;
6674 
6675   atp_ses = AsnFind ("Seq-entry.set");
6676   if (atp_ses == NULL) {
6677     Message (MSG_POSTERR, "Unable to find ASN.1 type Seq-entry.set");
6678     return NULL;
6679   }
6680 
6681   atp_bss = AsnFind ("Bioseq-set");
6682   if (atp_bss == NULL) {
6683     Message (MSG_POSTERR, "Unable to find ASN.1 type Bioseq-set");
6684     return NULL;
6685   }
6686 
6687   atp_bsc = AsnFind ("Bioseq-set.class");
6688   if (atp_bsc == NULL) {
6689     Message (MSG_POSTERR, "Unable to find ASN.1 type Bioseq-set.class");
6690     return NULL;
6691   }
6692 
6693   atp_bsss = AsnFind ("Bioseq-set.seq-set");
6694   if (atp_bsss == NULL) {
6695     Message (MSG_POSTERR, "Unable to find ASN.1 type Bioseq-set.seq-set");
6696     return NULL;
6697   }
6698 
6699 
6700   bssp_atp = AsnLinkType (atp_ses, atp_bss);
6701   if (bssp_atp == NULL) return NULL;
6702 
6703   MemSet ((Pointer) &bs, 0, sizeof (BioseqSet));
6704 
6705   if (! AsnOpenStruct (aip, bssp_atp, (Pointer) &bs)) return NULL;
6706 
6707   switch (tbl->whichclass) {
6708     case 1 :
6709       av.intvalue = BioseqseqSet_class_pop_set;
6710       break;
6711     case 2 :
6712       av.intvalue = BioseqseqSet_class_phy_set;
6713       break;
6714     case 3 :
6715       av.intvalue = BioseqseqSet_class_mut_set;
6716       break;
6717     case 4 :
6718       av.intvalue = BioseqseqSet_class_eco_set;
6719       break;
6720     default :
6721       av.intvalue = BioseqseqSet_class_genbank;
6722       break;
6723   }
6724   if (! AsnWrite (aip, atp_bsc, &av)) return NULL;
6725 
6726   if (! AsnOpenStruct (aip, atp_bsss, (Pointer) &bs.seq_set)) return NULL;
6727 
6728   return bssp_atp;
6729 }
6730 
6731 static Boolean DoFirstSuffix (
6732   AsnIoPtr aip,
6733   AsnTypePtr ssp_atp
6734 )
6735 
6736 {
6737   AsnTypePtr  atp_bsss, atp_ssde, atp_ssdee;
6738   BioseqSet   bs;
6739   SeqEntry    se;
6740   SeqSubmit   ss;
6741 
6742   if (aip == NULL || ssp_atp == NULL) return FALSE;
6743 
6744   atp_ssde = AsnFind ("Seq-submit.data.entrys");
6745   if (atp_ssde == NULL) {
6746     Message (MSG_POSTERR, "Unable to find ASN.1 type Seq-submit.data.entrys");
6747     return FALSE;
6748   }
6749 
6750   atp_ssdee = AsnFind ("Seq-submit.data.entrys.E");
6751   if (atp_ssdee == NULL) {
6752     Message (MSG_POSTERR, "Unable to find ASN.1 type Seq-submit.data.entrys.E");
6753     return FALSE;
6754   }
6755 
6756   atp_bsss = AsnFind ("Bioseq-set.seq-set");
6757   if (atp_bsss == NULL) {
6758     Message (MSG_POSTERR, "Unable to find ASN.1 type Bioseq-set.seq-set");
6759     return FALSE;
6760   }
6761 
6762 
6763   MemSet ((Pointer) &ss, 0, sizeof (SeqSubmit));
6764   MemSet ((Pointer) &se, 0, sizeof (SeqEntry));
6765   MemSet ((Pointer) &bs, 0, sizeof (BioseqSet));
6766 
6767   if (! AsnCloseStruct (aip, atp_ssde, &se)) return FALSE;
6768 
6769   if (! AsnCloseStruct (aip, ssp_atp, (Pointer) &ss)) return FALSE;
6770 
6771   AsnUnlinkType (atp_ssdee);
6772 
6773   return TRUE;
6774 }
6775 
6776 static Boolean DoSecondSuffix (
6777   AsnIoPtr aip,
6778   AsnTypePtr bssp_atp
6779 )
6780 
6781 {
6782   AsnTypePtr  atp_bsss, atp_ses;
6783   BioseqSet   bs;
6784 
6785    if (aip == NULL || bssp_atp == NULL) return FALSE;
6786 
6787   atp_ses = AsnFind ("Seq-entry.set");
6788   if (atp_ses == NULL) {
6789     Message (MSG_POSTERR, "Unable to find ASN.1 type Seq-entry.set");
6790     return FALSE;
6791   }
6792 
6793   atp_bsss = AsnFind ("Bioseq-set.seq-set");
6794   if (atp_bsss == NULL) {
6795     Message (MSG_POSTERR, "Unable to find ASN.1 type Bioseq-set.seq-set");
6796     return FALSE;
6797   }
6798 
6799 
6800   MemSet ((Pointer) &bs, 0, sizeof (BioseqSet));
6801 
6802   if (! AsnCloseStruct(aip, atp_bsss, (Pointer) &bs.seq_set)) return FALSE;
6803 
6804   if (! AsnCloseStruct (aip, bssp_atp, (Pointer) &bs)) return FALSE;
6805 
6806   AsnUnlinkType (atp_ses);
6807 
6808   return TRUE;
6809 }
6810 
6811 static CharPtr ReadCommentFile (
6812   CharPtr filename
6813 )
6814 
6815 {
6816   FileCache   fc;
6817   FILE        *fp;
6818   ValNodePtr  head = NULL, last = NULL, vnp;
6819   Int4        len;
6820   Char        line [4096];
6821   Boolean     nonewline, notfirst;
6822   CharPtr     ptr, str, tmp;
6823 
6824   if (StringHasNoText (filename)) return NULL;
6825   fp = FileOpen (filename, "r");
6826   if (fp == NULL) return NULL;
6827 
6828   FileCacheSetup (&fc, fp);
6829 
6830   str = FileCacheReadLine (&fc, line, sizeof (line), &nonewline);
6831   while (str != NULL) {
6832     vnp = ValNodeCopyStr (&last, 0, str);
6833     if (head == NULL) {
6834       head = vnp;
6835     }
6836     last = vnp;
6837 
6838     str = FileCacheReadLine (&fc, line, sizeof (line), &nonewline);
6839   }
6840 
6841   FileClose (fp);
6842 
6843   if (head == NULL) return NULL;
6844 
6845   len = 0;
6846   for (vnp = head; vnp != NULL; vnp = vnp->next) {
6847     str = (CharPtr) vnp->data.ptrvalue;
6848     len += StringLen (str) + 1;
6849   }
6850 
6851   tmp = (CharPtr) MemNew (sizeof (Char) * (len + 5));
6852   if (tmp == NULL) return NULL;
6853 
6854   ptr = tmp;
6855   notfirst = FALSE;
6856   for (vnp = head; vnp != NULL; vnp = vnp->next) {
6857     str = (CharPtr) vnp->data.ptrvalue;
6858     if (str == NULL) continue;
6859     if (*str == '\0' || *str == ' ') {
6860       ptr = StringMove (ptr, "~");
6861     } else if (notfirst) {
6862       ptr = StringMove (ptr, " ");
6863     }
6864     ptr = StringMove (ptr, str);
6865     notfirst = TRUE;
6866   }
6867 
6868   ValNodeFreeData (head);
6869 
6870   return tmp;
6871 }
6872 
6873 static CharPtr ParseCommaField (
6874   CharPtr PNTR strP
6875 )
6876 
6877 {
6878   CharPtr  ptr;
6879   CharPtr  str;
6880 
6881   if (strP == NULL) return NULL;
6882 
6883   str = *strP;
6884   if (StringHasNoText (str)) {
6885     *strP = NULL;
6886     return NULL;
6887   }
6888 
6889   ptr = StringChr (str, ',');
6890   if (ptr == NULL) {
6891     *strP = NULL;
6892     return str;
6893   }
6894 
6895   *ptr = '\0';
6896   ptr++;
6897   if (StringHasNoText (ptr)) {
6898     ptr = NULL;
6899   }
6900   *strP = ptr;
6901 
6902   if (StringHasNoText (str)) {
6903     str = NULL;
6904   }
6905   return str;
6906 }
6907 
6908 static DatePtr DateParse (
6909   CharPtr str
6910 )
6911 
6912 {
6913   Int4      day = -1, month = -1, year = -1;
6914   DatePtr   dp;
6915   CharPtr   ptr;
6916   Char      tmp [64];
6917   long int  val;
6918 
6919   if (StringHasNoText (str)) return NULL;
6920 
6921   StringNCpy_0 (tmp, str, sizeof (tmp));
6922   ptr = StringChr (tmp, '/');
6923   if (ptr == NULL) {
6924     ptr = StringChr (tmp, '-');
6925   }
6926   if (ptr != NULL) {
6927     *ptr = '\0';
6928     ptr++;
6929     if (sscanf (tmp, "%ld", &val) == 1) {
6930       month = (Int4) val;
6931     }
6932     str = StringChr (ptr, '/');
6933     if (str == NULL) {
6934       str = StringChr (ptr, '-');
6935     }
6936     if (str != NULL) {
6937       *str = '\0';
6938       str++;
6939       if (sscanf (ptr, "%ld", &val) == 1) {
6940         day = (Int4) val;
6941       }
6942       if (sscanf (str, "%ld", &val) == 1) {
6943         year = (Int4) val;
6944      }
6945     }
6946   }
6947 
6948   if (month < 0 || day < 0 || year < 2000) return NULL;
6949   if (month > 12 || day > 31 || year > 2099) return NULL;
6950 
6951   dp = DateNew ();
6952   if (dp == NULL) return NULL;
6953 
6954   dp->data [0] = 1;
6955   dp->data [1] = (Uint1) (year - 1900);
6956   dp->data [2] = (Uint1) month;
6957   dp->data [3] = (Uint1) day;
6958 
6959   return dp;
6960 }
6961 
/* Args structure contains command-line arguments */

/* Each constant below is the index into the myargs array of the option whose
   flag letter starts the constant's name (p_argInputPath is the -p entry).
   The values must stay in step with the order of the entries in myargs. */

#define p_argInputPath         0
#define r_argOutputPath        1
#define i_argInputFile         2
#define o_argOutputFile        3
#define x_argSuffix            4
#define E_argRecurse           5
#define t_argTemplate          6
#define a_argType              7
#define s_argFastaSet          8
#define g_argGenProdSet        9
#define F_argFeatIdLinks      10
#define A_argAccession        11
#define C_argCenter           12
#define n_argOrgName          13
#define j_argSrcQuals         14
#define y_argComment          15
#define Y_argCommentFile      16
#define D_argDescrsFile       17
#define f_argTableFile        18
#define k_argCdsFlags         19
#define V_argVerify           20
#define v_argValidate         21
#define b_argGenBank          22
#define q_argFileID           23
#define u_argUndoGPS          24
#define h_argGnlToNote        25
#define G_argGapFields        26
#define R_argRemote           27
#define S_argSmartFeats       28
#define Q_argSmartTitle       29
#define U_argUnnecXref        30
#define L_argLocalID          31
#define T_argTaxLookup        32
#define P_argPubLookup        33
#define W_argLogProgress      34
#define K_argBioseqSet        35
#define H_argHoldUntilPub     36
#define Z_argDiscRepFile      37
#define c_argCleanupOptions   38
7003 
7004 
/* Command-line argument table handed to GetArgs in Main.  Entries are read
   back by position through the *_arg* index constants, so the order here
   must match those constants.  Each entry starts with the option's
   description text; the single character is the option's flag letter.
   NOTE(review): the second string looks like the default value (".fsa" for
   the suffix, "F" for booleans) - confirm against the Args declaration. */

Args myargs [] = {
  {"Path to Files", NULL, NULL, NULL,
    TRUE, 'p', ARG_STRING, 0.0, 0, NULL},
  {"Path for Results", NULL, NULL, NULL,
    TRUE, 'r', ARG_STRING, 0.0, 0, NULL},
  {"Single Input File", NULL, NULL, NULL,
    TRUE, 'i', ARG_FILE_IN, 0.0, 0, NULL},
  {"Single Output File", NULL, NULL, NULL,
    TRUE, 'o', ARG_FILE_OUT, 0.0, 0, NULL},
  {"Suffix", ".fsa", NULL, NULL,
    TRUE, 'x', ARG_STRING, 0.0, 0, NULL},
  {"Recurse", "F", NULL, NULL,
    TRUE, 'E', ARG_BOOLEAN, 0.0, 0, NULL},
  {"Template File", NULL, NULL, NULL,
    TRUE, 't', ARG_FILE_IN, 0.0, 0, NULL},
  {"File Type\n"
   "      a Any\n"
   "      r20u Runs of 20+ Ns are gaps, 100 Ns are unknown length\n"
   "      r20k Runs of 20+ Ns are gaps, 100 Ns are known length\n"
   "      s FASTA Set (s Batch, s1 Pop, s2 Phy, s3 Mut, s4 Eco)\n"
   "      d FASTA Delta, di FASTA Delta with Implicit Gaps\n"
   "      l FASTA+Gap Alignment\n"
   "      z FASTA with Gap Lines\n"
   "      e PHRAP/ACE\n", "a", NULL, NULL,
    TRUE, 'a', ARG_STRING, 0.0, 0, NULL},
  {"Read FASTAs as Set", "F", NULL, NULL,
    TRUE, 's', ARG_BOOLEAN, 0.0, 0, NULL},
  {"Genomic Product Set", "F", NULL, NULL,
    TRUE, 'g', ARG_BOOLEAN, 0.0, 0, NULL},
  {"Feature ID Links (o by Overlap, p by Product)", NULL, NULL, NULL,
    TRUE, 'F', ARG_STRING, 0.0, 0, NULL},
  {"Accession", NULL, NULL, NULL,
    TRUE, 'A', ARG_STRING, 0.0, 0, NULL},
  {"Genome Center Tag", NULL, NULL, NULL,
    TRUE, 'C', ARG_STRING, 0.0, 0, NULL},
  {"Organism Name", NULL, NULL, NULL,
    TRUE, 'n', ARG_STRING, 0.0, 0, NULL},
  {"Source Qualifiers", NULL, NULL, NULL,
    TRUE, 'j', ARG_STRING, 0.0, 0, NULL},
  {"Comment", NULL, NULL, NULL,
    TRUE, 'y', ARG_STRING, 0.0, 0, NULL},
  {"Comment File", NULL, NULL, NULL,
    TRUE, 'Y', ARG_FILE_IN, 0.0, 0, NULL},
  {"Descriptors File", NULL, NULL, NULL,
    TRUE, 'D', ARG_FILE_IN, 0.0, 0, NULL},
  {"Single Table File", NULL, NULL, NULL,
    TRUE, 'f', ARG_FILE_IN, 0.0, 0, NULL},
  {"CDS Flags (combine any of the following letters)\n"
   "      c Annotate Longest ORF\n"
   "      r Allow Runon ORFs\n"
   "      m Allow Alternative Starts\n"
   "      k Set Conflict on Mismatch\n", NULL, NULL, NULL,
    TRUE, 'k', ARG_STRING, 0.0, 0, NULL},
  {"Verification (combine any of the following letters)\n"
   "      v Validate with Normal Stringency\n"
   "      r Validate without Country Check\n"
   "      b Generate GenBank Flatfile\n"
   "      g Generate Gene Report\n", NULL, NULL, NULL,
    TRUE, 'V', ARG_STRING, 0.0, 0, NULL},
  {"Validate (obsolete: use -V v)", "F", NULL, NULL,
    TRUE, 'v', ARG_BOOLEAN, 0.0, 0, NULL},
  {"Generate GenBank File (obsolete: use -V b)", "F", NULL, NULL,
    TRUE, 'b', ARG_BOOLEAN, 0.0, 0, NULL},
  {"Seq ID from File Name", "F", NULL, NULL,
    TRUE, 'q', ARG_BOOLEAN, 0.0, 0, NULL},
  {"GenProdSet to NucProtSet", "F", NULL, NULL,
    TRUE, 'u', ARG_BOOLEAN, 0.0, 0, NULL},
  {"General ID to Note", "F", NULL, NULL,
    TRUE, 'h', ARG_BOOLEAN, 0.0, 0, NULL},
  {"Alignment Gap Flags (comma separated fields, e.g., p,-,-,-,?,. )\n"
   "      n Nucleotide or p Protein,\n"
   "      Begin, Middle, End Gap Characters,\n"
   "      Missing Characters, Match Characters\n",  NULL, NULL, NULL,
    TRUE, 'G', ARG_STRING, 0.0, 0, NULL},
  {"Remote Sequence Record Fetching from ID", "F", NULL, NULL,
    TRUE, 'R', ARG_BOOLEAN, 0.0, 0, NULL},
  {"Smart Feature Annotation", "F", NULL, NULL,
    TRUE, 'S', ARG_BOOLEAN, 0.0, 0, NULL},
  {"Special mRNA Titles", "F", NULL, NULL,
    TRUE, 'Q', ARG_BOOLEAN, 0.0, 0, NULL},
  {"Remove Unnecessary Gene Xref", "F", NULL, NULL,
    TRUE, 'U', ARG_BOOLEAN, 0.0, 0, NULL},
  {"Force Local protein_id/transcript_id", "F", NULL, NULL,
    TRUE, 'L', ARG_BOOLEAN, 0.0, 0, NULL},
  {"Remote Taxonomy Lookup", "F", NULL, NULL,
    TRUE, 'T', ARG_BOOLEAN, 0.0, 0, NULL},
  {"Remote Publication Lookup", "F", NULL, NULL,
    TRUE, 'P', ARG_BOOLEAN, 0.0, 0, NULL},
  {"Log Progress", "F", NULL, NULL,
    TRUE, 'W', ARG_BOOLEAN, 0.0, 0, NULL},
  {"Save Bioseq-set", "F", NULL, NULL,
    TRUE, 'K', ARG_BOOLEAN, 0.0, 0, NULL},
  {"Hold Until Publish\n"
   "      y Hold for One Year\n"
   "      mm/dd/yyyy\n", NULL, NULL, NULL,
    TRUE, 'H', ARG_STRING, 0.0, 0, NULL},
  {"Discrepancy Report Output File", NULL, NULL, NULL,
    TRUE, 'Z', ARG_FILE_OUT, 0.0, 0, NULL},
  {"Cleanup (combine any of the following letters)\n"
   "      d Correct Collection Dates (assume month first)\n"
   "      D Correct Collection Dates (assume day first)\n"
   "      b Append note to coding regions that overlap other coding regions with similar product names and do not contain 'ABC'",
    NULL, NULL, NULL,
    TRUE, 'c', ARG_STRING, 0.0, 0, NULL},
};
7110 
7111 Int2 Main (void)
7112 
7113 {
7114   AsnIoPtr        aip = NULL;
7115   Char            app [64];
7116   CharPtr         base;
7117   AsnTypePtr      bssp_atp = NULL;
7118   CitSubPtr       csp;
7119   Pointer         dataptr;
7120   Uint2           datatype;
7121   CharPtr         descrs;
7122   CharPtr         directory;
7123   DatePtr         dp;
7124   FILE            *fp;
7125   Char            gapstring [128];
7126   TextFsaPtr      gotags;
7127   CharPtr         hold;
7128   CharPtr         os;
7129   CharPtr         outfile;
7130   Pubdesc         pd;
7131   PubdescPtr      pdp = NULL;
7132   ValNode         pb;
7133   CharPtr         ptr;
7134   Boolean         recurse;
7135   Boolean         remote;
7136   CharPtr         results;
7137   SubmitBlockPtr  sbp = NULL;
7138   SeqDescrPtr     sdphead = NULL;
7139   SeqEntryPtr     sep;
7140   Char            sfx [32];
7141   BioSourcePtr    src = NULL;
7142   SeqSubmitPtr    ssp = NULL;
7143   AsnTypePtr      ssp_atp = NULL;
7144   Char            str [64];
7145   CharPtr         suffix;
7146   TblArgs         tbl;
7147   CharPtr         tmp;
7148   CharPtr         tmplate;
7149   CharPtr         disc_rep_file = NULL;
7150 
7151   /* standard setup */
7152 
7153   ErrSetFatalLevel (SEV_MAX);
7154   ErrSetMessageLevel (SEV_MAX);
7155   ErrClearOptFlags (EO_SHOW_USERSTR);
7156   UseLocalAsnloadDataAndErrMsg ();
7157   ErrPathReset ();
7158 
7159   /* finish resolving internal connections in ASN.1 parse tables */
7160 
7161   if (! AllObjLoad ()) {
7162     Message (MSG_FATAL, "AllObjLoad failed");
7163     return 1;
7164   }
7165   if (! SubmitAsnLoad ()) {
7166     Message (MSG_FATAL, "SubmitAsnLoad failed");
7167     return 1;
7168   }
7169   if (! FeatDefSetLoad ()) {
7170     Message (MSG_FATAL, "FeatDefSetLoad failed");
7171     return 1;
7172   }
7173   if (! SeqCodeSetLoad ()) {
7174     Message (MSG_FATAL, "SeqCodeSetLoad failed");
7175     return 1;
7176   }
7177   if (! GeneticCodeTableLoad ()) {
7178     Message (MSG_FATAL, "GeneticCodeTableLoad failed");
7179     return 1;
7180   }
7181 
7182   /* process command line arguments */
7183 
7184   sprintf (app, "tbl2asn %s", TBL2ASN_APPLICATION);
7185   if (! GetArgs (app, sizeof (myargs) / sizeof (Args), myargs)) {
7186     return 0;
7187   }
7188 
7189   directory = (CharPtr) myargs [p_argInputPath].strvalue;
7190   results = (CharPtr) myargs [r_argOutputPath].strvalue;
7191   if (StringHasNoText (results)) {
7192     results = NULL;
7193   }
7194   suffix = (CharPtr) myargs [x_argSuffix].strvalue;
7195   recurse = (Boolean) myargs [E_argRecurse].intvalue;
7196   base = (CharPtr) myargs [i_argInputFile].strvalue;
7197   outfile = (CharPtr) myargs [o_argOutputFile].strvalue;
7198   if (StringHasNoText (outfile)) {
7199     outfile = NULL;
7200   }
7201   tmplate = (CharPtr) myargs [t_argTemplate].strvalue;
7202   descrs = (CharPtr) myargs [D_argDescrsFile].strvalue;
7203 
7204   hold = (CharPtr) myargs [H_argHoldUntilPub].strvalue;
7205 
7206   if (StringHasNoText(directory) && StringHasNoText(base)) {
7207     Message (MSG_FATAL, "You must supply either an input file (-i) or an input directory (-p).\nUse -p . to specify the current directory.\n\n");
7208     return 1;
7209   }
7210   remote = (Boolean) myargs [R_argRemote].intvalue;
7211 
7212   MemSet ((Pointer) &tbl, 0, sizeof (TblArgs));
7213 
7214   /* -s is heavily used and will remain as an alternative to -a s */
7215 
7216   tbl.fastaset = (Boolean) myargs [s_argFastaSet].intvalue;
7217 
7218   /* process new -a type argument */
7219 
7220   ptr = myargs [a_argType].strvalue;
7221   if (StringICmp (ptr, "r20u") == 0) {
7222     tbl.raw2delt = TRUE;
7223     tbl.r2dmin = 20;
7224     tbl.r2dunk100 = TRUE;
7225   } else if (StringICmp (ptr, "r20k") == 0) {
7226     tbl.raw2delt = TRUE;
7227     tbl.r2dmin = 20;
7228     tbl.r2dunk100 = FALSE;
7229   } else if (StringICmp (ptr, "s") == 0) {
7230     tbl.fastaset = TRUE;
7231   } else if (StringICmp (ptr, "w1") == 0 || StringICmp (ptr, "s1") == 0) {
7232     tbl.fastaset = TRUE;
7233     tbl.whichclass = 1;
7234   } else if (StringICmp (ptr, "w2") == 0 || StringICmp (ptr, "s2") == 0) {
7235     tbl.fastaset = TRUE;
7236     tbl.whichclass = 2;
7237   } else if (StringICmp (ptr, "w3") == 0 || StringICmp (ptr, "s3") == 0) {
7238     tbl.fastaset = TRUE;
7239     tbl.whichclass = 3;
7240   } else if (StringICmp (ptr, "w4") == 0 || StringICmp (ptr, "s4") == 0) {
7241     tbl.fastaset = TRUE;
7242     tbl.whichclass = 4;
7243   } else if (StringICmp (ptr, "d") == 0) {
7244     tbl.deltaset = TRUE;
7245   } else if (StringICmp (ptr, "di") == 0) {
7246     tbl.deltaset = TRUE;
7247     tbl.implicitgaps = TRUE;
7248   } else if (StringICmp (ptr, "l") == 0) {
7249     tbl.alignset = TRUE;
7250   } else if (StringICmp (ptr, "z") == 0) {
7251     tbl.gapped = TRUE;
7252   } else if (StringICmp (ptr, "e") == 0) {
7253     tbl.phrapace = TRUE;
7254   }
7255 
7256   tbl.genprodset = (Boolean) myargs [g_argGenProdSet].intvalue;
7257   ptr = myargs [F_argFeatIdLinks].strvalue;
7258   if (StringICmp (ptr, "o") == 0) {
7259     tbl.linkbyoverlap = TRUE;
7260   } else if (StringICmp (ptr, "p") == 0) {
7261     tbl.linkbyproduct = TRUE;
7262   }
7263   tbl.forcelocalid = (Boolean) myargs [L_argLocalID].intvalue;
7264   tbl.gpstonps = (Boolean) myargs [u_argUndoGPS].intvalue;
7265   tbl.gnltonote = (Boolean) myargs [h_argGnlToNote].intvalue;
7266   tbl.accn = (CharPtr) myargs [A_argAccession].strvalue;
7267   tbl.center = (CharPtr) myargs [C_argCenter].strvalue;
7268   tbl.organism = (CharPtr) myargs [n_argOrgName].strvalue;
7269   tbl.srcquals = (CharPtr) myargs [j_argSrcQuals].strvalue;
7270   tbl.comment = (CharPtr) myargs [y_argComment].strvalue;
7271   tbl.commentFile = ReadCommentFile ((CharPtr) myargs [Y_argCommentFile].strvalue);
7272 
7273   ptr = myargs [k_argCdsFlags].strvalue;
7274   if (StringChr (ptr, 'c') != NULL) {
7275     tbl.findorf = TRUE;
7276   }
7277   if (StringChr (ptr, 'r') != NULL) {
7278     tbl.runonorf = TRUE;
7279     tbl.findorf = TRUE;
7280   }
7281   if (StringChr (ptr, 'm') != NULL) {
7282     tbl.altstart = TRUE;
7283   }
7284   if (StringChr (ptr, 'k') != NULL) {
7285     tbl.conflict = TRUE;
7286   }
7287   /*
7288   if (!tbl.findorf && tbl.runonorf) {
7289     Message (MSG_FATAL, "-k r cannot be used without -k c");
7290     return 1;
7291   }
7292   */
7293 
7294   /* process obsolete validate/flatfile arguments first, warn if used */
7295 
7296   tbl.validate = (Boolean) myargs [v_argValidate].intvalue;
7297   if (tbl.validate) {
7298     Message (MSG_POST, "-v is obsolete, use -V v instead");
7299   }
7300   tbl.flatfile = (Boolean) myargs [b_argGenBank].intvalue;
7301   if (tbl.flatfile) {
7302     Message (MSG_POST, "-b is obsolete, use -V b instead");
7303   }
7304 
7305   ptr = myargs [V_argVerify].strvalue;
7306   if (StringChr (ptr, 'v') != NULL) {
7307     tbl.validate = TRUE;
7308   }
7309   if (StringChr (ptr, 'r') != NULL) {
7310     tbl.validate = TRUE;
7311     tbl.relaxed = TRUE;
7312   }
7313   if (StringChr (ptr, 'b') != NULL) {
7314     tbl.flatfile = TRUE;
7315   }
7316   if (StringChr (ptr, 'g') != NULL) {
7317     tbl.genereport = TRUE;
7318   }
7319   if (StringChr (ptr, 'c') != NULL) {
7320     tbl.validate_barcode = TRUE;
7321   }
7322   
7323 
7324   tbl.seqidfromfile = (Boolean) myargs [q_argFileID].intvalue;
7325   tbl.smartfeats = (Boolean) myargs [S_argSmartFeats].intvalue;
7326   tbl.smarttitle = (Boolean) myargs [Q_argSmartTitle].intvalue;
7327   tbl.removeunnecxref = (Boolean) myargs [U_argUnnecXref].intvalue;
7328   tbl.dotaxlookup = (Boolean) myargs [T_argTaxLookup].intvalue;
7329   tbl.dopublookup = (Boolean) myargs [P_argPubLookup].intvalue;
7330   tbl.logtoterminal = (Boolean) myargs [W_argLogProgress].intvalue;
7331 
7332   tbl.save_bioseq_set = (Boolean) myargs [K_argBioseqSet].intvalue;
7333 
7334   disc_rep_file = (CharPtr) myargs [Z_argDiscRepFile].strvalue;
7335   if (StringHasNoText (disc_rep_file)) {
7336     tbl.global_report = NULL;
7337   } else {
7338     tbl.global_report = GlobalDiscrepReportNew();
7339     tbl.global_report->test_config = DiscrepancyConfigNew ();
7340     DisableTRNATests (tbl.global_report->test_config);
7341     ConfigureForGenomes (tbl.global_report->test_config);
7342     tbl.global_report->taxlookup = taxlookup;
7343     tbl.global_report->output_config->summary_report = FALSE;
7344     tbl.global_report->output_config->expand_report_categories[DISC_SUPERFLUOUS_GENE] = TRUE;
7345     tbl.global_report->output_config->expand_report_categories[DISC_RNA_CDS_OVERLAP] = TRUE;
7346     tbl.global_report->output_config->expand_report_categories[DISC_SUSPECT_PRODUCT_NAME] = TRUE;
7347     tbl.global_report->output_config->expand_report_categories[DISC_OVERLAPPING_CDS] = TRUE;
7348   }
7349 
7350 
7351   /* arguments for alignment reading, e.g., "p,-,-,-,?,." */
7352 
7353   gapstring [0] = '\0';
7354   ptr = (CharPtr) myargs [G_argGapFields].strvalue;
7355   StringNCpy_0 (gapstring, ptr, sizeof (gapstring));
7356 
7357   ptr = gapstring;
7358   tmp = ParseCommaField (&ptr);
7359   if (tmp != NULL) {
7360     if (StringChr (tmp, 'p') != NULL) {
7361       tbl.aln_is_protein = TRUE;
7362     } else if (StringChr (tmp, 'n') == NULL) {
7363       Message (MSG_FATAL, "-G must start with p for Protein or n for Nucleotide");
7364       return 1;
7365     }
7366   }
7367   tbl.aln_beginning_gap = ParseCommaField (&ptr);
7368   tbl.aln_middle_gap = ParseCommaField (&ptr);
7369   tbl.aln_end_gap = ParseCommaField (&ptr);
7370   tbl.aln_missing = ParseCommaField (&ptr);
7371   tbl.aln_match = ParseCommaField (&ptr);
7372 
7373   if (StringHasNoText (tbl.accn)) {
7374     tbl.accn = NULL;
7375   }
7376   if (StringHasNoText (tbl.organism)) {
7377     tbl.organism = NULL;
7378   }
7379   if (StringHasNoText (tbl.srcquals)) {
7380     tbl.srcquals = NULL;
7381   }
7382   if (StringHasNoText (tbl.comment)) {
7383     tbl.comment = NULL;
7384   }
7385   if (StringHasNoText (tbl.commentFile)) {
7386     tbl.commentFile = NULL;
7387   }
7388 
7389   if (tbl.fastaset &&
7390       (tbl.deltaset || tbl.phrapace || tbl.genprodset ||
7391        tbl.alignset || tbl.gapped)) {
7392     Message (MSG_FATAL, "-s cannot be used with -d, -e, -g, -l or -z");
7393     return 1;
7394   }
7395 
7396   if (! tbl.alignset && (StringDoesHaveText (tbl.aln_beginning_gap)
7397       || StringDoesHaveText (tbl.aln_end_gap)
7398       || StringDoesHaveText (tbl.aln_middle_gap)
7399       || StringDoesHaveText (tbl.aln_missing)
7400       || StringDoesHaveText (tbl.aln_match)
7401       || tbl.aln_is_protein)) {
7402     Message (MSG_FATAL, "-G can only be used with -a l");
7403     return 1;
7404   }
7405 
7406   /* arguments for cleanup */
7407   MemSet (&(tbl.cleanup_args), 0, sizeof (CleanupArgsData));
7408   ptr = (CharPtr) myargs [c_argCleanupOptions].strvalue;
7409   if (StringChr (ptr, 'd') != NULL) {
7410     if (StringChr (ptr, 'D') != NULL) {
7411       Message (MSG_FATAL, "Cannot use both d and D options for cleanup.  Choose one.");
7412       return 1;
7413     }
7414     tbl.cleanup_args.collection_dates = TRUE;
7415     tbl.cleanup_args.collection_dates_month_first = TRUE;
7416   } else if (StringChr (ptr, 'D') != NULL) {
7417     tbl.cleanup_args.collection_dates = TRUE;
7418     tbl.cleanup_args.collection_dates_month_first = FALSE;
7419   }
7420 
7421   if (StringChr (ptr, 'b') != NULL) {
7422     tbl.cleanup_args.add_notes_to_overlapping_cds_without_abc = TRUE;
7423   }
7424   
7425   if (StringHasNoText (base) && (StringDoesHaveText (tbl.accn))) {
7426     Message (MSG_FATAL, "Accession can be entered only for a single record");
7427     return 1;
7428   }
7429 
7430   /* Seq-submit or Submit-block template is optional */
7431 
7432   if (StringDoesHaveText (tmplate)) {
7433     if (TemplateOverwriteRisk (tmplate, base, directory, suffix)) {
7434       if (Message (MSG_YN, overwriteMsg) == ANS_NO) return 0;
7435     }
7436     fp = FileOpen (tmplate, "r");
7437     if (fp != NULL) {
7438       while ((dataptr = ReadAsnFastaOrFlatFile (fp, &datatype, NULL, FALSE, FALSE, TRUE, FALSE)) != NULL) {
7439         if (datatype == OBJ_SEQSUB) {
7440           ssp = (SeqSubmitPtr) dataptr;
7441         } else if (datatype == OBJ_SUBMIT_BLOCK) {
7442           sbp = (SubmitBlockPtr) dataptr;
7443         } else if (datatype == OBJ_SEQDESC) {
7444           ValNodeLink (&sdphead, (SeqDescrPtr) dataptr);
7445         } else {
7446           ObjMgrFree (datatype, dataptr);
7447         }
7448       }
7449       FileClose (fp);
7450     }
7451 
7452     if (ssp != NULL && sbp == NULL) {
7453       sbp = ssp->sub;
7454     }
7455     if (sbp == NULL) {
7456       Message (MSG_FATAL, "Unable to read required template file");
7457       return 1;
7458     }
7459 
7460     if (sbp != NULL) {
7461       if (ssp != NULL) {
7462 
7463         /* copy submit block, will free SeqSubmit before processing */
7464 
7465         sbp = AsnIoMemCopy ((Pointer) sbp,
7466                             (AsnReadFunc) SubmitBlockAsnRead,
7467                             (AsnWriteFunc) SubmitBlockAsnWrite);
7468       }
7469       sbp->tool = MemFree (sbp->tool);
7470       os = GetOpSysString ();
7471       if (os != NULL) {
7472         sprintf (str, "tbl2asn %s - %s", TBL2ASN_APPLICATION, os);
7473       } else {
7474         sprintf (str, "tbl2asn %s", TBL2ASN_APPLICATION);
7475       }
7476       sbp->tool = StringSave (str);
7477       MemFree (os);
7478       sbp->hup = FALSE;
7479       sbp->reldate = DateFree (sbp->reldate);
7480       if (StringDoesHaveText (hold)) {
7481         if (StringICmp (hold, "y") == 0) {
7482           sbp->hup = TRUE;
7483           dp = DateCurr ();
7484           sbp->reldate = dp;
7485           if (dp != NULL) {
7486             if (dp->data [0] == 1) {
7487               (dp->data [1])++;
7488             }
7489           }
7490         } else {
7491           dp = DateParse (hold);
7492           if (dp != NULL) {
7493             sbp->hup = TRUE;
7494             sbp->reldate = dp;
7495           }
7496         }
7497       }
7498       csp = sbp->cit;
7499       if (csp != NULL) {
7500         csp->date = DateFree (csp->date);
7501         csp->date = DateCurr ();
7502         MemSet ((Pointer) &pd, 0, sizeof (Pubdesc));
7503         MemSet ((Pointer) &pb, 0, sizeof (ValNode));
7504         pb.choice = PUB_Sub;
7505         pb.data.ptrvalue = (Pointer) csp;
7506         pd.pub = &pb;
7507         pdp = &pd;
7508       }
7509     }
7510     if (ssp != NULL && ssp->datatype == 1) {
7511       sep = (SeqEntryPtr) ssp->data;
7512       if (sep != NULL) {
7513         VisitBioSourcesInSep (sep, (Pointer) &src, GetFirstBiop);
7514         if (src != NULL) {
7515 
7516           /* copy top biosource */
7517 
7518           src = AsnIoMemCopy ((Pointer) src,
7519                               (AsnReadFunc) BioSourceAsnRead,
7520                               (AsnWriteFunc) BioSourceAsnWrite);
7521         }
7522       }
7523 
7524       /* in case template has colliding ID, free it now */
7525 
7526       SeqSubmitFree (ssp);
7527     }
7528   }
7529 
7530   if (StringDoesHaveText (descrs)) {
7531     if (TemplateOverwriteRisk (descrs, base, directory, suffix)) {
7532       if (Message (MSG_YN, overwriteMsg) == ANS_NO) return 0;
7533     }
7534     fp = FileOpen (descrs, "r");
7535     if (fp != NULL) {
7536       while ((dataptr = ReadAsnFastaOrFlatFile (fp, &datatype, NULL, FALSE, FALSE, TRUE, FALSE)) != NULL) {
7537         if (datatype == OBJ_SEQDESC) {
7538           ValNodeLink (&sdphead, (SeqDescrPtr) dataptr);
7539         } else {
7540           ObjMgrFree (datatype, dataptr);
7541         }
7542       }
7543       FileClose (fp);
7544     }
7545   }
7546 
7547   gotags = TextFsaNew ();
7548   TextFsaAdd (gotags, "go_component");
7549   TextFsaAdd (gotags, "go_function");
7550   TextFsaAdd (gotags, "go_process");
7551 
7552   /* register fetch functions */
7553 
7554   if (remote) {
7555 #ifdef INTERNAL_NCBI_TBL2ASN
7556     if (! PUBSEQBioseqFetchEnable ("tbl2asn", FALSE)) {
7557       Message (MSG_POSTERR, "PUBSEQBioseqFetchEnable failed");
7558       return 1;
7559     }
7560 #else
7561     PubSeqFetchEnable ();
7562 #endif
7563   }
7564 
7565   if (remote || tbl.dopublookup) {
7566     PubMedFetchEnable ();
7567   }
7568 
7569   /* process one or more records */
7570 
7571   if (StringDoesHaveText (outfile) && StringHasNoText (base)) {
7572     aip = AsnIoOpen (outfile, "w");
7573     if (aip == NULL) {
7574       Message (MSG_FATAL, "Unable to open single output file");
7575       return 1;
7576     }
7577     ssp_atp = DoFirstPrefix (aip, sbp);
7578     bssp_atp = DoSecondPrefix (aip, &tbl);
7579   }
7580 
7581   if (StringDoesHaveText (base)) {
7582     ptr = StringRChr (base, '.');
7583     sfx[0] = '\0';
7584     if (ptr != NULL) {
7585       StringNCpy_0 (sfx, ptr, sizeof (sfx));
7586       *ptr = '\0';
7587     }
7588     tbl.tableFile = (CharPtr) myargs [f_argTableFile].strvalue;
7589     ProcessOneRecord (sbp, pdp, src, directory, results, base, sfx, sdphead, &tbl, gotags, aip, outfile);
7590 
7591   } else {
7592 
7593     FileRecurse (sbp, pdp, src, directory, results, suffix, recurse, sdphead, &tbl, gotags, aip, NULL);
7594   }
7595 
7596   if (aip != NULL) {
7597     DoSecondSuffix (aip, bssp_atp);
7598     DoFirstSuffix (aip, ssp_atp);
7599     AsnIoClose (aip);
7600   }
7601 
7602   if (tbl.global_report != NULL) {
7603     fp = FileOpen (disc_rep_file, "w");
7604     WriteGlobalDiscrepancyReport (tbl.global_report, fp);
7605     FileClose (fp);
7606     tbl.global_report = GlobalDiscrepReportFree (tbl.global_report);
7607   }
7608 
7609   if (sbp != NULL) {
7610     SubmitBlockFree (sbp);
7611   }
7612   if (src != NULL) {
7613     BioSourceFree (src);
7614   }
7615 
7616   SeqDescrFree (sdphead);
7617 
7618   TransTableFreeAll ();
7619 
7620   ECNumberFSAFreeAll ();
7621 
7622   TextFsaFree (gotags);
7623 
7624   /* close fetch functions */
7625 
7626   if (remote || tbl.dopublookup) {
7627     PubMedFetchDisable ();
7628   }
7629 
7630   if (remote) {
7631 #ifdef INTERNAL_NCBI_TBL2ASN
7632     PUBSEQBioseqFetchDisable ();
7633 #else
7634     PubSeqFetchDisable ();
7635 #endif
7636   }
7637 
7638   return 0;
7639 }
7640 
7641 

[ source navigation ]   [ diff markup ]   [ identifier search ]   [ freetext search ]   [ file search ]

This page was automatically generated by the LXR engine.
Visit the LXR main site for more information.