NCBI C Toolkit Cross Reference

C/api/aliparse.c


  1 /*=========================================================================*/
  2 /*                                                                         */
  3 /*  aliparse.c                                                             */
  4 /*                                                                         */
  5 /*=========================================================================*/
  6 
  7 #include <stdarg.h>
  8 
  9 #include <aliparse.h>
 10 #include <aliread.h>
 11 
 12 /* Defined constants */
 13 
 14 #define ALI_USE_MAYBES                FALSE  /* default for useMaybes            */
 15 #define ALI_READ_BUFFSIZE             80     /* default for readBuffSize         */
 16 #define ALI_GAP_CHAR                  '-'    /* first char of default gapChar;   */
                                                 /* the defaults also append '.'     */
 17 #define ALI_MISSING_CHAR              '?'    /* default missingChar              */
 18 #define ALI_CORRUPT_SEQ_THRESHOLD     95     /* default corruptSeqThreshold      */
 19 #define ALI_NUCL_LINE_MAX_THRESHOLD   75     /* default nuclLineMaxThreshold     */
 20 #define ALI_NUCL_LINE_MIN_THRESHOLD   25     /* default nuclLineMinThreshold     */
 21 
 22 /* Data structures */
 23 
 24 typedef struct                 /* State record that is handed, as a PatternInfoPtr, */
 25 {                              /* to s_ProcessInterId, s_ProcessInterSeq,           */
                                   /* s_ProcessContigId and s_ProcessContigSeq.         */
                                   /* NOTE(review): the field roles below are guesses   */
                                   /* from the names; confirm against those functions.  */
 26   DataInfo    foundInfo;       /* presumably what has been detected so far          */
 27   IdInfoPtr   currentId;       /* presumably the ID record currently being filled   */
 28   IdInfoPtr   currentDeflineId;
 29   ValNodePtr  lastRow;
 30   Boolean     hasFullLength;
 31   Boolean     isFirstGroup;
 32   Boolean     isFirstId;
 33   Boolean     maybesFound;
 34   SeqPartPtr  lastSeqPart;     /* presumably the tail of a sequence-part chain      */
 35   Boolean     gotAllIds;
 36   Int4        idCount;
 37   Int4        currentIdCount;
 38 } PatternInfo, PNTR PatternInfoPtr;
 39 
 40 /* Filewide static variables */
 41 
 42 static AliConfigInfo s_configInfo;                /* settings read by Ali_GetConfig () and written by Ali_SetConfig () */
 43 static Boolean       s_configurationSet = FALSE;  /* becomes TRUE once s_configInfo has been given its defaults       */
 44 
 45 /* Function prototypes */
 46 
 47 static void      s_FreeErrorList (ErrInfoPtr errorList);
 48 static void      s_FreeSequenceList (SeqPartPtr seqPtr);
 49 static void      s_FreeIdList (IdInfoPtr idList);
 50 static void      s_FreeRowList (ValNodePtr rowList);
 51 static void      s_FreeRowList_Safe (ValNodePtr rowList);
 52 static void      s_DisplayRowList (ValNodePtr rowList,
 53                                    Int2 mask);
 54 static CharPtr   s_GetRowIdString (ValNodePtr row);
 55 static CharPtr   s_GetRowSeqString (ValNodePtr row);
 56 static IdInfoPtr s_ProcessMaybes (ValNodePtr rowList);
 57 static int       s_SegCompare(const void *i,
 58                               const void *j);
 59 static Boolean   s_IsInterleaved (ValNodePtr rowList,
 60                                   Int2 PNTR idCount);
 61 static Boolean   s_ProcessInterId (CharPtr          newIdStr,
 62                                    PatternInfoPtr   pattern,
 63                                    AlignFileDataPtr fileInfoPtr,
 64                                    Boolean          isMaybe);
 65 static Boolean   s_ProcessInterSeq (CharPtr          newSeqStr,
 66                                     PatternInfoPtr   pattern,
 67                                     AlignFileDataPtr fileInfoPtr,
 68                                     Boolean          isMaybe);
 69 static Boolean   s_AnalyzeInterleaved (ValNodePtr       rowList,
 70                                        AlignFileDataPtr fileInfoPtr,
 71                                        Int2             idCount);
 72 static Boolean   s_ProcessContigId (CharPtr          newIdStr,
 73                                     PatternInfoPtr   pattern,
 74                                     AlignFileDataPtr fileInfoPtr);
 75 static Boolean   s_ProcessContigSeq (CharPtr          newSeqStr,
 76                                      PatternInfoPtr   pattern,
 77                                      AlignFileDataPtr fileInfoPtr);
 78 static Boolean   s_AnalyzeContiguous (ValNodePtr       rowList,
 79                                       AlignFileDataPtr fileInfoPtr);
 80 static Boolean   s_AnalyzeContents (ValNodePtr       rowList,
 81                                     AlignFileDataPtr fileInfoPtr);
 82 static void      s_SortErrors (AlignFileDataPtr fileInfoPtr);
 83 static void      s_AnalyzeErrors (AlignFileDataPtr fileInfoPtr);
 84 static Boolean   s_CheckContext (ValNodePtr       rowList,
 85                                  AlignFileDataPtr fileInfoPtr);
 86 
 87 
 88 /*=========================================================================*/
 89 /*                                                                         */
 90 /* Ali_GetConfig () -- Get the current configuration settings.             */
 91 /*                                                                         */
 92 /*=========================================================================*/
 93 
 94 AliConfigInfoPtr Ali_GetConfig (void)
 95 {
 96 
 97   AliConfigInfoPtr configPtr;
 98 
 99   /* If configuration hasn't been set yet, */
100   /* then set it to the defaults.          */
101 
102   if (s_configurationSet == FALSE)
103     {
104       s_configInfo.useMaybes                  = ALI_USE_MAYBES;
105       s_configInfo.readBuffSize               = ALI_READ_BUFFSIZE;
106       s_configInfo.debugLevel                 = ALI_SHOW_NONE;
107       s_configInfo.corruptSeqThreshold        = ALI_CORRUPT_SEQ_THRESHOLD;
108       s_configInfo.nuclLineMinThreshold       = ALI_NUCL_LINE_MIN_THRESHOLD;
109       s_configInfo.nuclLineMaxThreshold       = ALI_NUCL_LINE_MAX_THRESHOLD;
110       s_configInfo.errExpandLevel             = ALI_ERRMSG_EXPAND_SOME;
111       s_configInfo.declaredInfo.dataType      = ALI_UNKNOWN;
112       s_configInfo.declaredInfo.contigOrInter = ALI_UNKNOWN;
113       s_configInfo.declaredInfo.idCount       = 0;
114       s_configInfo.declaredInfo.seqLength     = 0;
115 
116       s_configInfo.gapChar = (CharPtr) MemNew (32);
117       sprintf (s_configInfo.gapChar    , "%c%c", ALI_GAP_CHAR, '.');
118       s_configInfo.missingChar = (CharPtr) MemNew (32);
119       sprintf (s_configInfo.missingChar, "%c", ALI_MISSING_CHAR);
120 
121       s_configurationSet = TRUE;
122     }
123 
124   /* Copy the current settings to the return struct */
125 
126   configPtr = (AliConfigInfoPtr) MemNew (sizeof (AliConfigInfo));
127   MemSet (configPtr, 0, sizeof (AliConfigInfo));
128 
129   configPtr->useMaybes            = s_configInfo.useMaybes;
130   configPtr->readBuffSize         = s_configInfo.readBuffSize;
131   configPtr->debugLevel           = s_configInfo.debugLevel;
132   configPtr->corruptSeqThreshold  = s_configInfo.corruptSeqThreshold;
133   configPtr->nuclLineMinThreshold = s_configInfo.nuclLineMinThreshold;
134   configPtr->nuclLineMaxThreshold = s_configInfo.nuclLineMaxThreshold;
135   configPtr->errExpandLevel       = s_configInfo.errExpandLevel;
136 
137   configPtr->gapChar = (CharPtr) MemNew (32);
138   StringCpy (configPtr->gapChar, s_configInfo.gapChar);
139   configPtr->missingChar = (CharPtr) MemNew (32);
140   StringCpy (configPtr->missingChar, s_configInfo.missingChar);
141 
142   /* Return successfully */
143 
144   return configPtr;
145 }
146 
147 /*=========================================================================*/
148 /*                                                                         */
149 /*  Ali_SetConfig () - Sets various runtime configuration options used by  */
150 /*                     the Ali_Read () function.                           */
151 /*                                                                         */
152 /* configPtr                                                               */
153 /* ---------                                                               */
154 /*                                                                         */
155 /* The configPtr parameter contains new values for one or more             */
156 /* configuration settings.  The values that are applied are selected by    */
157 /* the mask parameter.                                                     */
158 /*                                                                         */
159 /*     gapChar - [default: "-."] -- This is the string of gap character(s) */
160 /*               that will be used if the file does not define one.        */
161 /*                                                                         */
162 /*     missingChar - [default: '?'] -- This is the character that will be  */
163 /*                   used as missing character if the file does not define */
164 /*                   one.                                                  */
165 /*                                                                         */
166 /*     useMaybes - [default: FALSE] -- If a line is found that doesn't     */
167 /*                 quite meet the criteria for being a sequence, but is    */
168 /*                 close enough that it might be a slightly mangled        */
169 /*                 sequence line, then it is marked as a 'maybe'. The      */
170 /*                 useMaybes setting determines how these 'maybe'          */
171 /*                 sequences are treated.  If set to FALSE, they ARE NOT   */
172 /*                 treated as sequences, if set to TRUE they ARE treated   */
173 /*                 as sequences.                                           */
174 /*                                                                         */
175 /*     readBuffSize - [default: 80] -- This is the size (in bytes) of the  */
176 /*                    chunks that are read when reading in the file.       */
177 /*                    Setting it to higher values may increase the         */
178 /*                    efficiency, but with operating system and hardware   */
179 /*                    buffering going on, it probably doesn't make much    */
180 /*                    difference.                                          */
181 /*                                                                         */
182 /*     debugLevel - [default: ALI_SHOW_NONE] -- Determines what debugging  */
183 /*                  information to display to stderr during processing.    */
184 /*                  Can be set to one of the following:                    */
185 /*                                                                         */
186 /*                  ALI_SHOW_NONE       : Show no debugging info [default] */
187 /*                  ALI_SHOW_SEQUENCES  : Show lines classified as seqs    */
188 /*                  ALI_SHOW_DEFLINES   : Show lines classified as deflines*/
189 /*                  ALI_SHOW_OTHERS     : Show lines classified as others  */
190 /*                                        (ie, not sequences or deflines). */
191 /*                  ALI_SHOW_ALL        : Show all lines and their         */
192 /*                                        classification.                  */
193 /*                                                                         */
194 /*     corruptSeqThreshold - [Default: 95] -- Used to guess that a line is */
195 /*                           actually a corrupted sequence.  If the line   */
196 /*                           contains a percentage of sequence characters  */
197 /*                           equal to or above the corruptSeqThreshold     */
198 /*                           then it is marked as maybe a sequence line.   */
199 /*                                                                         */
200 /*     nuclLineMaxThreshold - [Default: 75] -- Used to determine whether a */
201 /*                            sequence is DNA or protein.  If the line has */
202 /*                            MORE than nuclLineMaxThreshold percent of    */
203 /*                            the characters "ACGT" and the missing and    */
204 /*                            gap chars (and all the other characters are  */
205 /*                            ambiguous protein/DNA characters), then it   */
206 /*                            is marked as a nucleotide sequence.          */
207 /*                                                                         */
208 /*     nuclLineMinThreshold - [Default: 25] -- Used to determine whether a */
209 /*                            sequence is DNA or protein.  If the line has */
210 /*                            LESS than nuclLineMinThreshold percent of    */
211 /*                            the characters "ACGT" and the missing and    */
212 /*                            gap chars (and all the other characters are  */
213 /*                            ambiguous protein/DNA characters), then it   */
214 /*                            is marked as a protein sequence.             */
215 /*                                                                         */
216 /*     errExpandLevel       -                                              */
217 /*                                                                         */
218 /*                                                                         */
219 /* mask parameter                                                          */
220 /* --------------                                                          */
221 /*                                                                         */
222 /* The mask parameter determines which fields in the configPtr are         */
223 /* being given new values.  It contains one or more of the following       */
224 /* values OR'd together :                                                  */
225 /*                                                                         */
226 /*      ALI_SET_DEFAULTS                                                   */
227 /*      ALI_SET_ALL                                                        */
228 /*                                                                         */
229 /*      ALI_SET_GAP_CHAR                                                   */
230 /*      ALI_SET_MISSING_CHAR                                               */
231 /*      ALI_SET_MAYBES                                                     */
232 /*      ALI_SET_READBUFF                                                   */
233 /*      ALI_SET_NUCL_MIN                                                   */
234 /*      ALI_SET_NUCL_MAX                                                   */
235 /*      ALI_SET_CORRUPT_MAX                                                */
236 /*      ALI_SET_DEBUG_LEVEL                                                */
237 /*      ALI_SET_ERRMSG_EXPAND                                              */
238 /*                                                                         */
239 /* If ALI_SET_DEFAULTS or ALI_SET_ALL are used then any others are         */
240 /* ignored.                                                                */
241 /*                                                                         */
242 /*=========================================================================*/
243 
244 Boolean Ali_SetConfig (AliConfigInfoPtr configPtr,
245                        Int2             mask)
246 {
247 
248   /* If this is the first time called, or we're restoring */
249   /* the defaults, then set all options to the defaults.  */
250 
251   if ((s_configurationSet == FALSE) ||
252       (configPtr == NULL)           ||
253       (mask == ALI_SET_DEFAULTS))
254     {
255       s_configInfo.useMaybes                  = ALI_USE_MAYBES;
256       s_configInfo.readBuffSize               = ALI_READ_BUFFSIZE;
257       s_configInfo.debugLevel                 = ALI_SHOW_NONE;
258       s_configInfo.corruptSeqThreshold        = ALI_CORRUPT_SEQ_THRESHOLD;
259       s_configInfo.nuclLineMinThreshold       = ALI_NUCL_LINE_MIN_THRESHOLD;
260       s_configInfo.nuclLineMaxThreshold       = ALI_NUCL_LINE_MAX_THRESHOLD;
261       s_configInfo.declaredInfo.dataType      = ALI_UNKNOWN;
262       s_configInfo.declaredInfo.contigOrInter = ALI_UNKNOWN;
263       s_configInfo.errExpandLevel             = ALI_ERRMSG_EXPAND_SOME;
264       s_configInfo.declaredInfo.idCount       = 0;
265       s_configInfo.declaredInfo.seqLength     = 0;
266       s_configInfo.gapChar = (CharPtr) MemNew (32);
267       sprintf (s_configInfo.gapChar    , "%c%c", ALI_GAP_CHAR, '.');
268       s_configInfo.missingChar = (CharPtr) MemNew (32);
269       sprintf (s_configInfo.missingChar, "%c", ALI_MISSING_CHAR);
270     }
271 
272   s_configurationSet = TRUE;
273 
274   /* If we're setting to the defaults, then we're done */
275 
276   if ((configPtr == NULL) || (mask == ALI_SET_DEFAULTS))
277     return TRUE;
278 
279   /* Otherwise, override the current settings */
280   /* where instructed.                        */
281 
282   if ((mask & ALI_SET_GAP_CHAR) || (mask == ALI_SET_ALL))
283     StringCpy (s_configInfo.gapChar, configPtr->gapChar);
284 
285   if ((mask & ALI_SET_MISSING_CHAR) || (mask == ALI_SET_ALL))
286     StringCpy (s_configInfo.missingChar, configPtr->missingChar);
287 
288   if ((mask & ALI_SET_MAYBES) || (mask == ALI_SET_ALL))
289     s_configInfo.useMaybes = configPtr->useMaybes;
290 
291   if ((mask & ALI_SET_READBUFF) || (mask == ALI_SET_ALL))
292     s_configInfo.readBuffSize = configPtr->readBuffSize;
293 
294   if ((mask & ALI_SET_NUCL_MIN) || (mask == ALI_SET_ALL))
295     s_configInfo.nuclLineMinThreshold = configPtr->nuclLineMinThreshold;
296 
297   if ((mask & ALI_SET_NUCL_MAX) || (mask == ALI_SET_ALL))
298     s_configInfo.nuclLineMaxThreshold = configPtr->nuclLineMaxThreshold;
299 
300   if ((mask & ALI_SET_CORRUPT_MAX) || (mask == ALI_SET_ALL))
301     s_configInfo.corruptSeqThreshold = configPtr->corruptSeqThreshold;
302 
303   if ((mask & ALI_SET_DEBUG_LEVEL) || (mask == ALI_SET_ALL))
304     s_configInfo.debugLevel = configPtr->debugLevel;
305 
306   if ((mask & ALI_SET_ERRMSG_EXPAND) || (mask == ALI_SET_ALL))
307     s_configInfo.errExpandLevel = configPtr->errExpandLevel;
308 
309   /* Return successfully */
310 
311   return TRUE;
312 }
313 
314 /*=========================================================================*/
315 /*                                                                         */
316 /*  s_FreeErrorNode () - Free one error structure.                         */
317 /*                                                                         */
318 /*=========================================================================*/
319 
320 static void s_FreeErrorNode (ErrInfoPtr errorPtr)
321 {
322   if (errorPtr->info != NULL)
323     {
324       MemFree (errorPtr->info);
325       errorPtr->info = NULL;
326     }
327   if (errorPtr->extraInfo != NULL)
328     {
329       MemFree (errorPtr->extraInfo);
330       errorPtr->extraInfo = NULL;
331     }
332   MemFree (errorPtr);
333 }
334 
335 /*=========================================================================*/
336 /*                                                                         */
337 /*  s_FreeErrorList () - Free a linked list of error structures and all    */
338 /*                       the memory that they point to.                    */
339 /*                                                                         */
340 /*=========================================================================*/
341 
342 static void s_FreeErrorList (ErrInfoPtr errorPtr)
343 {
344   ErrInfoPtr currentErr;
345 
346   while (errorPtr != NULL)
347     {
348       currentErr = errorPtr;
349       errorPtr = errorPtr->next;
350       s_FreeErrorNode (currentErr);
351     }
352 }
353 
354 /*=========================================================================*/
355 /*                                                                         */
356 /*  s_FreeSequenceList () - Free a linked list of SeqPart structures and   */
357 /*                          all the memory that they point to.             */
358 /*                                                                         */
359 /*=========================================================================*/
360 
361 static void s_FreeSequenceList (SeqPartPtr seqPtr)
362 {
363   SeqPartPtr currentSeq;
364 
365   while (seqPtr != NULL)
366     {
367       MemFree (seqPtr->sequence);
368       currentSeq = seqPtr;
369       seqPtr = seqPtr->next;
370       MemFree (currentSeq);
371     }
372 }
373 
374 /*=========================================================================*/
375 /*                                                                         */
376 /*  s_FreeIdList () - Free a linked list of ID structures and all the      */
377 /*                    memory that they point to.                           */
378 /*                                                                         */
379 /*=========================================================================*/
380 
381 static void s_FreeIdList (IdInfoPtr idPtr)
382 {
383   IdInfoPtr currentId;
384 
385   while (idPtr != NULL)
386     {
387       MemFree (idPtr->id);
388       s_FreeSequenceList (idPtr->sequence);
389       MemFree (idPtr->defline);
390       currentId = idPtr;
391       idPtr = idPtr->next;
392       MemFree (currentId);
393     }
394 }
395 
396 /*=========================================================================*/
397 /*                                                                         */
398 /*  s_FreeParsedInfo () - Free a ParsedInfo structure and the memory that  */
399 /*                        it points to.                                    */
400 /*                                                                         */
401 /*=========================================================================*/
402 
403 static void s_FreeParsedInfo (ParsedInfoPtr info)
404 {
405   if (info->missingChar != NULL)
406     MemFree (info->missingChar);
407   if (info->gapChar != NULL)
408     MemFree (info->gapChar);
409   if (info->unalignedChar != NULL)
410     MemFree (info->unalignedChar);
411   MemFree (info);
412 }
413 
414 /*=========================================================================*/
415 /*                                                                         */
416 /*  Ali_Free () - Free a AlignFileData structure and all the memory that   */
417 /*                it points to.                                            */
418 /*                                                                         */
419 /*=========================================================================*/
420 
421 void Ali_Free (AlignFileDataPtr fileInfoPtr)
422 {
423 
424   s_FreeIdList (fileInfoPtr->sequences);
425   fileInfoPtr->sequences = NULL;
426   s_FreeIdList (fileInfoPtr->maybes);
427   fileInfoPtr->maybes = NULL;
428   s_FreeErrorList (fileInfoPtr->errors);
429   fileInfoPtr->errors = NULL;
430   s_FreeParsedInfo (fileInfoPtr->info);
431   fileInfoPtr->info = NULL;
432 
433   MemFree (fileInfoPtr);
434 
435   return;
436 }
437 
438 /*=========================================================================*/
439 /*                                                                         */
440 /*  s_FreeRowList () - Free all row data structures and the strings that   */
441 /*                     they point to.                                      */
442 /*                                                                         */
443 /*         NOTE: The actual data strings in the row list may be pointed    */
444 /*               to by other structures, in which case                     */
445 /*               s_FreeRowList_Safe () should be used instead.             */
446 /*                                                                         */
447 /*=========================================================================*/
448 
449 static void s_FreeRowList (ValNodePtr rowList)
450 {
451   ValNodePtr       currentRow;
452   SeqLineInfoPtr   seqLine;
453   DefLineInfoPtr   defLine;
454   OtherLineInfoPtr otherLine;
455 
456   while (rowList != NULL)
457     {
458       switch (rowList->choice)
459         {
460         case ALI_DEFLINE :
461           defLine = (DefLineInfoPtr) rowList->data.ptrvalue;
462           if (defLine->definitions != NULL)
463             MemFree (defLine->definitions);
464           if (defLine->id != NULL)
465             MemFree (defLine->id);
466           MemFree (defLine);
467           break;
468         case ALI_SEQLINE :
469           seqLine = (SeqLineInfoPtr) rowList->data.ptrvalue;
470           if (seqLine->sequence != NULL)
471             MemFree (seqLine->sequence);
472           if (seqLine->id != NULL)
473             MemFree (seqLine->id);
474           if (seqLine->junk != NULL)
475             MemFree (seqLine->junk);
476           MemFree (seqLine);
477           break;
478         case ALI_OTHERLINE :
479           otherLine = (OtherLineInfoPtr) rowList->data.ptrvalue;
480           if (otherLine->other != NULL)
481             MemFree (otherLine->other);
482           if (otherLine->id != NULL)
483             MemFree (otherLine->id);
484           MemFree (otherLine);
485           break;
486         default:
487           break;
488         }
489       currentRow = rowList;
490       rowList = rowList->next;
491       MemFree (currentRow);
492     }
493 }
494 
495 /*=========================================================================*/
496 /*                                                                         */
497 /*  s_FreeRowList_Safe () - Free all row data structures, but don't free   */
498 /*                          the strings that they point since they are     */
499 /*                          still being used in the ID structures.         */
500 /*                                                                         */
501 /*=========================================================================*/
502 
503 static void s_FreeRowList_Safe (ValNodePtr rowList)
504 {
505   ValNodePtr       currentRow;
506   SeqLineInfoPtr   seqLine;
507   DefLineInfoPtr   defLine;
508   OtherLineInfoPtr otherLine;
509 
510   while (rowList != NULL)
511     {
512       switch (rowList->choice)
513         {
514         case ALI_DEFLINE :
515           defLine = (DefLineInfoPtr) rowList->data.ptrvalue;
516           MemFree (defLine);
517           break;
518         case ALI_SEQLINE :
519           seqLine = (SeqLineInfoPtr) rowList->data.ptrvalue;
520           MemFree (seqLine);
521           break;
522         case ALI_OTHERLINE :
523           otherLine = (OtherLineInfoPtr) rowList->data.ptrvalue;
524           MemFree (otherLine);
525           break;
526         default:
527           break;
528         }
529       currentRow = rowList;
530       rowList = rowList->next;
531       MemFree (currentRow);
532     }
533 }
534 
535 /*=========================================================================*/
536 /*                                                                         */
537 /*  s_GetRowIdStr ()                                                       */
538 /*                                                                         */
539 /*=========================================================================*/
540 
541 static CharPtr s_GetRowIdString (ValNodePtr row)
542 {
543   CharPtr          newIdStr;
544   SeqLineInfoPtr   seqLinePtr;
545   DefLineInfoPtr   defLinePtr;
546   OtherLineInfoPtr otherLinePtr;
547 
548   if (row == NULL)
549     return NULL;
550 
551   if (row->choice == ALI_SEQLINE)
552     {
553       seqLinePtr = (SeqLineInfoPtr) row->data.ptrvalue;
554       if (seqLinePtr->id != NULL)
555         {
556           if ((seqLinePtr->maybe == TRUE) && (s_configInfo.useMaybes == FALSE))
557             newIdStr = NULL;
558           else
559             newIdStr = seqLinePtr->id;
560         }
561       else
562         newIdStr = NULL;
563     }
564   else if (row->choice == ALI_DEFLINE)
565     {
566       defLinePtr = (DefLineInfoPtr) row->data.ptrvalue;
567       if (defLinePtr->id != NULL)
568         newIdStr = defLinePtr->id;
569       else
570         newIdStr = NULL;
571     }
572   else if (row->choice == ALI_OTHERLINE)
573     {
574       otherLinePtr = (OtherLineInfoPtr) row->data.ptrvalue;
575       if (otherLinePtr->id != NULL)
576         newIdStr = otherLinePtr->id;
577       else
578         newIdStr = NULL;
579     }
580 
581   return newIdStr;
582 }
583 
584 /*=========================================================================*/
585 /*                                                                         */
586 /*  s_GetRowSeqStr ()                                                      */
587 /*                                                                         */
588 /*=========================================================================*/
589 
590 static CharPtr s_GetRowSeqString (ValNodePtr row)
591 {
592   CharPtr          newSeqStr;
593   SeqLineInfoPtr   seqLinePtr;
594 
595   if (row == NULL)
596     return NULL;
597 
598   if (row->choice == ALI_SEQLINE)
599     {
600       seqLinePtr = (SeqLineInfoPtr) row->data.ptrvalue;
601       if (seqLinePtr->sequence != NULL)
602         {
603           if ((seqLinePtr->maybe == TRUE) && (s_configInfo.useMaybes == FALSE))
604             newSeqStr = NULL;
605           else
606             newSeqStr = seqLinePtr->sequence;
607         }
608       else
609         newSeqStr = NULL;
610     }
611   else
612     newSeqStr = NULL;
613 
614   return newSeqStr;
615 }
616 
617 /*=========================================================================*/
618 /*                                                                         */
619 /*  s_ProcessMaybes ()                                                     */
620 /*                                                                         */
621 /*=========================================================================*/
622 
623 static IdInfoPtr s_ProcessMaybes (ValNodePtr rowList)
624 {
625   ValNodePtr     currentRow;
626   IdInfoPtr      badIdList = NULL;
627   IdInfoPtr      existingId = NULL;
628   IdInfoPtr      currentId = NULL;
629   IdInfoPtr      lastId = NULL;
630   CharPtr        idStr;
631   CharPtr        currentIdStr;
632   SeqPartPtr     newSeqPart;
633   SeqPartPtr     lastSeqPart;
634   SeqLineInfoPtr seqLinePtr;
635 
636   currentRow = rowList;
637 
638   while (currentRow != NULL)
639     {
640       idStr = s_GetRowIdString (currentRow);
641       if (idStr != NULL)
642         currentIdStr = idStr;
643 
644       if (currentRow->choice == ALI_SEQLINE)
645         {
646           seqLinePtr = (SeqLineInfoPtr) currentRow->data.ptrvalue;
647           if (seqLinePtr->maybe == TRUE)
648             {
649 
650               /* Find the ID that this sequence 'belongs to' */
651 
652               existingId = badIdList;
653               while (existingId != NULL)
654                 {
655                   if (StringCmp(existingId->id,currentIdStr) == 0)
656                     break;
657                   existingId = existingId->next;
658                 }
659               
660               if (existingId != NULL)
661                 currentId = existingId;
662               else
663                 {
664                   currentId = (IdInfoPtr) MemNew (sizeof(IdInfo));
665                   if (currentId == NULL)
666                     return NULL;
667                   
668                   currentId->sequence = NULL;
669                   currentId->id       = currentIdStr;
670                   currentId->length   = 0;
671                   currentId->next     = NULL;
672                   
673                   if (badIdList == NULL)
674                     badIdList = currentId;
675                   else
676                     {
677                       lastId = badIdList;
678                       while (lastId->next != NULL)
679                         lastId = lastId->next;
680                       lastId->next = currentId;
681                     }
682                 }
683           
684               /* Add the sequence to the current ID */
685 
686               newSeqPart = (SeqPartPtr) MemNew(sizeof(SeqPart));
687               if (newSeqPart == NULL)
688                 return NULL;
689               
690               newSeqPart->sequence = (CharPtr) currentRow->data.ptrvalue;
691               newSeqPart->next     = NULL;
692               
693               if (currentId->sequence == NULL)
694                 currentId->sequence = newSeqPart;
695               else
696                 lastSeqPart->next = newSeqPart;
697               
698               currentId->length += StringLen (newSeqPart->sequence);
699               lastSeqPart = newSeqPart;
700               
701             }
702         }
703       currentRow = currentRow->next;
704     }
705 
706   return badIdList;
707 }
708 
709 /*=========================================================================*/
710 /*                                                                         */
711 /*  DisplayRowList() - Prints to stderr the linked list of ValNodes that   */
712 /*                     contain the data read in from the alignment file.   */
713 /*                                                                         */
714 /*=========================================================================*/
715 
716 static void s_DisplayRowList (ValNodePtr rowList,
717                               Int2       mask)
718 {
719   ValNodePtr       currRow;
720   SeqLineInfoPtr   seqLinePtr;
721   DefLineInfoPtr   defLinePtr;
722   OtherLineInfoPtr otherLinePtr;
723   Char             cLineType;
724 
725   currRow = rowList;
726   while (currRow != NULL)
727     {
728       if ((currRow->choice == ALI_SEQLINE) &&
729           ((mask & ALI_SHOW_SEQUENCES) ||
730            (mask == ALI_SHOW_ALL)))
731         {
732           seqLinePtr = (SeqLineInfoPtr) currRow->data.ptrvalue;
733 
734           if (seqLinePtr->type == ALI_NUCLEOTIDE)
735             cLineType = 'N';
736           else if (seqLinePtr->type == ALI_PROTEIN)
737             cLineType = 'P';
738           else if (seqLinePtr->type == ALI_AMBIGUOUS)
739             cLineType = 'U';
740 
741           if (seqLinePtr->maybe == FALSE)
742             {
743               if (seqLinePtr->id != NULL)
744                 fprintf(stderr,"%04d: ID          : %s\n",
745                         seqLinePtr->rowNum,
746                         seqLinePtr->id);
747               if (seqLinePtr->sequence != NULL)
748                 fprintf(stderr,"%04d: SEQUENCE[%c] : %s\n",
749                         seqLinePtr->rowNum,
750                         cLineType,
751                         seqLinePtr->sequence);
752             }
753           else
754             {
755               if (seqLinePtr->id != NULL)
756                 fprintf(stderr,"%04d: MAYBE ID          : %s\n",
757                         seqLinePtr->rowNum,
758                         seqLinePtr->id);
759               if (seqLinePtr->sequence != NULL)
760                 fprintf(stderr,"%04d: MAYBE SEQUENCE[%c] : %s\n",
761                         seqLinePtr->rowNum,
762                         cLineType,
763                         seqLinePtr->sequence);
764             }
765         }
766       else if ((currRow->choice == ALI_DEFLINE) &&
767                ((mask & ALI_SHOW_DEFLINES) ||
768                 (mask == ALI_SHOW_ALL)))
769         {
770           defLinePtr = (DefLineInfoPtr) currRow->data.ptrvalue;
771           if (defLinePtr->id != NULL)
772             fprintf(stderr,"%04d: DEFLINE ID          : %s\n",
773                     defLinePtr->rowNum,
774                     defLinePtr->id);
775           if (defLinePtr->definitions != NULL)
776             fprintf(stderr,"%04d: DEFLINE DEFINITIONS : %s\n", 
777                     defLinePtr->rowNum,
778                     defLinePtr->definitions);
779         }
780       else if ((currRow->choice == ALI_OTHERLINE) &&
781                ((mask & ALI_SHOW_OTHERS) ||
782                 (mask == ALI_SHOW_ALL)))
783         {
784           otherLinePtr = (OtherLineInfoPtr) currRow->data.ptrvalue;
785           if (otherLinePtr->id != NULL)
786             fprintf(stderr,"%04d: OTHER ID : %s\n", otherLinePtr->rowNum,
787                     otherLinePtr->id);
788           if (otherLinePtr->other != NULL)
789             fprintf(stderr,"%04d: OTHER    : %s\n", otherLinePtr->rowNum,
790                     otherLinePtr->other);
791         }
792       currRow = currRow->next;
793     }
794 
795   return;
796 }
797 
798 
799 /*=========================================================================*/
800 /*                                                                         */
801 /* s_IsInterleaved ()                                                      */
802 /*                                                                         */
803 /*=========================================================================*/
804 
805 static Boolean s_IsInterleaved (ValNodePtr rowList,
806                                 Int2 PNTR idCount)
807 {
808   ValNodePtr       currentRow;
809   CharPtr          newIdStr;
810   IdInfoPtr        idList = NULL;
811   IdInfoPtr        lastId = NULL;
812   IdInfoPtr        currentId = NULL;
813   IdInfoPtr        existingId = NULL;
814   Boolean          isInterleaved;
815   Int4             patternRowCount;
816   Int4             patternCharCount;
817   Int4             currentRowCount;
818   Int4             currentCharCount;
819   Boolean          isFirstId;
820   SeqLineInfoPtr   seqLinePtr;
821   DefLineInfoPtr   defLinePtr;
822   OtherLineInfoPtr otherLinePtr;
823   Boolean          isMaybe;
824 
825   isInterleaved = FALSE;
826   currentRow  = rowList;
827 
828   patternRowCount  = 0;
829   patternCharCount = 0;
830   currentRowCount  = 0;
831   currentCharCount = 0;
832   isFirstId        = TRUE;
833   *idCount         = 0;
834 
835   /* Search the row list for IDs */
836 
837   while (currentRow != NULL)
838     {
839 
840       /* Look for an ID */
841 
842       newIdStr = NULL;
843       isMaybe = FALSE;
844 
845       if (currentRow->choice == ALI_SEQLINE)
846         {
847           seqLinePtr = (SeqLineInfoPtr) currentRow->data.ptrvalue;
848           if (seqLinePtr->id != NULL)
849             {
850               if ((seqLinePtr->maybe == TRUE) &&
851                   (s_configInfo.useMaybes == FALSE))
852                 newIdStr = NULL;
853               else
854                 newIdStr = seqLinePtr->id;
855             }
856         }
857       else if (currentRow->choice == ALI_DEFLINE)
858         {
859           defLinePtr = (DefLineInfoPtr) currentRow->data.ptrvalue;
860           if (defLinePtr->id != NULL)
861             newIdStr = defLinePtr->id;
862         }
863       else if (currentRow->choice == ALI_OTHERLINE)
864         {
865           otherLinePtr = (OtherLineInfoPtr) currentRow->data.ptrvalue;
866           if (otherLinePtr->id != NULL)
867             newIdStr = otherLinePtr->id;
868         }
869 
870       /* If we find an ID, see if it's one */
871       /* that we already have.             */
872 
873       if (newIdStr != NULL)
874         {
875 
876           existingId = idList;
877           while (existingId != NULL)
878             {
879               if (StringCmp(existingId->id,newIdStr) == 0)
880                 break;
881               existingId = existingId->next;
882             }
883 
884           /* Already have -- break and return TRUE */
885 
886           if (existingId != NULL)
887             {
888               isInterleaved = TRUE;
889               break;
890             }
891 
892           /* Otherwise, add the ID to the list */
893 
894           currentRowCount  = 0;
895           currentCharCount = 0;
896 
897           if (idList != NULL)
898             isFirstId = FALSE;
899 
900           (*idCount)++;
901           
902           currentId = (IdInfoPtr) MemNew (sizeof(IdInfo));
903           if (currentId == NULL)
904             return FALSE;
905           
906           currentId->sequence = NULL;
907           currentId->id       = newIdStr;
908           currentId->length   = 0;
909           currentId->next     = NULL;
910           
911           if (idList == NULL)
912             idList = currentId;
913           else
914             {
915               lastId = idList;
916               while (lastId->next != NULL)
917                 lastId = lastId->next;
918               lastId->next = currentId;
919             }
920         }
921 
922       /* Process sequence rows */
923 
924       if (currentRow->choice == ALI_SEQLINE)
925         {
926 
927           if (seqLinePtr->sequence != NULL)
928             if ((s_configInfo.useMaybes == TRUE) ||
929                 (s_configInfo.useMaybes == FALSE) &&
930                 (seqLinePtr->maybe == FALSE))
931               {
932                 /* There must be an ID before the first sequence */
933                 
934                 if (currentId == NULL)
935                   {
936                     isInterleaved = FALSE;
937                     break;
938                   }
939                 
940                 /* Look for sequences that probably */
941                 /* have no ID assigned to them.     */
942                 
943                 seqLinePtr = (SeqLineInfoPtr) currentRow->data.ptrvalue;
944                 if (isFirstId)
945                   {
946                     patternRowCount++;
947                     patternCharCount += StringLen (seqLinePtr->sequence);
948                   }
949                 else
950                   {
951                     currentRowCount++;
952                     currentCharCount += StringLen (seqLinePtr->sequence);
953                     if ((currentRowCount > patternRowCount) &&
954                         (currentCharCount > patternCharCount))
955                       {
956                         isInterleaved = TRUE;
957                         break;
958                       }
959                   }
960               }
961           
962         }
963 
964       /* Go to next row */
965 
966       currentRow = currentRow->next;
967     }
968 
969   /* Delete the ID records that we created */
970   /*  NOTE -- The ID strings themselves    */
971   /*          are stored elsewhere and     */
972   /*          only pointed to here, so     */
973   /*          DON"T delete them.      o     */
974 
975   while (idList != NULL)
976     {
977       lastId = idList;
978       idList = idList->next;
979       MemFree(lastId);
980     }
981 
982   /* Return result of search */
983 
984   return isInterleaved;
985 }
986 
987 /*=========================================================================*/
988 /*                                                                         */
989 /* s_ProcessInterId ()                                                     */
990 /*                                                                         */
991 /*=========================================================================*/
992 
993 static Boolean s_ProcessInterId (CharPtr          newIdStr,
994                                  PatternInfoPtr   pattern,
995                                  AlignFileDataPtr fileInfoPtr,
996                                  Boolean          isMaybe)
997 {
998   IdInfoPtr   lastId = NULL;
999   IdInfoPtr   existingId = NULL;
1000   ErrInfoPtr  errPtr;
1001 
1002   /* If we've got all our ID's then */
1003   /* ignore any further ones.       */
1004 
1005   if (pattern->gotAllIds == TRUE)
1006     return TRUE;
1007 
1008   /* All ID's, except for the first one, should */
1009   /* immediately follow a sequence line.        */
1010   
1011   if (pattern->isFirstId == FALSE)
1012     {
1013       if (pattern->lastRow->choice != ALI_SEQLINE)
1014         {
1015           errPtr = Ali_AddError (&(fileInfoPtr->errors),
1016                                  ERR_ID_NO_PRECEDING_SEQ,
1017                                  newIdStr);
1018           return FALSE;
1019         }
1020       else
1021         pattern->isFirstGroup = FALSE;
1022     }
1023 
1024   /* If this id already exists, */
1025   /* make it the current ID.    */
1026   
1027   existingId = fileInfoPtr->sequences;
1028   while (existingId != NULL)
1029     {
1030       if (StringCmp(existingId->id,newIdStr) == 0)
1031         break;
1032       existingId = existingId->next;
1033     }
1034   
1035   if (existingId != NULL)
1036     pattern->currentId = existingId;
1037   
1038   /* Otherwise create a new Id record */
1039   /* and add it to the end of list.   */
1040   
1041   else
1042     {
1043       pattern->currentId = (IdInfoPtr) MemNew (sizeof(IdInfo));
1044       if (pattern->currentId == NULL)
1045         {
1046           Ali_AddError (&(fileInfoPtr->errors), ERR_OUT_OF_MEMORY);
1047           return FALSE;
1048         }
1049       
1050       pattern->currentId->sequence = NULL;
1051       pattern->currentId->id       = newIdStr;
1052       pattern->currentId->length   = 0;
1053       pattern->currentId->next     = NULL;
1054       
1055       if (fileInfoPtr->sequences == NULL)
1056         fileInfoPtr->sequences = pattern->currentId;
1057       else
1058         {
1059           lastId = fileInfoPtr->sequences;
1060           while (lastId->next != NULL)
1061             lastId = lastId->next;
1062           lastId->next = pattern->currentId;
1063         }
1064       
1065       pattern->currentIdCount++;
1066       if (pattern->currentIdCount == pattern->idCount)
1067         pattern->gotAllIds = TRUE;
1068     }
1069   
1070   if (pattern->isFirstId)
1071     pattern->isFirstId = FALSE;
1072 
1073   /* Return successfully */
1074 
1075   return TRUE;
1076 }
1077 
1078 /*=========================================================================*/
1079 /*                                                                         */
1080 /* s_ProcessInterSeq ()                                                    */
1081 /*                                                                         */
1082 /*=========================================================================*/
1083 
1084 static Boolean s_ProcessInterSeq (CharPtr          newSeqStr,
1085                                   PatternInfoPtr   pattern,
1086                                   AlignFileDataPtr fileInfoPtr,
1087                                   Boolean          isMaybe)
1088 {
1089   SeqPartPtr  newSeqPart = NULL;
1090   ErrInfoPtr  errPtr = NULL;
1091 
1092   /* There must be an ID before the first sequence */
1093   
1094   if (pattern->currentId == NULL)
1095     {
1096       errPtr = Ali_AddError (&(fileInfoPtr->errors), ERR_SEQ_WITHOUT_ID,
1097                              newSeqStr);
1098       return FALSE;
1099     }
1100   
1101   /* Add the sequence to the current ID */
1102 
1103   newSeqPart = (SeqPartPtr) MemNew(sizeof(SeqPart));
1104   if (newSeqPart == NULL)
1105     {
1106       Ali_AddError (&(fileInfoPtr->errors), ERR_OUT_OF_MEMORY);
1107       return FALSE;
1108     }
1109   
1110   newSeqPart->sequence = newSeqStr;
1111   newSeqPart->next     = NULL;
1112   
1113   if (pattern->currentId->sequence == NULL)
1114     pattern->currentId->sequence = newSeqPart;
1115   else
1116     pattern->lastSeqPart->next = newSeqPart;
1117   
1118   pattern->currentId->length += StringLen (newSeqPart->sequence);
1119   pattern->lastSeqPart = newSeqPart;
1120   
1121   /* If we've started repeating IDs then */
1122   /* rotate through the id list.         */
1123   
1124   if (pattern->gotAllIds == TRUE)
1125     {
1126       if (pattern->currentId->next == NULL)
1127         pattern->currentId = fileInfoPtr->sequences;
1128       else
1129         pattern->currentId = pattern->currentId->next;
1130 
1131       pattern->lastSeqPart = pattern->currentId->sequence;
1132       while (pattern->lastSeqPart->next != NULL)
1133         pattern->lastSeqPart = pattern->lastSeqPart->next;
1134     }
1135 
1136   /* Return successfully */
1137   
1138   return TRUE;
1139 }
1140 
1141 /*=========================================================================*/
1142 /*                                                                         */
1143 /* s_AnalyzeInterleaved ()                                                 */
1144 /*                                                                         */
1145 /*=========================================================================*/
1146 
1147 static Boolean s_AnalyzeInterleaved (ValNodePtr       rowList,
1148                                      AlignFileDataPtr fileInfoPtr,
1149                                      Int2             idCount)
1150 {
1151   ValNodePtr       currentRow;
1152   Boolean          isValidPattern;
1153   IdInfoPtr        currentId = NULL;
1154   Int4             previousLength;
1155   ErrInfoPtr       errPtr;
1156   PatternInfoPtr   pattern;
1157   SeqLineInfoPtr   seqLinePtr;
1158   DefLineInfoPtr   defLinePtr;
1159   OtherLineInfoPtr otherLinePtr;
1160   Boolean          firstDefline = TRUE;
1161   IdInfoPtr        lastId = NULL;
1162 
1163   pattern = (PatternInfoPtr) MemNew (sizeof (PatternInfo));
1164 
1165   pattern->currentDeflineId = NULL;
1166   pattern->lastRow          = NULL;
1167   pattern->isFirstId        = TRUE;
1168   pattern->isFirstGroup     = TRUE;
1169   pattern->maybesFound      = FALSE;
1170   pattern->gotAllIds        = FALSE;
1171   pattern->idCount          = idCount;
1172   pattern->currentIdCount   = 0;
1173 
1174   pattern->foundInfo.dataType      = ALI_UNKNOWN;
1175   pattern->foundInfo.contigOrInter = ALI_UNKNOWN;
1176   pattern->foundInfo.idCount       = 0;
1177   pattern->foundInfo.seqLength     = 0;
1178 
1179   /* Match the sequences up with the IDs */
1180 
1181   currentRow    = rowList;
1182   isValidPattern  = TRUE;
1183 
1184   while (currentRow != NULL)
1185     {
1186 
1187       if (currentRow->choice == ALI_SEQLINE)
1188         {
1189   
1190           seqLinePtr = (SeqLineInfoPtr) currentRow->data.ptrvalue;
1191 
1192           if ((seqLinePtr->maybe == FALSE) ||
1193               (seqLinePtr->maybe == TRUE) && (s_configInfo.useMaybes == TRUE))
1194             {
1195               if (seqLinePtr->id != NULL)
1196                 {
1197                   /* Process the ID */
1198   
1199                   isValidPattern = s_ProcessInterId (seqLinePtr->id,
1200                                                      pattern,
1201                                                      fileInfoPtr,
1202                                                      seqLinePtr->maybe);
1203                   if (isValidPattern == FALSE)
1204                     break;
1205                 }
1206 
1207               if (seqLinePtr->sequence != NULL)
1208                 {
1209                   isValidPattern = s_ProcessInterSeq (seqLinePtr->sequence,
1210                                                       pattern,
1211                                                       fileInfoPtr,
1212                                                       seqLinePtr->maybe);
1213                   if (isValidPattern == FALSE)
1214                     break;
1215                 }
1216               pattern->lastRow = currentRow;
1217             }
1218           else
1219             pattern->maybesFound = TRUE;
1220         }
1221       else if (currentRow->choice == ALI_DEFLINE)
1222         {
1223           defLinePtr = (DefLineInfoPtr) currentRow->data.ptrvalue;
1224           if (defLinePtr->id != NULL)
1225             {
1226               isValidPattern = s_ProcessInterId (defLinePtr->id,
1227                                                  pattern,
1228                                                  fileInfoPtr,
1229                                                  FALSE);
1230               if (isValidPattern == FALSE)
1231                 break;
1232             }
1233           if (defLinePtr->definitions != NULL)
1234             {
1235               if (firstDefline)
1236                 {
1237                   firstDefline = FALSE;
1238                   pattern->currentDeflineId = fileInfoPtr->sequences;
1239                 }
1240               else
1241                 pattern->currentDeflineId =
1242                   pattern->currentDeflineId->next;
1243               
1244               if (pattern->currentDeflineId == NULL)
1245                 {
1246                   errPtr = Ali_AddError (&(fileInfoPtr->errors),
1247                                          ERR_DEFLINE_WITH_NO_ID,
1248                                          defLinePtr->definitions);
1249                   errPtr->rowNum = defLinePtr->rowNum;
1250                   isValidPattern = FALSE;
1251                   break;
1252                 }
1253               else
1254                 {
1255                   pattern->currentDeflineId->defline =
1256                     defLinePtr->definitions;
1257                 }
1258             }
1259           pattern->lastRow = currentRow;
1260         }
1261       else if (currentRow->choice == ALI_OTHERLINE)
1262         {
1263           otherLinePtr = (OtherLineInfoPtr) currentRow->data.ptrvalue;
1264           if (otherLinePtr->id != NULL)
1265             {
1266               isValidPattern = s_ProcessInterId (otherLinePtr->id,
1267                                                  pattern,
1268                                                  fileInfoPtr,
1269                                                  FALSE);
1270               if (isValidPattern == FALSE)
1271                 break;
1272             }
1273           pattern->lastRow = currentRow;
1274         }
1275 
1276       currentRow = currentRow->next;
1277     }
1278 
1279   /* If we found one defline, then */
1280   /* make sure they were all there */
1281 
1282   if (firstDefline == FALSE)
1283     {
1284       lastId = fileInfoPtr->sequences;
1285       if (lastId != NULL)
1286         {
1287           while (lastId->next != NULL)
1288             lastId = lastId->next;
1289           if (lastId->defline == NULL)
1290             {
1291               errPtr = Ali_AddError (&(fileInfoPtr->errors),
1292                                      ERR_ID_WITH_NO_DEFLINE,
1293                                      lastId->id);
1294               isValidPattern = FALSE;
1295             }
1296         }
1297     }
1298 
1299   /* If pattern not found, return failure */
1300 
1301   if (!isValidPattern)
1302     return FALSE;
1303 
1304   /* If there was a declared number of sequences then */
1305   /* check to see that it matches the number found.   */
1306 
1307   if ((s_configInfo.declaredInfo.idCount !=0) &&
1308       (s_configInfo.declaredInfo.idCount != idCount))
1309     {
1310       errPtr = Ali_AddError (&(fileInfoPtr->errors), ERR_ID_COUNT_MISMATCH,
1311                              idCount, s_configInfo.declaredInfo.idCount);
1312       errPtr->level = LEVEL_WARNING;
1313     }
1314 
1315   /* Sequences should all be the same length. */
1316 
1317   currentId = fileInfoPtr->sequences;
1318   pattern->isFirstId = TRUE;
1319 
1320   while (currentId != NULL)
1321     {
1322       if (pattern->isFirstId)
1323         pattern->isFirstId = FALSE;
1324       else
1325         {
1326           if (previousLength < currentId->length)
1327             {
1328               errPtr = Ali_AddError (&(fileInfoPtr->errors),
1329                                      ERR_SEQUENCE_TOO_LONG,
1330                                      currentId->id,
1331                                      previousLength,
1332                                      currentId->length);
1333               break;
1334             }
1335           else if (previousLength > currentId->length)
1336             {
1337               errPtr = Ali_AddError (&(fileInfoPtr->errors),
1338                                      ERR_SEQUENCE_TOO_SHORT,
1339                                      currentId->id,
1340                                      previousLength,
1341                                      currentId->length);
1342               break;
1343             }
1344         }
1345       previousLength = currentId->length;
1346       currentId = currentId->next;
1347     }
1348 
1349   /* Check to see that declared sequence */
1350   /* length matches the lengths found.   */
1351 
1352   if ((s_configInfo.declaredInfo.seqLength != 0) &&
1353       (s_configInfo.declaredInfo.seqLength != previousLength))
1354     {
1355       errPtr = Ali_AddError (&(fileInfoPtr->errors),ERR_SEQ_LENGTH_MISMATCH,
1356                              previousLength,
1357                              s_configInfo.declaredInfo.seqLength);
1358       errPtr->level = LEVEL_WARNING;
1359     }
1360 
1361   /* Process the maybes if they weren't used already */
1362 
1363   if (pattern->maybesFound == TRUE)
1364     fileInfoPtr->maybes = s_ProcessMaybes (rowList);
1365 
1366   /* Return successfully */
1367 
1368   if (currentId == NULL)
1369     return TRUE;
1370   else
1371     return FALSE;
1372 } 
1373 
1374 /*=========================================================================*/
1375 /*                                                                         */
1376 /* s_ProcessContigId ()                                                    */
1377 /*                                                                         */
1378 /*=========================================================================*/
1379 
1380 static Boolean s_ProcessContigId (CharPtr          newIdStr,
1381                                   PatternInfoPtr   pattern,
1382                                   AlignFileDataPtr fileInfoPtr)
1383 {
1384   IdInfoPtr      existingId = NULL;
1385   ErrInfoPtr     errPtr;
1386   IdInfoPtr      lastId = NULL;
1387 
1388   if (pattern->isFirstId == FALSE)
1389     {
1390       pattern->isFirstGroup = FALSE;
1391 
1392       /* The length of the last pattern must match */
1393       /* the length of previous ones.              */
1394   
1395       if (pattern->currentId->length < pattern->foundInfo.seqLength)
1396         {
1397           errPtr = Ali_AddError (&(fileInfoPtr->errors),
1398                                  ERR_SEQUENCE_TOO_SHORT,
1399                                  pattern->currentId->id,
1400                                  pattern->foundInfo.seqLength,
1401                                  pattern->currentId->length);
1402           return FALSE;
1403         }
1404     }  
1405   
1406   pattern->hasFullLength = FALSE;
1407 
1408   /* See if this ID already exists */
1409   
1410   existingId = fileInfoPtr->sequences;
1411   while (existingId != NULL)
1412     {
1413       if (StringCmp(existingId->id,newIdStr) == 0)
1414         {
1415           errPtr = Ali_AddError (&(fileInfoPtr->errors), ERR_DUPLICATE_IDS, 
1416                                  newIdStr);
1417           return FALSE;
1418         }
1419       existingId = existingId->next;
1420     }
1421   
1422   /* If this id already exists, */
1423   /* make it the current ID.    */
1424   
1425   if (existingId != NULL)
1426     pattern->currentId = existingId;
1427   
1428   /* Otherwise create a new Id record */
1429   /* and add it to the end of list.   */
1430   
1431   else
1432     {
1433       pattern->currentId = (IdInfoPtr) MemNew (sizeof(IdInfo));
1434       if (pattern->currentId == NULL)
1435         {
1436           Ali_AddError (&(fileInfoPtr->errors), ERR_OUT_OF_MEMORY);
1437           return FALSE;
1438         }
1439       
1440       pattern->currentId->sequence = NULL;
1441       pattern->currentId->id       = newIdStr;
1442       pattern->currentId->length   = 0;
1443       pattern->currentId->next     = NULL;
1444       
1445       if (fileInfoPtr->sequences == NULL)
1446         fileInfoPtr->sequences = pattern->currentId;
1447       else
1448         {
1449           lastId = fileInfoPtr->sequences;
1450           while (lastId->next != NULL)
1451             lastId = lastId->next;
1452           lastId->next = pattern->currentId;
1453         }
1454       pattern->foundInfo.idCount++;
1455     }
1456   
1457   if (pattern->isFirstId)
1458     pattern->isFirstId = FALSE;
1459 
1460   /* Return successfully */
1461 
1462   return TRUE;
1463 }
1464 
1465 /*=========================================================================*/
1466 /*                                                                         */
1467 /* s_ProcessContigSeq ()                                                   */
1468 /*                                                                         */
1469 /*=========================================================================*/
1470 
1471 static Boolean s_ProcessContigSeq (CharPtr          newSeqStr,
1472                                    PatternInfoPtr   pattern,
1473                                    AlignFileDataPtr fileInfoPtr)
1474 {
1475   SeqPartPtr     newSeqPart = NULL;
1476   ErrInfoPtr     errPtr;
1477 
1478   /* There must be an ID before we get a sequence */
1479 
1480   if (pattern->currentId == NULL)
1481     {
1482       errPtr = Ali_AddError (&(fileInfoPtr->errors), ERR_SEQ_WITHOUT_ID,
1483                              newSeqStr);
1484       return FALSE;
1485     }
1486   
1487   /* Add the sequence to the current ID */
1488   
1489   newSeqPart = (SeqPartPtr) MemNew(sizeof(SeqPart));
1490   if (newSeqPart == NULL)
1491     {
1492       Ali_AddError (&(fileInfoPtr->errors), ERR_OUT_OF_MEMORY);
1493       return FALSE;
1494     }
1495   
1496   newSeqPart->sequence = newSeqStr;
1497   newSeqPart->next     = NULL;
1498   
1499   if (pattern->currentId->sequence == NULL)
1500     pattern->currentId->sequence = newSeqPart;
1501   else
1502     pattern->lastSeqPart->next = newSeqPart;
1503 
1504   /* Make sure that sequence length hasn't */
1505   /* exceeded that of previous sequences.  */
1506   
1507   pattern->currentId->length += StringLen (newSeqPart->sequence);
1508   pattern->lastSeqPart = newSeqPart;
1509   
1510   if (pattern->isFirstGroup)
1511     {
1512       pattern->foundInfo.seqLength += StringLen (newSeqPart->sequence);
1513     }
1514   else
1515     {
1516 
1517       if (pattern->currentId->length == pattern->foundInfo.seqLength)
1518         pattern->hasFullLength = TRUE;
1519       else if (pattern->currentId->length > pattern->foundInfo.seqLength)
1520         {
1521           errPtr = Ali_AddError (&(fileInfoPtr->errors),
1522                                  ERR_SEQUENCE_TOO_LONG,
1523                                  pattern->currentId->id,
1524                                  pattern->foundInfo.seqLength,
1525                                  pattern->currentId->length);
1526           return FALSE;
1527         }
1528     }
1529 
1530   /* Return successfully */
1531 
1532   return TRUE;
1533 }
1534 
1535 /*=========================================================================*/
1536 /*                                                                         */
1537 /* s_AnalyzeContiguous ()                                                  */
1538 /*                                                                         */
1539 /*=========================================================================*/
1540 
1541 static Boolean s_AnalyzeContiguous (ValNodePtr       rowList,
1542                                     AlignFileDataPtr fileInfoPtr)
1543 {
1544   ValNodePtr       currentRow;
1545   SeqLineInfoPtr   seqLinePtr;
1546   DefLineInfoPtr   defLinePtr;
1547   OtherLineInfoPtr otherLinePtr;
1548   Boolean          isValidPattern;
1549   IdInfoPtr        lastId = NULL;
1550   IdInfoPtr        nextToLastId = NULL;
1551   ErrInfoPtr       errPtr;
1552   PatternInfoPtr   pattern;
1553   Boolean          firstDefline = TRUE;
1554 
1555   /* Initialize the pattern info */
1556 
1557   pattern = (PatternInfoPtr) MemNew (sizeof (PatternInfo));
1558 
1559   pattern->currentDeflineId = NULL;
1560   pattern->currentId        = NULL;
1561   pattern->lastSeqPart      = NULL;
1562   pattern->hasFullLength    = FALSE;
1563   pattern->isFirstId        = TRUE;
1564   pattern->isFirstGroup     = TRUE;
1565   pattern->maybesFound      = FALSE;
1566 
1567   pattern->foundInfo.dataType      = ALI_UNKNOWN;
1568   pattern->foundInfo.contigOrInter = ALI_UNKNOWN;
1569   pattern->foundInfo.idCount       = 0;
1570   pattern->foundInfo.seqLength     = 0;
1571 
1572   /* Match the sequences up with the IDS */
1573 
1574   currentRow    = rowList;
1575   isValidPattern  = TRUE;
1576 
1577   while (currentRow != NULL)
1578     {
1579 
1580       /* Process sequence lines */
1581 
1582       if (currentRow->choice == ALI_SEQLINE)
1583         {
1584 
1585           seqLinePtr = (SeqLineInfoPtr) currentRow->data.ptrvalue;
1586 
1587           /* If we already have a sequence equal in */
1588           /* in length to those that came before,   */
1589           /* then this line may actually be an      */
1590           /* ID.                                    */
1591              
1592 
1593           if ((pattern->hasFullLength == TRUE) &&
1594               (seqLinePtr->id == NULL))
1595             {
1596               Ali_ChangeRowToOther (currentRow);
1597               continue;
1598             }
1599 
1600           /* Process the line as a sequence */
1601 
1602           if ((seqLinePtr->maybe == FALSE) ||
1603               (seqLinePtr->maybe == TRUE) && (s_configInfo.useMaybes == TRUE))
1604             {
1605               if (seqLinePtr->id != NULL)
1606                 {
1607                   isValidPattern = s_ProcessContigId (seqLinePtr->id,
1608                                                       pattern,
1609                                                       fileInfoPtr);
1610                   if (isValidPattern == FALSE)
1611                     break;
1612                 }
1613               
1614               if (seqLinePtr->sequence != NULL)
1615                 {
1616                   isValidPattern = s_ProcessContigSeq (seqLinePtr->sequence,
1617                                                        pattern,
1618                                                        fileInfoPtr);
1619                   if (isValidPattern == FALSE)
1620                     break;
1621                 }
1622               pattern->lastRow = currentRow;
1623             }
1624           else
1625             pattern->maybesFound = TRUE;
1626         }
1627 
1628       /* Process Definition lines */
1629 
1630       else if (currentRow->choice == ALI_DEFLINE)
1631         {
1632           defLinePtr = (DefLineInfoPtr) currentRow->data.ptrvalue;
1633           if (defLinePtr->id != NULL)
1634             {
1635               isValidPattern = s_ProcessContigId (defLinePtr->id,
1636                                                   pattern,
1637                                                   fileInfoPtr);
1638               if (isValidPattern == FALSE)
1639                 break;
1640             }
1641 
1642           if (defLinePtr->definitions != NULL)
1643             {
1644               if (firstDefline)
1645                 {
1646                   firstDefline = FALSE;
1647                   pattern->currentDeflineId = fileInfoPtr->sequences;
1648                 }
1649               else
1650                 pattern->currentDeflineId =
1651                   pattern->currentDeflineId->next;
1652               
1653               if (pattern->currentDeflineId == NULL)
1654                 {
1655                   errPtr = Ali_AddError (&(fileInfoPtr->errors),
1656                                          ERR_DEFLINE_WITH_NO_ID,
1657                                          defLinePtr->definitions);
1658                   errPtr->rowNum = defLinePtr->rowNum;
1659                   isValidPattern = FALSE;
1660                   break;
1661                 }
1662               else
1663                 {
1664                   pattern->currentDeflineId->defline =
1665                     defLinePtr->definitions;
1666                 }
1667             }
1668           pattern->lastRow = currentRow;
1669         }
1670 
1671       /* Process Other lines */
1672 
1673       else if (currentRow->choice == ALI_OTHERLINE)
1674         {
1675           otherLinePtr = (OtherLineInfoPtr) currentRow->data.ptrvalue;
1676           if (otherLinePtr->id != NULL)
1677             {
1678               isValidPattern = s_ProcessContigId (otherLinePtr->id,
1679                                                   pattern,
1680                                                   fileInfoPtr);
1681               if (isValidPattern == FALSE)
1682                 break;
1683             }
1684           pattern->lastRow = currentRow;
1685         }
1686 
1687       currentRow = currentRow->next;
1688     }
1689 
1690   /* If the last sequence is too short, mark */
1691   /* it as a maybe.                          */
1692   
1693   if (pattern->lastRow->choice == ALI_SEQLINE)
1694     {
1695       if (s_configInfo.useMaybes == FALSE)
1696         {
1697           pattern->maybesFound = TRUE;
1698           if (pattern->currentId->length < pattern->foundInfo.seqLength)
1699             {
1700               seqLinePtr = (SeqLineInfoPtr)pattern->lastRow->data.ptrvalue;
1701               seqLinePtr->maybe = TRUE;
1702               nextToLastId = NULL;
1703               lastId = fileInfoPtr->sequences;
1704               while (lastId->next != NULL)
1705                 {
1706                   nextToLastId = lastId;
1707                   lastId = lastId->next;
1708                 }
1709               MemFree(lastId);
1710               if (nextToLastId == NULL)
1711                 fileInfoPtr->sequences = NULL;
1712               else
1713                 nextToLastId->next = NULL;
1714             }
1715         }
1716       else
1717         {
1718           if (pattern->currentId->length < pattern->foundInfo.seqLength)
1719             {
1720               errPtr = Ali_AddError (&(fileInfoPtr->errors),
1721                                      ERR_SEQUENCE_TOO_SHORT,
1722                                      pattern->currentId->id,
1723                                      pattern->foundInfo.seqLength,
1724                                      pattern->currentId->length);
1725               isValidPattern = FALSE;
1726             }
1727         }
1728     }
1729 
1730   /* If we found one defline, then */
1731   /* make sure they were all there */
1732 
1733   if (firstDefline == FALSE)
1734     {
1735       lastId = fileInfoPtr->sequences;
1736       if (lastId != NULL)
1737         {
1738           while (lastId->next != NULL)
1739             lastId = lastId->next;
1740           if (lastId->defline == NULL)
1741             {
1742               errPtr = Ali_AddError (&(fileInfoPtr->errors),
1743                                      ERR_ID_WITH_NO_DEFLINE,
1744                                      lastId->id);
1745               isValidPattern = FALSE;
1746             }
1747         }
1748     }
1749 
1750   /* If pattern not found, return failure */
1751 
1752   if (!isValidPattern)
1753     {
1754       MemFree (pattern);
1755       return FALSE;
1756     }
1757 
1758   /* Check for inconsistant declarations ... */
1759 
1760   /* ... of file type */
1761 
1762   if (s_configInfo.declaredInfo.contigOrInter == ALI_INTERLEAVED)
1763     {
1764       errPtr = Ali_AddError (&(fileInfoPtr->errors), ERR_NOT_INTERLEAVED);
1765       errPtr->level = LEVEL_WARNING;
1766     }
1767 
1768   /* ... of number of sequences */
1769 
1770   if ((s_configInfo.declaredInfo.idCount != 0) &&
1771       (s_configInfo.declaredInfo.idCount != pattern->foundInfo.idCount))
1772     {
1773       errPtr = Ali_AddError (&(fileInfoPtr->errors), ERR_ID_COUNT_MISMATCH,
1774                              pattern->foundInfo.idCount,
1775                              s_configInfo.declaredInfo.idCount);
1776       errPtr->level = LEVEL_WARNING;
1777     }
1778 
1779   /* ... of sequence length */
1780 
1781   if ((s_configInfo.declaredInfo.seqLength != 0) &&
1782       (s_configInfo.declaredInfo.seqLength != pattern->foundInfo.seqLength))
1783     {
1784       errPtr = Ali_AddError (&(fileInfoPtr->errors),ERR_SEQ_LENGTH_MISMATCH,
1785                              pattern->foundInfo.seqLength,
1786                              s_configInfo.declaredInfo.seqLength);
1787       errPtr->level = LEVEL_WARNING;
1788     }
1789 
1790   /* If we have some possibly bad sequences that */
1791   /* weren't used, process them seperately.      */
1792 
1793   if (pattern->maybesFound == TRUE)
1794     fileInfoPtr->maybes = s_ProcessMaybes (rowList);
1795 
1796   /* Clean up and return successfully */
1797 
1798   if (pattern->currentId != NULL)
1799     {
1800       MemFree (pattern);
1801       return FALSE;
1802     }
1803   else
1804     {
1805       MemFree (pattern);
1806       return TRUE;
1807     }
1808 }
1809 
1810 /*=========================================================================*/
1811 /*                                                                         */
1812 /* Ali_AddError ()                                                         */
1813 /*                                                                         */
1814 /*=========================================================================*/
1815 
1816 ErrInfoPtr Ali_AddError (ErrInfoPtr PNTR errorListPtr,
1817                          Int4            iError,
1818                          ...)
1819 {
1820   ErrInfoPtr newError;
1821   ErrInfoPtr lastError;
1822   va_list    argPtr;
1823   CharPtr    seqId;
1824   CharPtr    seqStr;
1825   Int4       seqLength;
1826   Int4       prevSeqLength;
1827   CharPtr    defLineStr;
1828   Int4       foundCount;
1829   Int4       declaredCount;
1830   Int4       foundLen;
1831   Int4       declaredLen;
1832   Int4       sequenceCount;
1833   Int4       errorCount;
1834   Int4       invalidChar;
1835 
1836   static Int4 count = 0;
1837 
1838   count++;
1839 
1840   /* Create a new error record */
1841 
1842   newError = (ErrInfoPtr) MemNew (sizeof(ErrInfo));
1843   newError->errNum    = iError;
1844   newError->level     = LEVEL_ERROR;
1845   newError->rowNum    = 0;
1846   newError->extraInfo = NULL;
1847   newError->next      = NULL;
1848 
1849   /* Build the error message text */
1850 
1851   va_start (argPtr, iError);
1852 
1853   switch (iError)
1854     {
1855     case ERR_ID_WITHOUT_SEQ :
1856       seqId = va_arg (argPtr, CharPtr);
1857       newError->info = (CharPtr) MemNew (strlen (seqId) + 80);
1858       sprintf (newError->info, "Unable to match ID %s to any sequence", seqId);
1859       break;
1860     case ERR_SEQ_WITHOUT_ID :
1861       seqStr = va_arg (argPtr, CharPtr);
1862       newError->info = (CharPtr) MemNew (strlen (seqStr) + 80);
1863       sprintf (newError->info, "There is no ID for the sequence:\n%s", seqStr);
1864       break;
1865     case ERR_DUPLICATE_IDS :
1866       seqId = va_arg (argPtr, CharPtr);
1867       newError->info = (CharPtr) MemNew (strlen (seqId) + 80);
1868       sprintf (newError->info, "Duplicate ID: %s is used more than once",
1869                seqId);
1870       break;
1871     case ERR_SEQUENCE_TOO_SHORT :
1872       seqId = va_arg (argPtr, CharPtr);
1873       prevSeqLength = va_arg (argPtr, Int4);
1874       seqLength     = va_arg (argPtr, Int4);
1875       newError->info = (CharPtr) MemNew (strlen (seqId) + 256);
1876       sprintf (newError->info,
1877                "Sequence %s is shorter (%d characters) than the preceding"
1878                " sequences (%d characters)", seqId, seqLength, prevSeqLength);
1879       break;
1880     case ERR_SEQUENCE_TOO_LONG :
1881       seqId         = va_arg (argPtr, CharPtr);
1882       prevSeqLength = va_arg (argPtr, Int4);
1883       seqLength     = va_arg (argPtr, Int4);
1884       newError->info = (CharPtr) MemNew (strlen (seqId) + 256);
1885       sprintf (newError->info,
1886                "Sequence %s is longer (%d characters) than the preceding"
1887                " sequences (%d characters)", seqId, seqLength, prevSeqLength);
1888       break;
1889     case ERR_OUT_OF_MEMORY :
1890       newError->info = (CharPtr) MemNew (80);
1891       sprintf (newError->info, "Out of memory -- memory allocation failed");
1892       break;
1893     case ERR_ID_NO_PRECEDING_SEQ :
1894       seqId = va_arg (argPtr, CharPtr);
1895       newError->info = (CharPtr) MemNew (strlen (seqId) + 100);
1896       sprintf (newError->info,
1897                "ID %s is probably invalid -- it is not immediately"
1898                " preceded by a sequence", seqId);
1899       break;
1900     case ERR_NOT_INTERLEAVED :
1901       newError->info = (CharPtr) MemNew (80);
1902       sprintf (newError->info, "File is declared to be interleaved,"
1903                " but is contiguous");
1904       break;
1905     case ERR_NOT_CONTIGUOUS :
1906       newError->info = (CharPtr) MemNew (80);
1907       sprintf (newError->info, "File is declared to be contiguous,"
1908                " but is interleaved");
1909       break;
1910     case ERR_NO_SEQUENCES_FOUND :
1911       newError->info = (CharPtr) MemNew (80);
1912       sprintf (newError->info, "No sequences were found in the file");
1913       break;
1914     case ERR_ID_COUNT_MISMATCH :
1915       foundCount = va_arg (argPtr, Int4);
1916       declaredCount = va_arg (argPtr, Int4);
1917       newError->info = (CharPtr) MemNew (128);
1918       sprintf (newError->info, "The number of sequences found (%d) doesn't"
1919                " match the number declared (%d)", foundCount, declaredCount);
1920       break;
1921     case ERR_SEQ_LENGTH_MISMATCH :
1922       foundLen = va_arg (argPtr, Int4);
1923       declaredLen = va_arg (argPtr, Int4);
1924       newError->info = (CharPtr) MemNew (128);
1925       sprintf (newError->info, "The length (%d) of the sequences found doesn't"
1926                " match the declared length (%d)", foundLen, declaredLen);
1927       break;
1928     case ERR_DEFLINE_WITH_NO_ID :
1929       defLineStr = va_arg (argPtr, CharPtr);
1930       newError->info = (CharPtr) MemNew (strlen (defLineStr) + 100);
1931       sprintf (newError->info, "Unable to match the following definition"
1932                " line to any sequence :\n%s", defLineStr);
1933       break;
1934     case ERR_ID_WITH_NO_DEFLINE :
1935       seqId = va_arg (argPtr, CharPtr);
1936       newError->info = (CharPtr) MemNew (strlen (seqId) + 80);
1937       sprintf (newError->info, "Could not find a defline for the following"
1938                " sequence :\n%s", seqId);
1939       break;
1940     case ERR_INVALID_DEFLINE :
1941       defLineStr = va_arg (argPtr, CharPtr);
1942       invalidChar = va_arg (argPtr, Int4);
1943       newError->info = (CharPtr) MemNew (strlen (defLineStr) + 100);
1944       sprintf (newError->info, "Invalid definitions line (illegal char '%c'):\n%s",
1945                (Char) invalidChar, defLineStr);
1946       break;
1947     case ERR_DEFLINE_NODEFS :
1948       defLineStr = va_arg (argPtr, CharPtr);
1949       newError->info = (CharPtr) MemNew (strlen (defLineStr) + 100);
1950       sprintf (newError->info, "There is no source info enclosed by"
1951                " brackets on the definition line :\n%s", defLineStr);
1952       break;
1953     case ERR_GLOBAL_DEFLINE_NODEFS :
1954       sequenceCount = va_arg (argPtr, Int4);
1955       newError->info = (CharPtr) MemNew (128);
1956       sprintf (newError->info, "All %d of the file's definition lines are"
1957                " missing source info enclosed in [] brackets", sequenceCount);
1958       newError->level = LEVEL_MULTI;
1959       break;
1960     case ERR_MULTI_DEFLINE_NODEFS :
1961       errorCount = va_arg (argPtr, Int4);
1962       newError->info = (CharPtr) MemNew (128);
1963       sprintf (newError->info, "%d of the file's definition lines are"
1964                " missing source info enclosed in [] brackets", errorCount);
1965       newError->level = LEVEL_MULTI;
1966       break;
1967     default:
1968       newError->info = (CharPtr) MemNew (32);
1969       sprintf (newError->info, "Unknown Error");
1970       break;
1971     }
1972 
1973   va_end (argPtr);
1974 
1975   /* Add it to the end of the linked list */
1976 
1977   if (*errorListPtr == NULL)
1978     *errorListPtr = newError;
1979   else
1980     {
1981       lastError = *errorListPtr;
1982       while (lastError->next != NULL)
1983         lastError = lastError->next;
1984       lastError->next = newError;
1985     }
1986 
1987   /* Return a pointer to new record for easy access */
1988 
1989   return newError;
1990 }
1991 
1992 /*=========================================================================*/
1993 /*                                                                         */
1994 /* s_AnalyzeContents () -                                                  */
1995 /*                                                                         */
1996 /*=========================================================================*/
1997 
1998 static Boolean s_AnalyzeContents (ValNodePtr       rowList,
1999                                   AlignFileDataPtr fileInfoPtr)
2000 {
2001   Int2    idCount;
2002   Boolean result;
2003 
2004   if (s_IsInterleaved (rowList, &idCount))
2005     {
2006       fileInfoPtr->info->contigOrInter = ALI_INTERLEAVED;
2007       result = s_AnalyzeInterleaved (rowList, fileInfoPtr, idCount);
2008     }
2009   else 
2010     {
2011       fileInfoPtr->info->contigOrInter = ALI_CONTIGUOUS;
2012       result = s_AnalyzeContiguous (rowList, fileInfoPtr);
2013     }
2014 
2015   return result;
2016 }
2017 
2018 /*=========================================================================*/
2019 /*                                                                         */
2020 /* SeqLineReEval () - Re-evaluate a line after forcing the first 'word'    */
2021 /*                    to be an ID.                                         */
2022 /*                                                                         */
2023 /*=========================================================================*/
2024 
2025 SeqLineInfoPtr SeqLineReEval (SeqLineInfoPtr seqLinePtr)
2026 {
2027   CharPtr        seqStr;
2028   CharPtr        idStr;
2029   CharPtr        oldStr;
2030   SeqLineInfoPtr newSeqLinePtr;
2031 
2032   /* If the line is already split up, */
2033   /* then this won't work.            */
2034 
2035   if ((seqLinePtr->sequence != NULL) && (seqLinePtr->id != NULL))
2036     return NULL;
2037 
2038   /* Determine the string that we're splitting up */
2039 
2040   if (seqLinePtr->sequence != NULL)
2041     oldStr = seqLinePtr->sequence;
2042   else if (seqLinePtr->id != NULL)
2043     oldStr = seqLinePtr->id;
2044   else
2045     return NULL;
2046 
2047   /* If there's only one 'word' then */
2048   /* we can't split it.              */
2049 
2050   if (StringLen (oldStr) == seqLinePtr->firstWordLen)
2051     return NULL;
2052 
2053   /* Allocate mem for the new strings */
2054 
2055   seqStr = (CharPtr) MemNew (StringLen (oldStr) -
2056                              seqLinePtr->firstWordLen + 1);
2057   if (seqStr == NULL)
2058     return NULL;
2059   idStr = (CharPtr) MemNew (seqLinePtr->firstWordLen + 1);
2060   if (idStr == NULL)
2061     {
2062       MemFree (seqStr);
2063       return NULL;
2064     }
2065 
2066   /* Break up the existing string */
2067 
2068   StringNCpy(idStr, oldStr, seqLinePtr->firstWordLen);
2069   idStr[seqLinePtr->firstWordLen] = '\0';
2070   StringCpy(seqStr, oldStr + seqLinePtr->firstWordLen);
2071 
2072   /* Return successfully */
2073 
2074   newSeqLinePtr = (SeqLineInfoPtr) MemNew (sizeof (SeqLineInfo));
2075   newSeqLinePtr->sequence     = seqStr;
2076   newSeqLinePtr->id           = idStr;
2077   newSeqLinePtr->rowNum       = seqLinePtr->rowNum;
2078   newSeqLinePtr->type         = Ali_SeqLineGetType(seqStr, &s_configInfo);
2079   newSeqLinePtr->maybe        = seqLinePtr->maybe;
2080   newSeqLinePtr->firstWordLen = seqLinePtr->firstWordLen;
2081 
2082   return newSeqLinePtr;
2083 }
2084 
2085 /*=========================================================================*/
2086 /*                                                                         */
2087 /* s_IsExistingId () -- Determine if the given ID is one that has already  */
2088 /*                      been added to the linked list of IDs.              */
2089 /*                                                                         */
2090 /*=========================================================================*/
2091 
2092 static Boolean s_IsExistingId (AlignFileDataPtr fileInfoPtr,
2093                        CharPtr          testIdStr)
2094 {
2095   IdInfoPtr  idListPtr = NULL;
2096 
2097   /* See if this ID already exists */
2098   
2099   idListPtr = fileInfoPtr->sequences;
2100   while (idListPtr != NULL)
2101     {
2102       if (StringCmp(idListPtr->id,testIdStr) == 0)
2103         return TRUE;
2104       idListPtr = idListPtr->next;
2105     }
2106   
2107   /* If we made it to here, then */
2108   /* the ID wasn't found.        */
2109 
2110   return FALSE;
2111 }
2112 
2113 /*=========================================================================*/
2114 /*                                                                         */
2115 /* s_CheckContext ()                                                       */
2116 /*                                                                         */
2117 /*=========================================================================*/
2118 
2119 static Boolean s_CheckContext (ValNodePtr       rowList,
2120                                AlignFileDataPtr fileInfoPtr)
2121 {
2122   ValNodePtr       currentRow;
2123   ValNodePtr       lastRow;
2124   CharPtr          idStr;
2125   SeqLineInfoPtr   seqLinePtr;
2126   SeqLineInfoPtr   reEvalSeqPtr;
2127   SeqLineInfoPtr   prevSeqLinePtr = NULL;
2128   DefLineInfoPtr   defLinePtr;
2129   OtherLineInfoPtr otherLinePtr;
2130   Int2             patternSeqType;
2131   ErrInfoPtr       errPtr;
2132   Boolean          changesMade;
2133   Int4             currLen;
2134   Int4             prevLen;
2135 
2136   do  /* Until no changes are made */
2137     {
2138       currentRow     = rowList;
2139       lastRow        = NULL;
2140       patternSeqType = ALI_AMBIGUOUS;
2141 
2142       changesMade = FALSE;
2143       while (currentRow != NULL)
2144         {
2145           if (currentRow->choice == ALI_SEQLINE)
2146             {
2147               seqLinePtr = (SeqLineInfoPtr) currentRow->data.ptrvalue;
2148               
2149               if ((seqLinePtr->maybe == FALSE) ||
2150                   (seqLinePtr->maybe == TRUE) &&
2151                   (s_configInfo.useMaybes == TRUE))
2152                 {
2153                   /* If there is an ID, make sure that it  */
2154                   /* immediately precedes a sequence line. */
2155                   
2156                   if (seqLinePtr->id != NULL)
2157                     {
2158                       if ((seqLinePtr->sequence == NULL) &&
2159                           (s_GetRowSeqString(currentRow->next) == NULL))
2160                         {
2161                           Ali_ChangeRowToOther (currentRow);
2162                           changesMade = TRUE;
2163                           continue;
2164                         }
2165                     }
2166 
2167                   /* Check for an ID that was accidentally lumped */
2168                   /* in with a sequence due to being composed     */
2169                   /* entirely of sequence characters.             */
2170 
2171                   if (prevSeqLinePtr != NULL)
2172                     {
2173                       currLen = StringLen (seqLinePtr->sequence);
2174                       prevLen = StringLen (prevSeqLinePtr->sequence);
2175 
2176                       if ((currLen > prevLen) && 
2177                           (seqLinePtr->id == NULL) &&
2178                           (prevSeqLinePtr->id != NULL))
2179                         {
2180                           reEvalSeqPtr = SeqLineReEval (seqLinePtr);
2181 
2182                           if (reEvalSeqPtr != NULL)
2183                             {
2184                               currLen = StringLen (reEvalSeqPtr->sequence);
2185                           
2186                               /* If the new seqline fits better, use it */
2187                           
2188                               if (currLen == prevLen)
2189                                 {
2190                                   MemFree(seqLinePtr->sequence);
2191                                   MemFree(seqLinePtr->id);
2192                                   MemFree(seqLinePtr);
2193                                   currentRow->data.ptrvalue = reEvalSeqPtr;
2194                                   continue;
2195                                 }
2196                               else
2197                                 {
2198                                   MemFree(reEvalSeqPtr->sequence);
2199                                   MemFree(reEvalSeqPtr->id);
2200                                   MemFree(reEvalSeqPtr);
2201                                 }
2202                             }
2203                         }
2204                     }
2205                   
2206                   /* If there's an established pattern of sequence */
2207                   /* type, then match the current line against it. */
2208                   /* Otherwise, set the pattern.                   */
2209                   
2210                   if (seqLinePtr->type != ALI_AMBIGUOUS)
2211                     {
2212                       if (patternSeqType != ALI_AMBIGUOUS)
2213                         {
2214                           if (patternSeqType != seqLinePtr->type)
2215                             {
2216                               reEvalSeqPtr = SeqLineReEval (seqLinePtr);
2217                               if ((reEvalSeqPtr == NULL) ||
2218                                   ((reEvalSeqPtr != NULL) &&
2219                                    (patternSeqType != reEvalSeqPtr->type)))
2220                                 {
2221                                   if (reEvalSeqPtr != NULL)
2222                                     {
2223                                       MemFree(reEvalSeqPtr->sequence);
2224                                       MemFree(reEvalSeqPtr->id);
2225                                       MemFree(reEvalSeqPtr);
2226                                     }
2227                                   Ali_ChangeRowToOther (currentRow);
2228                                   changesMade = TRUE;
2229                                   continue;
2230                                 }
2231                               else
2232                                 {
2233                                   MemFree(seqLinePtr->sequence);
2234                                   MemFree(seqLinePtr->id);
2235                                   MemFree(seqLinePtr);
2236                                   currentRow->data.ptrvalue = reEvalSeqPtr;
2237                                   continue;
2238                                 }
2239                             }
2240                         }
2241                       else 
2242                         patternSeqType = seqLinePtr->type;
2243                     }
2244 
2245                   /* */
2246                   
2247                   prevSeqLinePtr = seqLinePtr;
2248 
2249                 }
2250               lastRow = currentRow;
2251             }
2252           
2253           else if (currentRow->choice == ALI_DEFLINE)
2254             {
2255               /* If there is an ID, make sure that it  */
2256               /* immediately precedes a sequence line. */
2257               
2258               defLinePtr = (DefLineInfoPtr) currentRow->data.ptrvalue;
2259               if ((defLinePtr->id != NULL) &&
2260                   (s_IsExistingId(fileInfoPtr, defLinePtr->id) == FALSE) &&
2261                   (s_GetRowSeqString(currentRow->next) == NULL))
2262                 {
2263                   Ali_ChangeRowToOther (currentRow);
2264                   changesMade = TRUE;
2265                   continue;
2266                 }
2267               lastRow = currentRow;
2268             }
2269           
2270           else if (currentRow->choice == ALI_OTHERLINE)
2271             {
2272               /* If there is an ID, make sure that it  */
2273               /* immediately precedes a sequence line. */
2274               
2275               otherLinePtr = (OtherLineInfoPtr) currentRow->data.ptrvalue;
2276               if (otherLinePtr->id != NULL)
2277                 {
2278                   if (s_GetRowSeqString(currentRow->next) == NULL)
2279                     {
2280                       otherLinePtr->other = otherLinePtr->id;
2281                       otherLinePtr->id    = NULL;
2282                       changesMade = TRUE;
2283                     }
2284                 }
2285               lastRow = currentRow;
2286             }
2287           
2288           currentRow = currentRow->next;
2289         }
2290     } while (changesMade == TRUE);
2291 
2292   /* Check for a dangling ID */
2293 
2294   if ((lastRow != NULL) &&
2295       ((idStr = s_GetRowIdString (lastRow)) != NULL) &&
2296       (s_GetRowSeqString (lastRow) == NULL))
2297     {
2298       errPtr = Ali_AddError (&(fileInfoPtr->errors),
2299                              ERR_ID_WITHOUT_SEQ,
2300                              idStr);
2301       return FALSE;
2302     }
2303 
2304   /* Return successfully */
2305 
2306   return TRUE;
2307 }
2308 
2309 /*=========================================================================*/
2310 /*                                                                         */
2311 /* s_SortErrors () -- Sort errors by level, so that the most severe appear */
2312 /*                    first.                                               */
2313 /*                                                                         */
2314 /*   NOTE : Does a lame bubblesort, which nevertheless should be fast      */
2315 /*          enough for the relatively small linked lists we're dealing     */
2316 /*          with here.                                                     */
2317 /*                                                                         */
2318 /*=========================================================================*/
2319 
2320 static void s_SortErrors (AlignFileDataPtr fileInfoPtr)
2321 {
2322   Boolean  swapMade  = TRUE;
2323   ErrInfoPtr prevPtr = NULL;
2324   ErrInfoPtr nextPtr = NULL;
2325   ErrInfoPtr errPtr  = NULL;
2326 
2327   while (swapMade == TRUE)
2328     {
2329       swapMade = FALSE;
2330       errPtr = fileInfoPtr->errors;
2331       while (errPtr->next != NULL)
2332         {
2333           nextPtr = errPtr->next;
2334           if (errPtr->level > nextPtr->level)
2335             {
2336               swapMade = TRUE;
2337 
2338               /* Remove the error from the list */
2339 
2340               if (errPtr == fileInfoPtr->errors)
2341                 fileInfoPtr->errors = nextPtr;
2342               else
2343                 prevPtr->next = nextPtr;
2344 
2345               /* Then re-insert it after the following error */
2346 
2347               errPtr->next = nextPtr->next;
2348               nextPtr->next = errPtr;
2349 
2350               /* The old next error is now the previous error */
2351 
2352               prevPtr = nextPtr;
2353             }
2354           else
2355             {
2356               prevPtr = errPtr;
2357               errPtr = errPtr->next;
2358             }
2359         }
2360     }
2361 
2362   return;
2363 }
2364 
2365 /*=========================================================================*/
2366 /*                                                                         */
2367 /* s_ReplaceUWithT () -- Replace all the Us in a nucleotide sequence with  */
2368 /*                       Ns.                                               */
2369 /*                                                                         */
2370 /*=========================================================================*/
2371 
2372 static void s_ReplaceUWithT (AlignFileDataPtr fileInfoPtr)
2373 {
2374   IdInfoPtr  seqPtr = NULL;
2375   SeqPartPtr seqPart = NULL;
2376   CharPtr    seqString;
2377   Int4       i;
2378 
2379   seqPtr = fileInfoPtr->sequences;
2380   while (seqPtr != NULL)
2381     {
2382       seqPart = seqPtr->sequence;
2383       while (seqPart != NULL)
2384         {
2385           seqString = seqPart->sequence;
2386           for (i = 0; seqString[i] != '\0'; i++)
2387             if (seqString[i] == 'U')
2388               seqString[i] = 'T';
2389             else if (seqString[i] == 'u')
2390               seqString[i] = 't';
2391           seqPart = seqPart->next;
2392         }
2393       seqPtr = seqPtr->next;
2394     }
2395 
2396 }
2397 
2398 /*=========================================================================*/
2399 /*                                                                         */
2400 /* s_AnalyzeErrors () -- Look for patterns in the errors that can be used  */
2401 /*                       to create more general, higher-level errors       */
2402 /*                       instead.                                          */
2403 /*                                                                         */
2404 /*=========================================================================*/
2405 
2406 static void s_AnalyzeErrors (AlignFileDataPtr fileInfoPtr)
2407 {
2408   Int4       seqCount = 0;
2409   Int4       defCount = 0;
2410   Int4       errCount = 0;
2411   IdInfoPtr  seqPtr = NULL;
2412   ErrInfoPtr errPtr = NULL;
2413   ErrInfoPtr prevErrPtr = NULL;
2414   ErrInfoPtr nextErrPtr = NULL;
2415 
2416   if (fileInfoPtr->errors == NULL)
2417     return;
2418 
2419   /* Get counts of sequences and deflines */
2420 
2421   seqPtr = fileInfoPtr->sequences;
2422   while (seqPtr != NULL)
2423     {
2424       seqCount++;
2425       if (seqPtr->defline != NULL)
2426         defCount++;
2427       seqPtr = seqPtr->next;
2428     }
2429 
2430   /* Check for "missing bracket" defline errors */
2431 
2432   errPtr = fileInfoPtr->errors;
2433   while (errPtr != NULL)
2434     {
2435       if (errPtr->errNum == ERR_DEFLINE_NODEFS)
2436           errCount++;
2437       errPtr = errPtr->next;
2438     }
2439 
2440   /* If ALL deflines have missing bracket errors */
2441   /* then replace the msgs with one global msg   */
2442 
2443   if (errCount == seqCount)
2444     {
2445       if (s_configInfo.errExpandLevel != ALI_ERRMSG_EXPAND_ALL)
2446         {
2447           errPtr = fileInfoPtr->errors;
2448           while (errPtr != NULL)
2449             {
2450               nextErrPtr = errPtr->next;
2451               if (errPtr->errNum == ERR_DEFLINE_NODEFS)
2452                 {
2453                   if (errPtr == fileInfoPtr->errors)
2454                     {
2455                       fileInfoPtr->errors = fileInfoPtr->errors->next;
2456                       s_FreeErrorNode (errPtr);
2457                       errPtr = NULL;
2458                     }
2459                   else
2460                     {
2461                       prevErrPtr->next = nextErrPtr;
2462                       s_FreeErrorNode (errPtr);
2463                       errPtr = NULL;
2464                     }
2465                 }
2466               else
2467                 prevErrPtr = errPtr;
2468               errPtr = nextErrPtr;
2469             }
2470         }
2471       Ali_AddError (&(fileInfoPtr->errors), ERR_GLOBAL_DEFLINE_NODEFS,
2472                     seqCount);
2473     }
2474 
2475   /* If SOME deflines have missing bracket errors */
2476   /* then replace the msgs with one global msg    */
2477 
2478   else if (errCount > 1)
2479     {
2480       if (s_configInfo.errExpandLevel == ALI_ERRMSG_EXPAND_NONE)
2481         {
2482           errPtr = fileInfoPtr->errors;
2483           while (errPtr != NULL)
2484             {
2485               nextErrPtr = errPtr->next;
2486               if (errPtr->errNum == ERR_DEFLINE_NODEFS)
2487                 {
2488                   if (errPtr == fileInfoPtr->errors)
2489                     {
2490                       fileInfoPtr->errors = fileInfoPtr->errors->next;
2491                       s_FreeErrorNode (errPtr);
2492                       errPtr = NULL;
2493                     }
2494                   else
2495                     {
2496                       prevErrPtr->next = nextErrPtr;
2497                       s_FreeErrorNode (errPtr);
2498                       errPtr = NULL;
2499                     }
2500                 }
2501               else
2502                 prevErrPtr = errPtr;
2503               errPtr = nextErrPtr;
2504             }
2505         }
2506       Ali_AddError (&(fileInfoPtr->errors), ERR_MULTI_DEFLINE_NODEFS,
2507                     errCount);
2508     }
2509 
2510   /* Finally, sort the errors by type */
2511 
2512   s_SortErrors (fileInfoPtr);
2513 
2514   return;
2515 
2516 }
2517 
2518 /*=========================================================================*/
2519 /*                                                                         */
2520 /* Ali_Read ()                                                             */
2521 /*                                                                         */
2522 /*=========================================================================*/
2523 
2524 AlignFileDataPtr Ali_Read (FILE PNTR alignFilePtr)
2525 {
2526   ValNodePtr        rowList = NULL;
2527   AlignFileDataPtr  fileInfoPtr;
2528   ErrInfoPtr        errorList = NULL;
2529 
2530   /* Check parameters */
2531 
2532   if (alignFilePtr == NULL)
2533     return FALSE;
2534 
2535   /* Initialize */
2536 
2537   fileInfoPtr = (AlignFileDataPtr) MemNew (sizeof(AlignFileData));
2538   fileInfoPtr->sequences = NULL;
2539   fileInfoPtr->maybes    = NULL;
2540   fileInfoPtr->errors    = NULL;
2541   fileInfoPtr->info = (ParsedInfoPtr) MemNew (sizeof (ParsedInfo));
2542   if (fileInfoPtr->info == NULL)
2543     {
2544       Ali_AddError (&(fileInfoPtr->errors), ERR_OUT_OF_MEMORY);
2545       Ali_Free (fileInfoPtr);
2546       return NULL;
2547     }
2548   fileInfoPtr->info->missingChar   = NULL;
2549   fileInfoPtr->info->gapChar       = NULL;
2550   fileInfoPtr->info->unalignedChar = NULL;
2551   
2552   if (s_configurationSet == FALSE)
2553     Ali_SetConfig (NULL, ALI_SET_DEFAULTS);
2554 
2555   /* Read in and parse each row */
2556 
2557   rowList = Ali_ReadLines (alignFilePtr, &errorList, &s_configInfo, fileInfoPtr);
2558   fileInfoPtr->errors = errorList;
2559 
2560   if (rowList == NULL)
2561     return fileInfoPtr;
2562 
2563   /* Make first pass to adjust the rows based on context */
2564 
2565   if (s_CheckContext(rowList, fileInfoPtr) != TRUE)
2566     return fileInfoPtr;
2567 
2568   s_DisplayRowList (rowList, s_configInfo.debugLevel);
2569 
2570   /* Analyze the IDs and sequences for consistancy */
2571 
2572   s_AnalyzeContents (rowList, fileInfoPtr);
2573 
2574   if (fileInfoPtr->sequences == NULL)
2575     Ali_AddError (&(fileInfoPtr->errors), ERR_NO_SEQUENCES_FOUND);
2576 
2577   /* Analyze the errors to see if they can be */
2578   /* combined into more general global errors */
2579 
2580   if (fileInfoPtr->errors != NULL)
2581     s_AnalyzeErrors (fileInfoPtr);
2582 
2583   /* Return the missing, gap, and unaligned chars used */
2584 
2585   fileInfoPtr->info->missingChar = (CharPtr) MemNew(16);
2586   StringCpy (fileInfoPtr->info->missingChar, s_configInfo.missingChar);
2587 
2588   fileInfoPtr->info->gapChar = (CharPtr) MemNew(16);
2589   StringCpy (fileInfoPtr->info->gapChar, s_configInfo.gapChar);
2590 
2591   fileInfoPtr->info->unalignedChar = (CharPtr) MemNew(16);
2592   StringCpy (fileInfoPtr->info->unalignedChar, s_configInfo.unalignedChar);
2593 
2594   /* If these are nucleotide sequences, then */
2595   /* replace all 'U's with 'T's.             */
2596 
2597   s_ReplaceUWithT (fileInfoPtr);
2598 
2599   /* Clean up and return successfully */
2600   
2601   s_FreeRowList_Safe (rowList);
2602   return fileInfoPtr;
2603 }
2604 
2605 /***************************************************************************
2606 *
2607 *  section to convert AlignFileDataPtr content into seqalign/seqentry
2608 *  structures
2609 *
2610 ***************************************************************************/
2611 typedef struct tinyinfo {
2612    Int4  n;
2613    struct tinyinfo PNTR next;
2614 } ALI_TinyInfo, PNTR ALI_TinyInfoPtr;
2615 
2616 
2617 static Boolean is_gap_char(Char c, CharPtr gapChar)
2618 {
2619    if (StrChr(gapChar, c) != NULL)
2620       return TRUE;
2621    return FALSE;
2622 }
2623 
2624 static int LIBCALLBACK ALI_SortTips(VoidPtr ptr1, VoidPtr ptr2)
2625 {
2626    ALI_TinyInfoPtr  tip1;
2627    ALI_TinyInfoPtr  tip2;
2628 
2629    tip1 = *((ALI_TinyInfoPtr PNTR)ptr1);
2630    tip2 = *((ALI_TinyInfoPtr PNTR)ptr2);
2631    if (tip1->n > tip2->n)
2632       return 1;
2633    if (tip1->n < tip2->n)
2634       return -1;
2635    return 0;
2636 }
2637 
2638 static Boolean is_valid_seq(Char c, CharPtr missingChar, CharPtr gapChar)
2639 {
2640   if (StrChr("\0", c))
2641     return FALSE;
2642   if (StrChr(missingChar, c) != NULL)
2643     return TRUE;
2644   if (StrChr(gapChar, c) != NULL)
2645     return TRUE;
2646   if (IS_ALPHA(c))
2647     return TRUE;
2648   if (c == '-')
2649     return TRUE;
2650   if (c == '?')
2651     return TRUE;
2652   return FALSE;
2653 }
2654 
2655 static Boolean is_missing(Char c, CharPtr missingChar)
2656 {
2657    if (StrChr(missingChar, c) != NULL)
2658       return TRUE;
2659    else
2660       return FALSE;
2661 }
2662 
2663 static SeqAlignPtr ALI_MakeSeqAlign(AlignFileDataPtr afp, CharPtr PNTR PNTR stringsptr, Int4Ptr numseq, CharPtr PNTR PNTR deflineptr)
2664 {
2665    Int4             alnlen;
2666    CharPtr          buf;
2667    CharPtr          c;
2668    Int4             ctr;
2669    Int4             ctr_prev;
2670    CharPtr          PNTR deflines;
2671    DenseSegPtr      dsp;
2672    Int4             i;
2673    IdInfoPtr        id_head;
2674    IdInfoPtr        iip;
2675    Boolean          ingap;
2676    Boolean          isgap;
2677    Int4             j;
2678    Int4             last;
2679    Int4             len;
2680    Int4             maxlen;
2681    Int4             numtips;
2682    SeqAlignPtr      sap;
2683    SeqPartPtr       seq;
2684    SeqIdPtr         sip;
2685    SeqIdPtr         sip_prev;
2686    CharPtr          PNTR strings;
2687    Char             text[100];
2688    ALI_TinyInfoPtr  tip;
2689    ALI_TinyInfoPtr  tip_head;
2690    ALI_TinyInfoPtr  tip_prev;
2691    ALI_TinyInfoPtr  PNTR tiparray;
2692 
2693    if (afp->info == NULL)
2694    {
2695       ErrPostEx(SEV_ERROR, 0, 0, "NULL afp->info -- alignment not read correctly\n");
2696       return NULL;
2697    }
2698    i = 0;
2699    id_head = afp->sequences;
2700    iip = id_head;
2701    while (iip != NULL)
2702    {
2703       i++;
2704       if (iip->id == NULL)
2705       {
2706          sprintf(text, "No id read for sequence %d\n", i);
2707          ErrPostEx(SEV_ERROR, 0, 0, text);
2708          return NULL;
2709       }
2710       iip = iip->next;
2711    }
2712    sap = SeqAlignNew();
2713    sap->type = SAT_PARTIAL;
2714    sap->segtype = SAS_DENSEG;
2715    sap->dim = i;
2716    dsp = DenseSegNew();
2717    dsp->dim = i;
2718    strings = (CharPtr PNTR)MemNew(i*sizeof(CharPtr));
2719    deflines = (CharPtr PNTR)MemNew(i*sizeof(CharPtr));
2720    tip_head = tip_prev = NULL;
2721    iip = id_head;
2722    maxlen = 0;
2723    tip_head = tip_prev = NULL;
2724    numtips = 0;
2725    alnlen = 0;
2726    i = 1;
2727    while (iip != NULL)
2728    {
2729       len = 0;
2730       ctr = 0;
2731       seq = iip->sequence;
2732       if (seq == NULL || seq->sequence == NULL)
2733       {
2734          sprintf(text, "Error in reading sequence %d -- no sequence characters read\n", i);
2735          ErrPostEx(SEV_ERROR, 0, 0, text);
2736          return NULL;
2737       }
2738       c = seq->sequence;
2739       if (is_gap_char(*c, afp->info->gapChar))
2740          ingap = TRUE;
2741       else
2742          ingap = FALSE;
2743       while (seq != NULL)
2744       {
2745          c = seq->sequence;
2746          if (c == NULL)
2747          {
2748             sprintf(text, "Error in reading sequence %d -- no sequence characters read\n", i);
2749             ErrPostEx(SEV_ERROR, 0, 0, text);
2750             return NULL;
2751          }
2752          while (is_valid_seq(*c, afp->info->missingChar, afp->info->gapChar))
2753          {
2754             if (is_gap_char(*c, afp->info->gapChar) && !ingap)
2755             {
2756                tip = (ALI_TinyInfoPtr)MemNew(sizeof(ALI_TinyInfo));
2757                tip->n = ctr;
2758                if (tip_head != NULL)
2759                {
2760                   tip_prev->next = tip;
2761                   tip_prev = tip;
2762                } else
2763                   tip_head = tip_prev = tip;
2764                ingap = TRUE;
2765                numtips++;
2766             } else if (!is_gap_char(*c, afp->info->gapChar) && ingap)
2767             {
2768                tip = (ALI_TinyInfoPtr)MemNew(sizeof(ALI_TinyInfo));
2769                tip->n = ctr;
2770                if (tip_head != NULL)
2771                {
2772                   tip_prev->next = tip;
2773                   tip_prev = tip;
2774                } else
2775                   tip_head = tip_prev = tip;
2776                ingap = FALSE;
2777                numtips++;
2778             }
2779             if (!is_gap_char(*c, afp->info->gapChar))
2780                len++;
2781             ctr++;
2782             c++;
2783          }
2784          seq = seq->next;
2785       }
2786       if (ctr > alnlen)
2787          alnlen = ctr;
2788       if (len > maxlen)
2789          maxlen = len;
2790       iip = iip->next;
2791       i++;
2792    }
2793    if (tip_head == NULL) /* this is a gapless alignment */
2794    {
2795       dsp->numseg = 1;
2796       dsp->starts = (Int4Ptr)MemNew((dsp->dim)*sizeof(Int4));
2797       dsp->lens = (Int4Ptr)MemNew(sizeof(Int4));
2798       dsp->strands = (Uint1Ptr)MemNew((dsp->dim)*sizeof(Uint1));
2799       for (i=0; i<dsp->dim; i++)
2800       {
2801          dsp->strands[i] = Seq_strand_plus;
2802       }
2803       dsp->lens[0] = id_head->length;
2804       /* all the starts are 0 anyway, just leave them and get the ids & seqs */
2805       iip = id_head;
2806       sip_prev = NULL;
2807       buf = (CharPtr)MemNew((maxlen+1)*sizeof(Char));
2808       i = 0;
2809       while (iip != NULL)
2810       {
2811          sip = MakeSeqID(iip->id);
2812          deflines[i] = StringSave(iip->defline);
2813          if (sip_prev != NULL)
2814          {
2815             sip_prev->next = sip;
2816             sip_prev = sip;
2817          } else
2818             dsp->ids = sip_prev = sip;
2819          seq = iip->sequence;
2820          for (ctr = 0; ctr<(maxlen+1); ctr++)
2821          {
2822             buf[ctr] = '\0';
2823          }
2824          ctr = 0;
2825          while (seq != NULL)
2826          {
2827             c = seq->sequence;
2828             while (is_valid_seq(*c, afp->info->missingChar, afp->info->gapChar))
2829             {
2830                if (is_missing(*c, afp->info->missingChar))
2831                   buf[ctr] = 'N';
2832                else
2833                   buf[ctr] = *c;
2834                ctr++;
2835                c++;
2836             }
2837             seq = seq->next;
2838          }
2839          strings[i] = StringSave(buf);
2840          iip = iip->next;
2841          i++;
2842       }
2843       sap->segs = (Pointer)dsp;
2844       MemFree(buf);
2845       *numseq = dsp->dim;
2846       *stringsptr = strings;
2847       *deflineptr = deflines;
2848       return sap;
2849    }
2850    /* now all the segment boundaries have been collected, so sort them */
2851    tiparray = (ALI_TinyInfoPtr PNTR)MemNew(numtips*sizeof(ALI_TinyInfoPtr));
2852    i = 0;
2853    tip = tip_head;
2854    while (tip != NULL)
2855    {
2856       tiparray[i] = tip;
2857       i++;
2858       tip = tip->next;
2859    }
2860    HeapSort(tiparray, numtips, sizeof(ALI_TinyInfoPtr), ALI_SortTips);
2861    dsp->numseg = 2; /* one for the first, one for the last */
2862    for (i=1; i<numtips; i++)
2863    {
2864       if (tiparray[i]->n != tiparray[i-1]->n)
2865          dsp->numseg++;
2866    }
2867    dsp->starts = (Int4Ptr)MemNew((dsp->dim)*(dsp->numseg)*sizeof(Int4));
2868    dsp->lens = (Int4Ptr)MemNew((dsp->numseg)*sizeof(Int4));
2869    last = 0;
2870    j=0;
2871    dsp->lens[0] = tiparray[0]->n;
2872    last = tiparray[0]->n;
2873    j++;
2874    for (i=1; i<numtips; i++)
2875    {
2876       if (tiparray[i]->n != tiparray[i-1]->n)
2877       {
2878          dsp->lens[j] = tiparray[i]->n-last;
2879          last = tiparray[i]->n;
2880          j++;
2881       }
2882    }
2883    dsp->lens[j] = alnlen - last;
2884    dsp->strands = (Uint1Ptr)MemNew((dsp->dim)*(dsp->numseg)*sizeof(Uint1));
2885 /* do we have any strand info to the contrary? */
2886    for (i=0; i<(dsp->dim)*(dsp->numseg); i++)
2887    {
2888       dsp->strands[i] = Seq_strand_plus;
2889    }
2890    iip = id_head;
2891    i = 0;
2892    buf = (CharPtr)MemNew((maxlen+1)*sizeof(Char));
2893    sip_prev = NULL;
2894    while (iip != NULL)
2895    {
2896       j = 0;
2897       for (ctr = 0; ctr<(maxlen+1); ctr++)
2898       {
2899          buf[ctr] = '\0';
2900       }
2901       sip = MakeSeqID(iip->id);
2902       SeqIdSetFree(sip->next);
2903       sip->next = NULL;
2904       deflines[i] = StringSave(iip->defline);
2905       if (sip_prev != NULL)
2906       {
2907          sip_prev->next = sip;
2908          sip_prev = sip;
2909       } else
2910          dsp->ids = sip_prev = sip;
2911       ctr = 0;
2912       ctr_prev = 0;
2913       len = 0;
2914       seq = iip->sequence;
2915       while (seq != NULL)
2916       {
2917          c = seq->sequence;
2918          while (is_valid_seq(*c, afp->info->missingChar, afp->info->gapChar))
2919          {
2920             isgap = is_gap_char(*c, afp->info->gapChar);
2921             if (!isgap)
2922             {
2923                if (is_missing(*c, afp->info->missingChar))
2924                   buf[ctr] = 'N';
2925                else
2926                   buf[ctr] = *c;
2927                ctr++;
2928             }
2929             len++;
2930             if (len == dsp->lens[j])
2931             {
2932                if (isgap)
2933                   dsp->starts[dsp->dim*j+i] = -1;
2934                else
2935                {
2936                   dsp->starts[dsp->dim*j+i] = ctr_prev;
2937                   ctr_prev = ctr;
2938                }
2939                j++;
2940                len = 0;
2941             }
2942             if (*(c+1) == '\0' && seq->next == NULL && j < dsp->numseg)
2943             {
2944                if (isgap)
2945                   dsp->starts[dsp->dim*j+i] = -1;
2946                else
2947                   dsp->starts[dsp->dim*j+i] = ctr_prev;
2948             }
2949             c++;
2950          }
2951          seq = seq->next;
2952       }
2953       strings[i] = StringSave(buf);
2954       iip = iip->next;
2955       i++;
2956    }
2957    sap->segs = (Pointer)dsp;
2958    MemFree(buf);
2959    for (i=0; i<numtips; i++)
2960    {
2961       MemFree(tiparray[i]);
2962    }
2963    MemFree(tiparray);
2964    *numseq = dsp->dim;
2965    *stringsptr = strings;
2966    *deflineptr = deflines;
2967    return sap;
2968 }
2969 
2970 static SeqEntryPtr ALI_make_seqentry_for_seqentry (SeqEntryPtr sep)
2971 {
2972    BioseqPtr     bsp;
2973    BioseqSetPtr  bssp;
2974    SeqEntryPtr   sep_new;
2975    SeqEntryPtr   sep_tmp;
2976 
2977    if (IS_Bioseq(sep) || IS_Bioseq_set(sep))
2978    {
2979       if (sep->next)
2980       {
2981          bssp = BioseqSetNew ();
2982          bssp->_class = 14;
2983          bssp->seq_set = sep;
2984          sep_new = SeqEntryNew ();
2985          sep_new->choice = 2;
2986          sep_new->data.ptrvalue = bssp;
2987          SeqMgrLinkSeqEntry (sep_new, 0, NULL);
2988          sep_tmp = bssp->seq_set;
2989          while (sep_tmp != NULL)
2990          {
2991             if (IS_Bioseq(sep_tmp))
2992             {
2993                bsp = (BioseqPtr)sep_tmp->data.ptrvalue;
2994                ObjMgrConnect (OBJ_BIOSEQ, (Pointer) bsp, OBJ_BIOSEQSET, (Pointer) bssp);
2995             }
2996             sep_tmp = sep_tmp->next;
2997          }
2998       } else
2999          return sep;
3000    }
3001    return sep_new;
3002 }
3003 
3004 static Uint1 ALI_GuessMoltype(CharPtr string)
3005 {
3006    CharPtr  c;
3007 
3008    c = string;
3009    while (*c != '\0')
3010    {
3011        if (StringChr("EFIJLOPQUXZefijlopquxz", *c) != NULL) /* protein */
3012          return Seq_mol_aa;
3013       c++;
3014    }
3015    return Seq_mol_na;
3016 }
3017 static Int4 SPI_MapRowCoords(SeqAlignPtr sap, Int4 from, Int4 to, Int4 row, Uint1 direction)
3018 {
3019    Int4  pos;
3020 
3021    if (direction == 1)
3022    {
3023       pos = AlnMgrMapRowCoords(sap, from, row, NULL);
3024       from++;
3025       while (pos < 0 && from <= to)
3026       {
3027          pos = AlnMgrMapRowCoords(sap, from, row, NULL);
3028          from++;
3029       }
3030    } else
3031    {
3032       pos = AlnMgrMapRowCoords(sap, to, row, NULL);
3033       to--;
3034       while (pos < 0 && to >= from)
3035       {
3036          pos = AlnMgrMapRowCoords(sap, to, row, NULL);
3037          to--;
3038       }
3039    }
3040    if (pos < 0)
3041       return -1;
3042    return pos;
3043 }
3044 
3045 static CharPtr SPI_WriteAlnLine(Int4 row, Int4 from, Int4 to, SeqAlignPtr sap)
3046 {
3047    AlnMsgPtr   amp;
3048    BioseqPtr   bsp;
3049    Uint1       buf[65+2];
3050    Int4        ctr;
3051    Int4        i;
3052    Boolean     more;
3053    Int4        n;
3054    SeqIdPtr    sip;
3055    SeqPortPtr  spp;
3056    CharPtr     string;
3057 
3058    n = AlnMgrGetNumRows(sap);
3059    if (row > n || row < 1)
3060       return NULL;
3061    string = (CharPtr)MemNew((65+2)*sizeof(Char));
3062    for (n=0; n<(65+2); n++)
3063    {
3064       string[n] = '\0';
3065    }
3066    sip = AlnMgrGetNthSeqIdPtr(sap, row);
3067    bsp = BioseqLockById(sip);
3068    amp = AlnMsgNew();
3069    amp->row_num = row;
3070    amp->from_m = from;
3071    amp->to_m = to;
3072    if (amp->to_m < 0)
3073       amp->to_m = -1;
3074    n = 0;
3075    while ((more = AlnMgrGetNextAlnBit(sap, amp)) == TRUE)
3076    {
3077       if (amp->to_b - amp->from_b > amp->to_m - amp->from_m) /* kludge */
3078       {
3079          if (amp->strand == Seq_strand_minus)
3080             amp->from_b = amp->to_b - (amp->to_m - amp->from_m);
3081          else
3082             amp->to_b = amp->from_b + (amp->to_m - amp->from_m);
3083       }
3084       if (amp->gap == 0)
3085       {
3086          spp = SeqPortNew(bsp, amp->from_b, amp->to_b, amp->strand, Seq_code_iupacna);
3087          ctr = SeqPortRead(spp, buf, (amp->to_b - amp->from_b + 1));
3088          SeqPortFree(spp);
3089          for (i=n; i<n+ctr; i++)
3090          {
3091             string[i] = buf[i-n];
3092          }
3093          n += ctr;
3094       } else
3095       {
3096          for (i=n; i<(n+amp->to_b-amp->from_b+1); i++)
3097          {
3098             string[i] = '-';
3099          }
3100          n += amp->to_b-amp->from_b+1;
3101       }
3102    }
3103    AlnMsgFree(amp);
3104    SeqIdFree(sip);
3105    return string;
3106 }
3107 static Int4 spi_get_num_places(Int4 num)
3108 {
3109    FloatHi  f;
3110    Int4     i;
3111    Int4     x;
3112 
3113    x = 10;
3114    for (i=1; i<21; i++)
3115    {
3116       f = (FloatHi)num/(FloatHi)x;
3117       if (f < 1)
3118       {
3119          if (num < 0)
3120             return (i+1);
3121          else
3122             return i;
3123       }
3124       x = x*10;
3125    }
3126    if (num < 0)
3127       i++;
3128    return i;
3129 }
3130 static void PrintOutMultAlign(SeqAlignPtr sap)
3131 {
3132    Int4     c;
3133    Int4Ptr  coord;
3134    Int4     ctr;
3135    Int4     d;
3136    Int4     j;
3137    Int4     len;
3138    Int4     n;
3139    Int4     spacer;
3140    CharPtr  PNTR stringptr;
3141 
3142    spacer = 12;
3143    AlnMgrIndexSingleChildSeqAlign(sap);
3144    n = AlnMgrGetNumRows(sap);
3145    stringptr = (CharPtr PNTR)MemNew(n*sizeof(CharPtr));
3146    coord = (Int4Ptr)MemNew(n*sizeof(Int4));
3147    len = AlnMgrGetAlnLength(sap, FALSE);
3148    for (c=0; c<len; c+=65-10)
3149    {
3150       for (j=0; j<n; j++)
3151       {
3152          stringptr[j] = SPI_WriteAlnLine(j+1, c, MIN(c+65-10-1, len-1), sap);
3153          coord[j] = SPI_MapRowCoords(sap, c, MIN(c+65-10-1, len-1), j+1, 1);
3154          if (coord[j] >= 0)
3155             coord[j]++;
3156       }
3157       for (j=0; j<n; j++)
3158       {
3159          printf("%d", coord[j]);
3160          d = spi_get_num_places(coord[j]);
3161          for (d; d<spacer; d++)
3162          {
3163             printf(" ");
3164          }
3165          if (j == 0)
3166             printf("%s", stringptr[j]);
3167          else
3168          {
3169             for (ctr=0; ctr<MIN(65-10, len-c); ctr++)
3170             {
3171                if (stringptr[j][ctr] == stringptr[0][ctr])
3172                   printf(".");
3173                else
3174                   printf("%c", stringptr[j][ctr]);
3175             }
3176          }
3177          printf("\n");
3178          MemFree(stringptr[j]);
3179       }
3180       if (c+65-10 < len)
3181          printf("\n");
3182    }
3183    fflush(stdout);
3184 }
3185 
3186 static void PrintOutSegs(SeqAlignPtr sap)
3187 {
3188    DenseSegPtr  dsp;
3189    Int4         i;
3190    Int4         j;
3191 
3192    dsp = (DenseSegPtr)(sap->segs);
3193    printf("nums:\t");
3194    for (i=0; i<dsp->numseg; i++)
3195    {
3196       printf("%d\t", i+1);
3197    }
3198    printf("\n");
3199    printf("lens:\t");
3200    for (i=0; i<dsp->numseg; i++)
3201    {
3202       printf("%d\t", dsp->lens[i]);
3203    }
3204    printf("\n");
3205    for (i=0; i<dsp->dim; i++)
3206    {
3207       printf("row %d\t", i+1);
3208       for (j=0; j<dsp->numseg; j++)
3209       {
3210          printf("%d\t", dsp->starts[(dsp->dim)*j+i]);
3211       }
3212       printf("\n");
3213    }
3214    fflush(stdout);
3215 }
3216 
3217 NLM_EXTERN SeqEntryPtr ALI_ConvertToNCBIData(AlignFileDataPtr afp)
3218 {
3219    BioseqPtr    bsp;
3220    CharPtr      PNTR deflines;
3221    Int4         i;
3222    Int4         len;
3223    Uint1        moltype;
3224    Int4         numseq;
3225    SeqAnnotPtr  sanp;
3226    SeqAlignPtr  sap;
3227    SeqDescrPtr  sdp;
3228    SeqEntryPtr  sep;
3229    SeqEntryPtr  sep_head;
3230    SeqEntryPtr  sep_prev;
3231    SeqIdPtr     sip;
3232    CharPtr      str;
3233    CharPtr      PNTR strings;
3234 
3235    if (afp == NULL || afp->sequences == NULL)
3236    {
3237       ErrPostEx(SEV_ERROR, 0, 0, "NULL Data Passed to ConvertToNCBIData");
3238       return NULL;
3239    }
3240    sap = ALI_MakeSeqAlign(afp, &strings, &numseq, &deflines);
3241    if (sap == NULL)
3242    {
3243       ErrPostEx(SEV_ERROR, 0, 0, "Unable to create seqentry\n");
3244       return NULL;
3245    }
3246    sanp = SeqAnnotNew();
3247    sanp->type = 2;
3248    sanp->data = (Pointer)sap;
3249    moltype = ALI_GuessMoltype(strings[0]);
3250    sip = ((DenseSegPtr)(sap->segs))->ids;
3251    sep_head = sep_prev = NULL;
3252    for (i=0; i<numseq; i++)
3253    {
3254       len = StringLen(strings[i]);
3255       sep = StringToSeqEntry (strings[i], sip, len, moltype);
3256       if (sep != NULL) {
3257         bsp = (BioseqPtr)(sep->data.ptrvalue);
3258         if (! StringHasNoText (deflines[i])) {
3259           str = deflines[i];
3260           sdp = SeqDescrAddPointer(&(bsp->descr), Seq_descr_title, str);
3261         }
3262         if (sep != NULL)
3263           {
3264             if (sep_head != NULL)
3265               {
3266                 sep_prev->next = sep;
3267                 sep_prev = sep;
3268               } else
3269                 sep_head = sep_prev = sep;
3270           }
3271         sip = sip->next;
3272         MemFree(strings[i]);
3273       }
3274    }
3275    sep_head = ALI_make_seqentry_for_seqentry (sep_head);
3276    SeqAlignAddInSeqEntry (sep_head, sanp);
3277    MemFree(strings);
3278    MemFree(deflines);
3279    return sep_head;
3280 }
3281 

source navigation ]   [ diff markup ]   [ identifier search ]   [ freetext search ]   [ file search ]  

This page was automatically generated by the LXR engine.
Visit the LXR main site for more information.