NCBI Home IEB Home C Toolkit docs C++ Toolkit source browser C Toolkit source browser (2) |
NCBI C Toolkit Cross ReferenceC/api/aliparse.c |
source navigation diff markup identifier search freetext search file search |
1 /*=========================================================================*/ 2 /* */ 3 /* aliparse.c */ 4 /* */ 5 /*=========================================================================*/ 6 7 #include <stdarg.h> 8 9 #include <aliparse.h> 10 #include <aliread.h> 11 12 /* Defined constants */ 13 14 #define ALI_USE_MAYBES FALSE /* Default values for */ 15 #define ALI_READ_BUFFSIZE 80 /* configuration options */ 16 #define ALI_GAP_CHAR '-' /* | */ 17 #define ALI_MISSING_CHAR '?' /* | */ 18 #define ALI_CORRUPT_SEQ_THRESHOLD 95 /* | */ 19 #define ALI_NUCL_LINE_MAX_THRESHOLD 75 /* | */ 20 #define ALI_NUCL_LINE_MIN_THRESHOLD 25 /* V */ 21 22 /* Data structures */ 23 24 typedef struct 25 { 26 DataInfo foundInfo; 27 IdInfoPtr currentId; 28 IdInfoPtr currentDeflineId; 29 ValNodePtr lastRow; 30 Boolean hasFullLength; 31 Boolean isFirstGroup; 32 Boolean isFirstId; 33 Boolean maybesFound; 34 SeqPartPtr lastSeqPart; 35 Boolean gotAllIds; 36 Int4 idCount; 37 Int4 currentIdCount; 38 } PatternInfo, PNTR PatternInfoPtr; 39 40 /* Filewide static variables */ 41 42 static AliConfigInfo s_configInfo; 43 static Boolean s_configurationSet = FALSE; 44 45 /* Function prototypes */ 46 47 static void s_FreeErrorList (ErrInfoPtr errorList); 48 static void s_FreeSequenceList (SeqPartPtr seqPtr); 49 static void s_FreeIdList (IdInfoPtr idList); 50 static void s_FreeRowList (ValNodePtr rowList); 51 static void s_FreeRowList_Safe (ValNodePtr rowList); 52 static void s_DisplayRowList (ValNodePtr rowList, 53 Int2 mask); 54 static CharPtr s_GetRowIdString (ValNodePtr row); 55 static CharPtr s_GetRowSeqString (ValNodePtr row); 56 static IdInfoPtr s_ProcessMaybes (ValNodePtr rowList); 57 static int s_SegCompare(const void *i, 58 const void *j); 59 static Boolean s_IsInterleaved (ValNodePtr rowList, 60 Int2 PNTR idCount); 61 static Boolean s_ProcessInterId (CharPtr newIdStr, 62 PatternInfoPtr pattern, 63 AlignFileDataPtr fileInfoPtr, 64 Boolean isMaybe); 65 static Boolean s_ProcessInterSeq 
(CharPtr newSeqStr, 66 PatternInfoPtr pattern, 67 AlignFileDataPtr fileInfoPtr, 68 Boolean isMaybe); 69 static Boolean s_AnalyzeInterleaved (ValNodePtr rowList, 70 AlignFileDataPtr fileInfoPtr, 71 Int2 idCount); 72 static Boolean s_ProcessContigId (CharPtr newIdStr, 73 PatternInfoPtr pattern, 74 AlignFileDataPtr fileInfoPtr); 75 static Boolean s_ProcessContigSeq (CharPtr newSeqStr, 76 PatternInfoPtr pattern, 77 AlignFileDataPtr fileInfoPtr); 78 static Boolean s_AnalyzeContiguous (ValNodePtr rowList, 79 AlignFileDataPtr fileInfoPtr); 80 static Boolean s_AnalyzeContents (ValNodePtr rowList, 81 AlignFileDataPtr fileInfoPtr); 82 static void s_SortErrors (AlignFileDataPtr fileInfoPtr); 83 static void s_AnalyzeErrors (AlignFileDataPtr fileInfoPtr); 84 static Boolean s_CheckContext (ValNodePtr rowList, 85 AlignFileDataPtr fileInfoPtr); 86 87 88 /*=========================================================================*/ 89 /* */ 90 /* Ali_GetConfig () -- Get the current configuration settings. */ 91 /* */ 92 /*=========================================================================*/ 93 94 AliConfigInfoPtr Ali_GetConfig (void) 95 { 96 97 AliConfigInfoPtr configPtr; 98 99 /* If configuration hasn't been set yet, */ 100 /* then set it to the defaults. 
*/ 101 102 if (s_configurationSet == FALSE) 103 { 104 s_configInfo.useMaybes = ALI_USE_MAYBES; 105 s_configInfo.readBuffSize = ALI_READ_BUFFSIZE; 106 s_configInfo.debugLevel = ALI_SHOW_NONE; 107 s_configInfo.corruptSeqThreshold = ALI_CORRUPT_SEQ_THRESHOLD; 108 s_configInfo.nuclLineMinThreshold = ALI_NUCL_LINE_MIN_THRESHOLD; 109 s_configInfo.nuclLineMaxThreshold = ALI_NUCL_LINE_MAX_THRESHOLD; 110 s_configInfo.errExpandLevel = ALI_ERRMSG_EXPAND_SOME; 111 s_configInfo.declaredInfo.dataType = ALI_UNKNOWN; 112 s_configInfo.declaredInfo.contigOrInter = ALI_UNKNOWN; 113 s_configInfo.declaredInfo.idCount = 0; 114 s_configInfo.declaredInfo.seqLength = 0; 115 116 s_configInfo.gapChar = (CharPtr) MemNew (32); 117 sprintf (s_configInfo.gapChar , "%c%c", ALI_GAP_CHAR, '.'); 118 s_configInfo.missingChar = (CharPtr) MemNew (32); 119 sprintf (s_configInfo.missingChar, "%c", ALI_MISSING_CHAR); 120 121 s_configurationSet = TRUE; 122 } 123 124 /* Copy the current settings to the return struct */ 125 126 configPtr = (AliConfigInfoPtr) MemNew (sizeof (AliConfigInfo)); 127 MemSet (configPtr, 0, sizeof (AliConfigInfo)); 128 129 configPtr->useMaybes = s_configInfo.useMaybes; 130 configPtr->readBuffSize = s_configInfo.readBuffSize; 131 configPtr->debugLevel = s_configInfo.debugLevel; 132 configPtr->corruptSeqThreshold = s_configInfo.corruptSeqThreshold; 133 configPtr->nuclLineMinThreshold = s_configInfo.nuclLineMinThreshold; 134 configPtr->nuclLineMaxThreshold = s_configInfo.nuclLineMaxThreshold; 135 configPtr->errExpandLevel = s_configInfo.errExpandLevel; 136 137 configPtr->gapChar = (CharPtr) MemNew (32); 138 StringCpy (configPtr->gapChar, s_configInfo.gapChar); 139 configPtr->missingChar = (CharPtr) MemNew (32); 140 StringCpy (configPtr->missingChar, s_configInfo.missingChar); 141 142 /* Return successfully */ 143 144 return configPtr; 145 } 146 147 /*=========================================================================*/ 148 /* */ 149 /* Ali_SetConfig () - Sets various runtime 
configuration options used by */ 150 /* the Ali_Read () function. */ 151 /* */ 152 /* configPtr */ 153 /* --------- */ 154 /* */ 155 /* The configPtr parameter contains new values for one or more */ 156 /* configuration settings. The values that are applied are selected by */ 157 /* the options parameter. */ 158 /* */ 159 /* gapChar - [default: '-'] -- This is the character that will be used */ 160 /* as the gap character if the file does not define one. */ 161 /* */ 162 /* missingChar - [default: '?'] -- This is the character that will be */ 163 /* used as missing character if the file does not define */ 164 /* one. */ 165 /* */ 166 /* useMaybes - [default: FALSE] -- If a line is found that doesn't */ 167 /* quite meet the criteria for being a sequence, but is */ 168 /* close enough that it might be a slightly mangled */ 169 /* sequence line, then it is marked as a 'maybe'. The */ 170 /* useMaybes setting determines how these 'maybe' */ 171 /* sequences are treated. If set to FALSE, they ARE NOT */ 172 /* treated as sequences, if set to TRUE they ARE treated */ 173 /* as sequences. */ 174 /* */ 175 /* readBuffSize - [default: 2048] -- This is size (in bytes) of the */ 176 /* chunks that are read when reading in the file. */ 177 /* Setting it to higher values may increase the */ 178 /* efficiency, but with operating system and hardware */ 179 /* buffering going on, it probably doesn't make much */ 180 /* difference. */ 181 /* */ 182 /* debugLevel - [default: ALI_SHOW_NONE] -- Determines what debugging */ 183 /* information to display to stderr during processing. */ 184 /* Can be set to one of the following: */ 185 /* */ 186 /* ALI_SHOW_NONE : Show no debugging info [default] */ 187 /* ALI_SHOW_SEQUENCES : Show lines classified as seqs */ 188 /* ALI_SHOW_DEFLINES : Show lines classified as deflines*/ 189 /* ALI_SHOW_OTHERS : Show lines classified as others */ 190 /* (ie, not sequences or deflines). 
*/ 191 /* ALI_SHOW_ALL : Show all lines and their */ 192 /* classification. */ 193 /* */ 194 /* corruptSeqThreshold - [Default: 95] -- Used to guess that a line is */ 195 /* actually a corrupted sequence. If the line */ 196 /* contains a percentage of sequence characters */ 197 /* equal to or above the corruptSeqThreshold */ 198 /* then it is marked as maybe a sequence line. */ 199 /* */ 200 /* nuclLineMaxThreshold - [Default: 75] -- Used to determine whether a */ 201 /* sequence is DNA or protein. If the line has */ 202 /* MORE than nuclLineMaxThreshold percent of */ 203 /* the characters "ACGT" and the missing and */ 204 /* gap chars (and all the other characters are */ 205 /* ambiguous protein/DNA characters), then it */ 206 /* is marked as a nucleotide sequence. */ 207 /* */ 208 /* nuclLineMinThreshold - [Default: 25] -- Used to determine whether a */ 209 /* sequence is DNA or protein. If the line has */ 210 /* LESS than nuclLineMinThreshold percent of */ 211 /* the characters "ACGT" and the missing and */ 212 /* gap chars (and all the other characters are */ 213 /* ambiguous protein/DNA characters), then it */ 214 /* is marked as a protein sequence. */ 215 /* */ 216 /* errExpandLevel - */ 217 /* */ 218 /* */ 219 /* options parameter */ 220 /* ----------------- */ 221 /* */ 222 /* The options parameter determines which fields in the configPtr are */ 223 /* being given new values. It contains one or more of the following */ 224 /* values OR'd together : */ 225 /* */ 226 /* ALI_SET_DEFAULTS */ 227 /* ALI_SET_ALL */ 228 /* */ 229 /* ALI_SET_GAP_CHAR */ 230 /* ALI_SET_MISSING_CHAR */ 231 /* ALI_SET_MAYBES */ 232 /* ALI_SET_READBUFF */ 233 /* ALI_SET_NUCL_MIN */ 234 /* ALI_SET_NUCL_MAX */ 235 /* ALI_SET_CORRUPT_MAX */ 236 /* ALI_SET_DEBUG_LEVEL */ 237 /* ALI_SET_ERRMSG_EXPAND */ 238 /* */ 239 /* If ALI_SET_DEFAULTS or ALI_SET_ALL are used then any others are */ 240 /* ignored. 
*/ 241 /* */ 242 /*=========================================================================*/ 243 244 Boolean Ali_SetConfig (AliConfigInfoPtr configPtr, 245 Int2 mask) 246 { 247 248 /* If this is the first time called, or we're restoring */ 249 /* the defaults, then set all options to the defaults. */ 250 251 if ((s_configurationSet == FALSE) || 252 (configPtr == NULL) || 253 (mask == ALI_SET_DEFAULTS)) 254 { 255 s_configInfo.useMaybes = ALI_USE_MAYBES; 256 s_configInfo.readBuffSize = ALI_READ_BUFFSIZE; 257 s_configInfo.debugLevel = ALI_SHOW_NONE; 258 s_configInfo.corruptSeqThreshold = ALI_CORRUPT_SEQ_THRESHOLD; 259 s_configInfo.nuclLineMinThreshold = ALI_NUCL_LINE_MIN_THRESHOLD; 260 s_configInfo.nuclLineMaxThreshold = ALI_NUCL_LINE_MAX_THRESHOLD; 261 s_configInfo.declaredInfo.dataType = ALI_UNKNOWN; 262 s_configInfo.declaredInfo.contigOrInter = ALI_UNKNOWN; 263 s_configInfo.errExpandLevel = ALI_ERRMSG_EXPAND_SOME; 264 s_configInfo.declaredInfo.idCount = 0; 265 s_configInfo.declaredInfo.seqLength = 0; 266 s_configInfo.gapChar = (CharPtr) MemNew (32); 267 sprintf (s_configInfo.gapChar , "%c%c", ALI_GAP_CHAR, '.'); 268 s_configInfo.missingChar = (CharPtr) MemNew (32); 269 sprintf (s_configInfo.missingChar, "%c", ALI_MISSING_CHAR); 270 } 271 272 s_configurationSet = TRUE; 273 274 /* If we're setting to the defaults, then we're done */ 275 276 if ((configPtr == NULL) || (mask == ALI_SET_DEFAULTS)) 277 return TRUE; 278 279 /* Otherwise, override the current settings */ 280 /* where instructed. 
*/ 281 282 if ((mask & ALI_SET_GAP_CHAR) || (mask == ALI_SET_ALL)) 283 StringCpy (s_configInfo.gapChar, configPtr->gapChar); 284 285 if ((mask & ALI_SET_MISSING_CHAR) || (mask == ALI_SET_ALL)) 286 StringCpy (s_configInfo.missingChar, configPtr->missingChar); 287 288 if ((mask & ALI_SET_MAYBES) || (mask == ALI_SET_ALL)) 289 s_configInfo.useMaybes = configPtr->useMaybes; 290 291 if ((mask & ALI_SET_READBUFF) || (mask == ALI_SET_ALL)) 292 s_configInfo.readBuffSize = configPtr->readBuffSize; 293 294 if ((mask & ALI_SET_NUCL_MIN) || (mask == ALI_SET_ALL)) 295 s_configInfo.nuclLineMinThreshold = configPtr->nuclLineMinThreshold; 296 297 if ((mask & ALI_SET_NUCL_MAX) || (mask == ALI_SET_ALL)) 298 s_configInfo.nuclLineMaxThreshold = configPtr->nuclLineMaxThreshold; 299 300 if ((mask & ALI_SET_CORRUPT_MAX) || (mask == ALI_SET_ALL)) 301 s_configInfo.corruptSeqThreshold = configPtr->corruptSeqThreshold; 302 303 if ((mask & ALI_SET_DEBUG_LEVEL) || (mask == ALI_SET_ALL)) 304 s_configInfo.debugLevel = configPtr->debugLevel; 305 306 if ((mask & ALI_SET_ERRMSG_EXPAND) || (mask == ALI_SET_ALL)) 307 s_configInfo.errExpandLevel = configPtr->errExpandLevel; 308 309 /* Return successfully */ 310 311 return TRUE; 312 } 313 314 /*=========================================================================*/ 315 /* */ 316 /* s_FreeErrorNode () - Free one error structure. */ 317 /* */ 318 /*=========================================================================*/ 319 320 static void s_FreeErrorNode (ErrInfoPtr errorPtr) 321 { 322 if (errorPtr->info != NULL) 323 { 324 MemFree (errorPtr->info); 325 errorPtr->info = NULL; 326 } 327 if (errorPtr->extraInfo != NULL) 328 { 329 MemFree (errorPtr->extraInfo); 330 errorPtr->extraInfo = NULL; 331 } 332 MemFree (errorPtr); 333 } 334 335 /*=========================================================================*/ 336 /* */ 337 /* s_FreeErrorList () - Free a linked list of error structures and all */ 338 /* the memory that they point to. 
*/ 339 /* */ 340 /*=========================================================================*/ 341 342 static void s_FreeErrorList (ErrInfoPtr errorPtr) 343 { 344 ErrInfoPtr currentErr; 345 346 while (errorPtr != NULL) 347 { 348 currentErr = errorPtr; 349 errorPtr = errorPtr->next; 350 s_FreeErrorNode (currentErr); 351 } 352 } 353 354 /*=========================================================================*/ 355 /* */ 356 /* s_FreeSequenceList () - Free a linked list of SeqPart structures and */ 357 /* all the memory that they point to. */ 358 /* */ 359 /*=========================================================================*/ 360 361 static void s_FreeSequenceList (SeqPartPtr seqPtr) 362 { 363 SeqPartPtr currentSeq; 364 365 while (seqPtr != NULL) 366 { 367 MemFree (seqPtr->sequence); 368 currentSeq = seqPtr; 369 seqPtr = seqPtr->next; 370 MemFree (currentSeq); 371 } 372 } 373 374 /*=========================================================================*/ 375 /* */ 376 /* s_FreeIdList () - Free a linked list of ID structures and all the */ 377 /* memory that they point to. */ 378 /* */ 379 /*=========================================================================*/ 380 381 static void s_FreeIdList (IdInfoPtr idPtr) 382 { 383 IdInfoPtr currentId; 384 385 while (idPtr != NULL) 386 { 387 MemFree (idPtr->id); 388 s_FreeSequenceList (idPtr->sequence); 389 MemFree (idPtr->defline); 390 currentId = idPtr; 391 idPtr = idPtr->next; 392 MemFree (currentId); 393 } 394 } 395 396 /*=========================================================================*/ 397 /* */ 398 /* s_FreeParsedInfo () - Free a ParsedInfo structure and the memory that */ 399 /* it points to. 
*/ 400 /* */ 401 /*=========================================================================*/ 402 403 static void s_FreeParsedInfo (ParsedInfoPtr info) 404 { 405 if (info->missingChar != NULL) 406 MemFree (info->missingChar); 407 if (info->gapChar != NULL) 408 MemFree (info->gapChar); 409 if (info->unalignedChar != NULL) 410 MemFree (info->unalignedChar); 411 MemFree (info); 412 } 413 414 /*=========================================================================*/ 415 /* */ 416 /* Ali_Free () - Free a AlignFileData structure and all the memory that */ 417 /* it points to. */ 418 /* */ 419 /*=========================================================================*/ 420 421 void Ali_Free (AlignFileDataPtr fileInfoPtr) 422 { 423 424 s_FreeIdList (fileInfoPtr->sequences); 425 fileInfoPtr->sequences = NULL; 426 s_FreeIdList (fileInfoPtr->maybes); 427 fileInfoPtr->maybes = NULL; 428 s_FreeErrorList (fileInfoPtr->errors); 429 fileInfoPtr->errors = NULL; 430 s_FreeParsedInfo (fileInfoPtr->info); 431 fileInfoPtr->info = NULL; 432 433 MemFree (fileInfoPtr); 434 435 return; 436 } 437 438 /*=========================================================================*/ 439 /* */ 440 /* s_FreeRowList () - Free all row data structures and the strings that */ 441 /* they point to. */ 442 /* */ 443 /* NOTE: The actual data strings in the row list may be pointed */ 444 /* to by other structures, in which case */ 445 /* s_FreeRowList_Safe () should be used instead. 
*/
/*                                                                         */
/*=========================================================================*/

static void s_FreeRowList (ValNodePtr rowList)
{
  ValNodePtr       nextRow;
  SeqLineInfoPtr   seqInfo;
  DefLineInfoPtr   defInfo;
  OtherLineInfoPtr otherInfo;

  for ( ; rowList != NULL; rowList = nextRow)
    {
      /* Release the payload according to the kind of row;  */
      /* rows of any other kind only have their node freed. */

      if (rowList->choice == ALI_DEFLINE)
        {
          defInfo = (DefLineInfoPtr) rowList->data.ptrvalue;
          if (defInfo->definitions != NULL)
            MemFree (defInfo->definitions);
          if (defInfo->id != NULL)
            MemFree (defInfo->id);
          MemFree (defInfo);
        }
      else if (rowList->choice == ALI_SEQLINE)
        {
          seqInfo = (SeqLineInfoPtr) rowList->data.ptrvalue;
          if (seqInfo->sequence != NULL)
            MemFree (seqInfo->sequence);
          if (seqInfo->id != NULL)
            MemFree (seqInfo->id);
          if (seqInfo->junk != NULL)
            MemFree (seqInfo->junk);
          MemFree (seqInfo);
        }
      else if (rowList->choice == ALI_OTHERLINE)
        {
          otherInfo = (OtherLineInfoPtr) rowList->data.ptrvalue;
          if (otherInfo->other != NULL)
            MemFree (otherInfo->other);
          if (otherInfo->id != NULL)
            MemFree (otherInfo->id);
          MemFree (otherInfo);
        }

      /* Then release the list node itself */

      nextRow = rowList->next;
      MemFree (rowList);
    }
}

/*=========================================================================*/
/*                                                                         */
/*  s_FreeRowList_Safe () - Free all row data structures, but don't free   */
/*                          the strings that they point since they are     */
/*                          still being used in the ID structures.
*/ 500 /* */ 501 /*=========================================================================*/ 502 503 static void s_FreeRowList_Safe (ValNodePtr rowList) 504 { 505 ValNodePtr currentRow; 506 SeqLineInfoPtr seqLine; 507 DefLineInfoPtr defLine; 508 OtherLineInfoPtr otherLine; 509 510 while (rowList != NULL) 511 { 512 switch (rowList->choice) 513 { 514 case ALI_DEFLINE : 515 defLine = (DefLineInfoPtr) rowList->data.ptrvalue; 516 MemFree (defLine); 517 break; 518 case ALI_SEQLINE : 519 seqLine = (SeqLineInfoPtr) rowList->data.ptrvalue; 520 MemFree (seqLine); 521 break; 522 case ALI_OTHERLINE : 523 otherLine = (OtherLineInfoPtr) rowList->data.ptrvalue; 524 MemFree (otherLine); 525 break; 526 default: 527 break; 528 } 529 currentRow = rowList; 530 rowList = rowList->next; 531 MemFree (currentRow); 532 } 533 } 534 535 /*=========================================================================*/ 536 /* */ 537 /* s_GetRowIdStr () */ 538 /* */ 539 /*=========================================================================*/ 540 541 static CharPtr s_GetRowIdString (ValNodePtr row) 542 { 543 CharPtr newIdStr; 544 SeqLineInfoPtr seqLinePtr; 545 DefLineInfoPtr defLinePtr; 546 OtherLineInfoPtr otherLinePtr; 547 548 if (row == NULL) 549 return NULL; 550 551 if (row->choice == ALI_SEQLINE) 552 { 553 seqLinePtr = (SeqLineInfoPtr) row->data.ptrvalue; 554 if (seqLinePtr->id != NULL) 555 { 556 if ((seqLinePtr->maybe == TRUE) && (s_configInfo.useMaybes == FALSE)) 557 newIdStr = NULL; 558 else 559 newIdStr = seqLinePtr->id; 560 } 561 else 562 newIdStr = NULL; 563 } 564 else if (row->choice == ALI_DEFLINE) 565 { 566 defLinePtr = (DefLineInfoPtr) row->data.ptrvalue; 567 if (defLinePtr->id != NULL) 568 newIdStr = defLinePtr->id; 569 else 570 newIdStr = NULL; 571 } 572 else if (row->choice == ALI_OTHERLINE) 573 { 574 otherLinePtr = (OtherLineInfoPtr) row->data.ptrvalue; 575 if (otherLinePtr->id != NULL) 576 newIdStr = otherLinePtr->id; 577 else 578 newIdStr = NULL; 579 } 580 581 return 
newIdStr; 582 } 583 584 /*=========================================================================*/ 585 /* */ 586 /* s_GetRowSeqStr () */ 587 /* */ 588 /*=========================================================================*/ 589 590 static CharPtr s_GetRowSeqString (ValNodePtr row) 591 { 592 CharPtr newSeqStr; 593 SeqLineInfoPtr seqLinePtr; 594 595 if (row == NULL) 596 return NULL; 597 598 if (row->choice == ALI_SEQLINE) 599 { 600 seqLinePtr = (SeqLineInfoPtr) row->data.ptrvalue; 601 if (seqLinePtr->sequence != NULL) 602 { 603 if ((seqLinePtr->maybe == TRUE) && (s_configInfo.useMaybes == FALSE)) 604 newSeqStr = NULL; 605 else 606 newSeqStr = seqLinePtr->sequence; 607 } 608 else 609 newSeqStr = NULL; 610 } 611 else 612 newSeqStr = NULL; 613 614 return newSeqStr; 615 } 616 617 /*=========================================================================*/ 618 /* */ 619 /* s_ProcessMaybes () */ 620 /* */ 621 /*=========================================================================*/ 622 623 static IdInfoPtr s_ProcessMaybes (ValNodePtr rowList) 624 { 625 ValNodePtr currentRow; 626 IdInfoPtr badIdList = NULL; 627 IdInfoPtr existingId = NULL; 628 IdInfoPtr currentId = NULL; 629 IdInfoPtr lastId = NULL; 630 CharPtr idStr; 631 CharPtr currentIdStr; 632 SeqPartPtr newSeqPart; 633 SeqPartPtr lastSeqPart; 634 SeqLineInfoPtr seqLinePtr; 635 636 currentRow = rowList; 637 638 while (currentRow != NULL) 639 { 640 idStr = s_GetRowIdString (currentRow); 641 if (idStr != NULL) 642 currentIdStr = idStr; 643 644 if (currentRow->choice == ALI_SEQLINE) 645 { 646 seqLinePtr = (SeqLineInfoPtr) currentRow->data.ptrvalue; 647 if (seqLinePtr->maybe == TRUE) 648 { 649 650 /* Find the ID that this sequence 'belongs to' */ 651 652 existingId = badIdList; 653 while (existingId != NULL) 654 { 655 if (StringCmp(existingId->id,currentIdStr) == 0) 656 break; 657 existingId = existingId->next; 658 } 659 660 if (existingId != NULL) 661 currentId = existingId; 662 else 663 { 664 currentId = 
(IdInfoPtr) MemNew (sizeof(IdInfo)); 665 if (currentId == NULL) 666 return NULL; 667 668 currentId->sequence = NULL; 669 currentId->id = currentIdStr; 670 currentId->length = 0; 671 currentId->next = NULL; 672 673 if (badIdList == NULL) 674 badIdList = currentId; 675 else 676 { 677 lastId = badIdList; 678 while (lastId->next != NULL) 679 lastId = lastId->next; 680 lastId->next = currentId; 681 } 682 } 683 684 /* Add the sequence to the current ID */ 685 686 newSeqPart = (SeqPartPtr) MemNew(sizeof(SeqPart)); 687 if (newSeqPart == NULL) 688 return NULL; 689 690 newSeqPart->sequence = (CharPtr) currentRow->data.ptrvalue; 691 newSeqPart->next = NULL; 692 693 if (currentId->sequence == NULL) 694 currentId->sequence = newSeqPart; 695 else 696 lastSeqPart->next = newSeqPart; 697 698 currentId->length += StringLen (newSeqPart->sequence); 699 lastSeqPart = newSeqPart; 700 701 } 702 } 703 currentRow = currentRow->next; 704 } 705 706 return badIdList; 707 } 708 709 /*=========================================================================*/ 710 /* */ 711 /* DisplayRowList() - Prints to stderr the linked list of ValNodes that */ 712 /* contain the data read in from the alignment file. 
*/ 713 /* */ 714 /*=========================================================================*/ 715 716 static void s_DisplayRowList (ValNodePtr rowList, 717 Int2 mask) 718 { 719 ValNodePtr currRow; 720 SeqLineInfoPtr seqLinePtr; 721 DefLineInfoPtr defLinePtr; 722 OtherLineInfoPtr otherLinePtr; 723 Char cLineType; 724 725 currRow = rowList; 726 while (currRow != NULL) 727 { 728 if ((currRow->choice == ALI_SEQLINE) && 729 ((mask & ALI_SHOW_SEQUENCES) || 730 (mask == ALI_SHOW_ALL))) 731 { 732 seqLinePtr = (SeqLineInfoPtr) currRow->data.ptrvalue; 733 734 if (seqLinePtr->type == ALI_NUCLEOTIDE) 735 cLineType = 'N'; 736 else if (seqLinePtr->type == ALI_PROTEIN) 737 cLineType = 'P'; 738 else if (seqLinePtr->type == ALI_AMBIGUOUS) 739 cLineType = 'U'; 740 741 if (seqLinePtr->maybe == FALSE) 742 { 743 if (seqLinePtr->id != NULL) 744 fprintf(stderr,"%04d: ID : %s\n", 745 seqLinePtr->rowNum, 746 seqLinePtr->id); 747 if (seqLinePtr->sequence != NULL) 748 fprintf(stderr,"%04d: SEQUENCE[%c] : %s\n", 749 seqLinePtr->rowNum, 750 cLineType, 751 seqLinePtr->sequence); 752 } 753 else 754 { 755 if (seqLinePtr->id != NULL) 756 fprintf(stderr,"%04d: MAYBE ID : %s\n", 757 seqLinePtr->rowNum, 758 seqLinePtr->id); 759 if (seqLinePtr->sequence != NULL) 760 fprintf(stderr,"%04d: MAYBE SEQUENCE[%c] : %s\n", 761 seqLinePtr->rowNum, 762 cLineType, 763 seqLinePtr->sequence); 764 } 765 } 766 else if ((currRow->choice == ALI_DEFLINE) && 767 ((mask & ALI_SHOW_DEFLINES) || 768 (mask == ALI_SHOW_ALL))) 769 { 770 defLinePtr = (DefLineInfoPtr) currRow->data.ptrvalue; 771 if (defLinePtr->id != NULL) 772 fprintf(stderr,"%04d: DEFLINE ID : %s\n", 773 defLinePtr->rowNum, 774 defLinePtr->id); 775 if (defLinePtr->definitions != NULL) 776 fprintf(stderr,"%04d: DEFLINE DEFINITIONS : %s\n", 777 defLinePtr->rowNum, 778 defLinePtr->definitions); 779 } 780 else if ((currRow->choice == ALI_OTHERLINE) && 781 ((mask & ALI_SHOW_OTHERS) || 782 (mask == ALI_SHOW_ALL))) 783 { 784 otherLinePtr = (OtherLineInfoPtr) 
currRow->data.ptrvalue; 785 if (otherLinePtr->id != NULL) 786 fprintf(stderr,"%04d: OTHER ID : %s\n", otherLinePtr->rowNum, 787 otherLinePtr->id); 788 if (otherLinePtr->other != NULL) 789 fprintf(stderr,"%04d: OTHER : %s\n", otherLinePtr->rowNum, 790 otherLinePtr->other); 791 } 792 currRow = currRow->next; 793 } 794 795 return; 796 } 797 798 799 /*=========================================================================*/ 800 /* */ 801 /* s_isInterleaved () */ 802 /* */ 803 /*=========================================================================*/ 804 805 static Boolean s_IsInterleaved (ValNodePtr rowList, 806 Int2 PNTR idCount) 807 { 808 ValNodePtr currentRow; 809 CharPtr newIdStr; 810 IdInfoPtr idList = NULL; 811 IdInfoPtr lastId = NULL; 812 IdInfoPtr currentId = NULL; 813 IdInfoPtr existingId = NULL; 814 Boolean isInterleaved; 815 Int4 patternRowCount; 816 Int4 patternCharCount; 817 Int4 currentRowCount; 818 Int4 currentCharCount; 819 Boolean isFirstId; 820 SeqLineInfoPtr seqLinePtr; 821 DefLineInfoPtr defLinePtr; 822 OtherLineInfoPtr otherLinePtr; 823 Boolean isMaybe; 824 825 isInterleaved = FALSE; 826 currentRow = rowList; 827 828 patternRowCount = 0; 829 patternCharCount = 0; 830 currentRowCount = 0; 831 currentCharCount = 0; 832 isFirstId = TRUE; 833 *idCount = 0; 834 835 /* Search the row list for IDs */ 836 837 while (currentRow != NULL) 838 { 839 840 /* Look for an ID */ 841 842 newIdStr = NULL; 843 isMaybe = FALSE; 844 845 if (currentRow->choice == ALI_SEQLINE) 846 { 847 seqLinePtr = (SeqLineInfoPtr) currentRow->data.ptrvalue; 848 if (seqLinePtr->id != NULL) 849 { 850 if ((seqLinePtr->maybe == TRUE) && 851 (s_configInfo.useMaybes == FALSE)) 852 newIdStr = NULL; 853 else 854 newIdStr = seqLinePtr->id; 855 } 856 } 857 else if (currentRow->choice == ALI_DEFLINE) 858 { 859 defLinePtr = (DefLineInfoPtr) currentRow->data.ptrvalue; 860 if (defLinePtr->id != NULL) 861 newIdStr = defLinePtr->id; 862 } 863 else if (currentRow->choice == ALI_OTHERLINE) 864 { 865 
otherLinePtr = (OtherLineInfoPtr) currentRow->data.ptrvalue; 866 if (otherLinePtr->id != NULL) 867 newIdStr = otherLinePtr->id; 868 } 869 870 /* If we find an ID, see if it's one */ 871 /* that we already have. */ 872 873 if (newIdStr != NULL) 874 { 875 876 existingId = idList; 877 while (existingId != NULL) 878 { 879 if (StringCmp(existingId->id,newIdStr) == 0) 880 break; 881 existingId = existingId->next; 882 } 883 884 /* Already have -- break and return TRUE */ 885 886 if (existingId != NULL) 887 { 888 isInterleaved = TRUE; 889 break; 890 } 891 892 /* Otherwise, add the ID to the list */ 893 894 currentRowCount = 0; 895 currentCharCount = 0; 896 897 if (idList != NULL) 898 isFirstId = FALSE; 899 900 (*idCount)++; 901 902 currentId = (IdInfoPtr) MemNew (sizeof(IdInfo)); 903 if (currentId == NULL) 904 return FALSE; 905 906 currentId->sequence = NULL; 907 currentId->id = newIdStr; 908 currentId->length = 0; 909 currentId->next = NULL; 910 911 if (idList == NULL) 912 idList = currentId; 913 else 914 { 915 lastId = idList; 916 while (lastId->next != NULL) 917 lastId = lastId->next; 918 lastId->next = currentId; 919 } 920 } 921 922 /* Process sequence rows */ 923 924 if (currentRow->choice == ALI_SEQLINE) 925 { 926 927 if (seqLinePtr->sequence != NULL) 928 if ((s_configInfo.useMaybes == TRUE) || 929 (s_configInfo.useMaybes == FALSE) && 930 (seqLinePtr->maybe == FALSE)) 931 { 932 /* There must be an ID before the first sequence */ 933 934 if (currentId == NULL) 935 { 936 isInterleaved = FALSE; 937 break; 938 } 939 940 /* Look for sequences that probably */ 941 /* have no ID assigned to them. 
*/ 942 943 seqLinePtr = (SeqLineInfoPtr) currentRow->data.ptrvalue; 944 if (isFirstId) 945 { 946 patternRowCount++; 947 patternCharCount += StringLen (seqLinePtr->sequence); 948 } 949 else 950 { 951 currentRowCount++; 952 currentCharCount += StringLen (seqLinePtr->sequence); 953 if ((currentRowCount > patternRowCount) && 954 (currentCharCount > patternCharCount)) 955 { 956 isInterleaved = TRUE; 957 break; 958 } 959 } 960 } 961 962 } 963 964 /* Go to next row */ 965 966 currentRow = currentRow->next; 967 } 968 969 /* Delete the ID records that we created */ 970 /* NOTE -- The ID strings themselves */ 971 /* are stored elsewhere and */ 972 /* only pointed to here, so */ 973 /* DON"T delete them. o */ 974 975 while (idList != NULL) 976 { 977 lastId = idList; 978 idList = idList->next; 979 MemFree(lastId); 980 } 981 982 /* Return result of search */ 983 984 return isInterleaved; 985 } 986 987 /*=========================================================================*/ 988 /* */ 989 /* s_ProcessInterId () */ 990 /* */ 991 /*=========================================================================*/ 992 993 static Boolean s_ProcessInterId (CharPtr newIdStr, 994 PatternInfoPtr pattern, 995 AlignFileDataPtr fileInfoPtr, 996 Boolean isMaybe) 997 { 998 IdInfoPtr lastId = NULL; 999 IdInfoPtr existingId = NULL; 1000 ErrInfoPtr errPtr; 1001 1002 /* If we've got all our ID's then */ 1003 /* ignore any further ones. */ 1004 1005 if (pattern->gotAllIds == TRUE) 1006 return TRUE; 1007 1008 /* All ID's, except for the first one, should */ 1009 /* immediately follow a sequence line. */ 1010 1011 if (pattern->isFirstId == FALSE) 1012 { 1013 if (pattern->lastRow->choice != ALI_SEQLINE) 1014 { 1015 errPtr = Ali_AddError (&(fileInfoPtr->errors), 1016 ERR_ID_NO_PRECEDING_SEQ, 1017 newIdStr); 1018 return FALSE; 1019 } 1020 else 1021 pattern->isFirstGroup = FALSE; 1022 } 1023 1024 /* If this id already exists, */ 1025 /* make it the current ID. 
*/ 1026 1027 existingId = fileInfoPtr->sequences; 1028 while (existingId != NULL) 1029 { 1030 if (StringCmp(existingId->id,newIdStr) == 0) 1031 break; 1032 existingId = existingId->next; 1033 } 1034 1035 if (existingId != NULL) 1036 pattern->currentId = existingId; 1037 1038 /* Otherwise create a new Id record */ 1039 /* and add it to the end of list. */ 1040 1041 else 1042 { 1043 pattern->currentId = (IdInfoPtr) MemNew (sizeof(IdInfo)); 1044 if (pattern->currentId == NULL) 1045 { 1046 Ali_AddError (&(fileInfoPtr->errors), ERR_OUT_OF_MEMORY); 1047 return FALSE; 1048 } 1049 1050 pattern->currentId->sequence = NULL; 1051 pattern->currentId->id = newIdStr; 1052 pattern->currentId->length = 0; 1053 pattern->currentId->next = NULL; 1054 1055 if (fileInfoPtr->sequences == NULL) 1056 fileInfoPtr->sequences = pattern->currentId; 1057 else 1058 { 1059 lastId = fileInfoPtr->sequences; 1060 while (lastId->next != NULL) 1061 lastId = lastId->next; 1062 lastId->next = pattern->currentId; 1063 } 1064 1065 pattern->currentIdCount++; 1066 if (pattern->currentIdCount == pattern->idCount) 1067 pattern->gotAllIds = TRUE; 1068 } 1069 1070 if (pattern->isFirstId) 1071 pattern->isFirstId = FALSE; 1072 1073 /* Return successfully */ 1074 1075 return TRUE; 1076 } 1077 1078 /*=========================================================================*/ 1079 /* */ 1080 /* s_ProcessInterSeq () */ 1081 /* */ 1082 /*=========================================================================*/ 1083 1084 static Boolean s_ProcessInterSeq (CharPtr newSeqStr, 1085 PatternInfoPtr pattern, 1086 AlignFileDataPtr fileInfoPtr, 1087 Boolean isMaybe) 1088 { 1089 SeqPartPtr newSeqPart = NULL; 1090 ErrInfoPtr errPtr = NULL; 1091 1092 /* There must be an ID before the first sequence */ 1093 1094 if (pattern->currentId == NULL) 1095 { 1096 errPtr = Ali_AddError (&(fileInfoPtr->errors), ERR_SEQ_WITHOUT_ID, 1097 newSeqStr); 1098 return FALSE; 1099 } 1100 1101 /* Add the sequence to the current ID */ 1102 1103 
newSeqPart = (SeqPartPtr) MemNew(sizeof(SeqPart)); 1104 if (newSeqPart == NULL) 1105 { 1106 Ali_AddError (&(fileInfoPtr->errors), ERR_OUT_OF_MEMORY); 1107 return FALSE; 1108 } 1109 1110 newSeqPart->sequence = newSeqStr; 1111 newSeqPart->next = NULL; 1112 1113 if (pattern->currentId->sequence == NULL) 1114 pattern->currentId->sequence = newSeqPart; 1115 else 1116 pattern->lastSeqPart->next = newSeqPart; 1117 1118 pattern->currentId->length += StringLen (newSeqPart->sequence); 1119 pattern->lastSeqPart = newSeqPart; 1120 1121 /* If we've started repeating IDs then */ 1122 /* rotate through the id list. */ 1123 1124 if (pattern->gotAllIds == TRUE) 1125 { 1126 if (pattern->currentId->next == NULL) 1127 pattern->currentId = fileInfoPtr->sequences; 1128 else 1129 pattern->currentId = pattern->currentId->next; 1130 1131 pattern->lastSeqPart = pattern->currentId->sequence; 1132 while (pattern->lastSeqPart->next != NULL) 1133 pattern->lastSeqPart = pattern->lastSeqPart->next; 1134 } 1135 1136 /* Return successfully */ 1137 1138 return TRUE; 1139 } 1140 1141 /*=========================================================================*/ 1142 /* */ 1143 /* s_AnalyzeInterleaved () */ 1144 /* */ 1145 /*=========================================================================*/ 1146 1147 static Boolean s_AnalyzeInterleaved (ValNodePtr rowList, 1148 AlignFileDataPtr fileInfoPtr, 1149 Int2 idCount) 1150 { 1151 ValNodePtr currentRow; 1152 Boolean isValidPattern; 1153 IdInfoPtr currentId = NULL; 1154 Int4 previousLength; 1155 ErrInfoPtr errPtr; 1156 PatternInfoPtr pattern; 1157 SeqLineInfoPtr seqLinePtr; 1158 DefLineInfoPtr defLinePtr; 1159 OtherLineInfoPtr otherLinePtr; 1160 Boolean firstDefline = TRUE; 1161 IdInfoPtr lastId = NULL; 1162 1163 pattern = (PatternInfoPtr) MemNew (sizeof (PatternInfo)); 1164 1165 pattern->currentDeflineId = NULL; 1166 pattern->lastRow = NULL; 1167 pattern->isFirstId = TRUE; 1168 pattern->isFirstGroup = TRUE; 1169 pattern->maybesFound = FALSE; 1170 
pattern->gotAllIds = FALSE; 1171 pattern->idCount = idCount; 1172 pattern->currentIdCount = 0; 1173 1174 pattern->foundInfo.dataType = ALI_UNKNOWN; 1175 pattern->foundInfo.contigOrInter = ALI_UNKNOWN; 1176 pattern->foundInfo.idCount = 0; 1177 pattern->foundInfo.seqLength = 0; 1178 1179 /* Match the sequences up with the IDs */ 1180 1181 currentRow = rowList; 1182 isValidPattern = TRUE; 1183 1184 while (currentRow != NULL) 1185 { 1186 1187 if (currentRow->choice == ALI_SEQLINE) 1188 { 1189 1190 seqLinePtr = (SeqLineInfoPtr) currentRow->data.ptrvalue; 1191 1192 if ((seqLinePtr->maybe == FALSE) || 1193 (seqLinePtr->maybe == TRUE) && (s_configInfo.useMaybes == TRUE)) 1194 { 1195 if (seqLinePtr->id != NULL) 1196 { 1197 /* Process the ID */ 1198 1199 isValidPattern = s_ProcessInterId (seqLinePtr->id, 1200 pattern, 1201 fileInfoPtr, 1202 seqLinePtr->maybe); 1203 if (isValidPattern == FALSE) 1204 break; 1205 } 1206 1207 if (seqLinePtr->sequence != NULL) 1208 { 1209 isValidPattern = s_ProcessInterSeq (seqLinePtr->sequence, 1210 pattern, 1211 fileInfoPtr, 1212 seqLinePtr->maybe); 1213 if (isValidPattern == FALSE) 1214 break; 1215 } 1216 pattern->lastRow = currentRow; 1217 } 1218 else 1219 pattern->maybesFound = TRUE; 1220 } 1221 else if (currentRow->choice == ALI_DEFLINE) 1222 { 1223 defLinePtr = (DefLineInfoPtr) currentRow->data.ptrvalue; 1224 if (defLinePtr->id != NULL) 1225 { 1226 isValidPattern = s_ProcessInterId (defLinePtr->id, 1227 pattern, 1228 fileInfoPtr, 1229 FALSE); 1230 if (isValidPattern == FALSE) 1231 break; 1232 } 1233 if (defLinePtr->definitions != NULL) 1234 { 1235 if (firstDefline) 1236 { 1237 firstDefline = FALSE; 1238 pattern->currentDeflineId = fileInfoPtr->sequences; 1239 } 1240 else 1241 pattern->currentDeflineId = 1242 pattern->currentDeflineId->next; 1243 1244 if (pattern->currentDeflineId == NULL) 1245 { 1246 errPtr = Ali_AddError (&(fileInfoPtr->errors), 1247 ERR_DEFLINE_WITH_NO_ID, 1248 defLinePtr->definitions); 1249 errPtr->rowNum = 
defLinePtr->rowNum; 1250 isValidPattern = FALSE; 1251 break; 1252 } 1253 else 1254 { 1255 pattern->currentDeflineId->defline = 1256 defLinePtr->definitions; 1257 } 1258 } 1259 pattern->lastRow = currentRow; 1260 } 1261 else if (currentRow->choice == ALI_OTHERLINE) 1262 { 1263 otherLinePtr = (OtherLineInfoPtr) currentRow->data.ptrvalue; 1264 if (otherLinePtr->id != NULL) 1265 { 1266 isValidPattern = s_ProcessInterId (otherLinePtr->id, 1267 pattern, 1268 fileInfoPtr, 1269 FALSE); 1270 if (isValidPattern == FALSE) 1271 break; 1272 } 1273 pattern->lastRow = currentRow; 1274 } 1275 1276 currentRow = currentRow->next; 1277 } 1278 1279 /* If we found one defline, then */ 1280 /* make sure they were all there */ 1281 1282 if (firstDefline == FALSE) 1283 { 1284 lastId = fileInfoPtr->sequences; 1285 if (lastId != NULL) 1286 { 1287 while (lastId->next != NULL) 1288 lastId = lastId->next; 1289 if (lastId->defline == NULL) 1290 { 1291 errPtr = Ali_AddError (&(fileInfoPtr->errors), 1292 ERR_ID_WITH_NO_DEFLINE, 1293 lastId->id); 1294 isValidPattern = FALSE; 1295 } 1296 } 1297 } 1298 1299 /* If pattern not found, return failure */ 1300 1301 if (!isValidPattern) 1302 return FALSE; 1303 1304 /* If there was a declared number of sequences then */ 1305 /* check to see that it matches the number found. */ 1306 1307 if ((s_configInfo.declaredInfo.idCount !=0) && 1308 (s_configInfo.declaredInfo.idCount != idCount)) 1309 { 1310 errPtr = Ali_AddError (&(fileInfoPtr->errors), ERR_ID_COUNT_MISMATCH, 1311 idCount, s_configInfo.declaredInfo.idCount); 1312 errPtr->level = LEVEL_WARNING; 1313 } 1314 1315 /* Sequences should all be the same length. 
*/ 1316 1317 currentId = fileInfoPtr->sequences; 1318 pattern->isFirstId = TRUE; 1319 1320 while (currentId != NULL) 1321 { 1322 if (pattern->isFirstId) 1323 pattern->isFirstId = FALSE; 1324 else 1325 { 1326 if (previousLength < currentId->length) 1327 { 1328 errPtr = Ali_AddError (&(fileInfoPtr->errors), 1329 ERR_SEQUENCE_TOO_LONG, 1330 currentId->id, 1331 previousLength, 1332 currentId->length); 1333 break; 1334 } 1335 else if (previousLength > currentId->length) 1336 { 1337 errPtr = Ali_AddError (&(fileInfoPtr->errors), 1338 ERR_SEQUENCE_TOO_SHORT, 1339 currentId->id, 1340 previousLength, 1341 currentId->length); 1342 break; 1343 } 1344 } 1345 previousLength = currentId->length; 1346 currentId = currentId->next; 1347 } 1348 1349 /* Check to see that declared sequence */ 1350 /* length matches the lengths found. */ 1351 1352 if ((s_configInfo.declaredInfo.seqLength != 0) && 1353 (s_configInfo.declaredInfo.seqLength != previousLength)) 1354 { 1355 errPtr = Ali_AddError (&(fileInfoPtr->errors),ERR_SEQ_LENGTH_MISMATCH, 1356 previousLength, 1357 s_configInfo.declaredInfo.seqLength); 1358 errPtr->level = LEVEL_WARNING; 1359 } 1360 1361 /* Process the maybes if they weren't used already */ 1362 1363 if (pattern->maybesFound == TRUE) 1364 fileInfoPtr->maybes = s_ProcessMaybes (rowList); 1365 1366 /* Return successfully */ 1367 1368 if (currentId == NULL) 1369 return TRUE; 1370 else 1371 return FALSE; 1372 } 1373 1374 /*=========================================================================*/ 1375 /* */ 1376 /* s_ProcessContigId () */ 1377 /* */ 1378 /*=========================================================================*/ 1379 1380 static Boolean s_ProcessContigId (CharPtr newIdStr, 1381 PatternInfoPtr pattern, 1382 AlignFileDataPtr fileInfoPtr) 1383 { 1384 IdInfoPtr existingId = NULL; 1385 ErrInfoPtr errPtr; 1386 IdInfoPtr lastId = NULL; 1387 1388 if (pattern->isFirstId == FALSE) 1389 { 1390 pattern->isFirstGroup = FALSE; 1391 1392 /* The length of the last 
pattern must match */ 1393 /* the length of previous ones. */ 1394 1395 if (pattern->currentId->length < pattern->foundInfo.seqLength) 1396 { 1397 errPtr = Ali_AddError (&(fileInfoPtr->errors), 1398 ERR_SEQUENCE_TOO_SHORT, 1399 pattern->currentId->id, 1400 pattern->foundInfo.seqLength, 1401 pattern->currentId->length); 1402 return FALSE; 1403 } 1404 } 1405 1406 pattern->hasFullLength = FALSE; 1407 1408 /* See if this ID already exists */ 1409 1410 existingId = fileInfoPtr->sequences; 1411 while (existingId != NULL) 1412 { 1413 if (StringCmp(existingId->id,newIdStr) == 0) 1414 { 1415 errPtr = Ali_AddError (&(fileInfoPtr->errors), ERR_DUPLICATE_IDS, 1416 newIdStr); 1417 return FALSE; 1418 } 1419 existingId = existingId->next; 1420 } 1421 1422 /* If this id already exists, */ 1423 /* make it the current ID. */ 1424 1425 if (existingId != NULL) 1426 pattern->currentId = existingId; 1427 1428 /* Otherwise create a new Id record */ 1429 /* and add it to the end of list. */ 1430 1431 else 1432 { 1433 pattern->currentId = (IdInfoPtr) MemNew (sizeof(IdInfo)); 1434 if (pattern->currentId == NULL) 1435 { 1436 Ali_AddError (&(fileInfoPtr->errors), ERR_OUT_OF_MEMORY); 1437 return FALSE; 1438 } 1439 1440 pattern->currentId->sequence = NULL; 1441 pattern->currentId->id = newIdStr; 1442 pattern->currentId->length = 0; 1443 pattern->currentId->next = NULL; 1444 1445 if (fileInfoPtr->sequences == NULL) 1446 fileInfoPtr->sequences = pattern->currentId; 1447 else 1448 { 1449 lastId = fileInfoPtr->sequences; 1450 while (lastId->next != NULL) 1451 lastId = lastId->next; 1452 lastId->next = pattern->currentId; 1453 } 1454 pattern->foundInfo.idCount++; 1455 } 1456 1457 if (pattern->isFirstId) 1458 pattern->isFirstId = FALSE; 1459 1460 /* Return successfully */ 1461 1462 return TRUE; 1463 } 1464 1465 /*=========================================================================*/ 1466 /* */ 1467 /* s_ProcessContigSeq () */ 1468 /* */ 1469 
/*=========================================================================*/ 1470 1471 static Boolean s_ProcessContigSeq (CharPtr newSeqStr, 1472 PatternInfoPtr pattern, 1473 AlignFileDataPtr fileInfoPtr) 1474 { 1475 SeqPartPtr newSeqPart = NULL; 1476 ErrInfoPtr errPtr; 1477 1478 /* There must be an ID before we get a sequence */ 1479 1480 if (pattern->currentId == NULL) 1481 { 1482 errPtr = Ali_AddError (&(fileInfoPtr->errors), ERR_SEQ_WITHOUT_ID, 1483 newSeqStr); 1484 return FALSE; 1485 } 1486 1487 /* Add the sequence to the current ID */ 1488 1489 newSeqPart = (SeqPartPtr) MemNew(sizeof(SeqPart)); 1490 if (newSeqPart == NULL) 1491 { 1492 Ali_AddError (&(fileInfoPtr->errors), ERR_OUT_OF_MEMORY); 1493 return FALSE; 1494 } 1495 1496 newSeqPart->sequence = newSeqStr; 1497 newSeqPart->next = NULL; 1498 1499 if (pattern->currentId->sequence == NULL) 1500 pattern->currentId->sequence = newSeqPart; 1501 else 1502 pattern->lastSeqPart->next = newSeqPart; 1503 1504 /* Make sure that sequence length hasn't */ 1505 /* exceeded that of previous sequences. 
*/ 1506 1507 pattern->currentId->length += StringLen (newSeqPart->sequence); 1508 pattern->lastSeqPart = newSeqPart; 1509 1510 if (pattern->isFirstGroup) 1511 { 1512 pattern->foundInfo.seqLength += StringLen (newSeqPart->sequence); 1513 } 1514 else 1515 { 1516 1517 if (pattern->currentId->length == pattern->foundInfo.seqLength) 1518 pattern->hasFullLength = TRUE; 1519 else if (pattern->currentId->length > pattern->foundInfo.seqLength) 1520 { 1521 errPtr = Ali_AddError (&(fileInfoPtr->errors), 1522 ERR_SEQUENCE_TOO_LONG, 1523 pattern->currentId->id, 1524 pattern->foundInfo.seqLength, 1525 pattern->currentId->length); 1526 return FALSE; 1527 } 1528 } 1529 1530 /* Return successfully */ 1531 1532 return TRUE; 1533 } 1534 1535 /*=========================================================================*/ 1536 /* */ 1537 /* s_AnalyzeContiguous () */ 1538 /* */ 1539 /*=========================================================================*/ 1540 1541 static Boolean s_AnalyzeContiguous (ValNodePtr rowList, 1542 AlignFileDataPtr fileInfoPtr) 1543 { 1544 ValNodePtr currentRow; 1545 SeqLineInfoPtr seqLinePtr; 1546 DefLineInfoPtr defLinePtr; 1547 OtherLineInfoPtr otherLinePtr; 1548 Boolean isValidPattern; 1549 IdInfoPtr lastId = NULL; 1550 IdInfoPtr nextToLastId = NULL; 1551 ErrInfoPtr errPtr; 1552 PatternInfoPtr pattern; 1553 Boolean firstDefline = TRUE; 1554 1555 /* Initialize the pattern info */ 1556 1557 pattern = (PatternInfoPtr) MemNew (sizeof (PatternInfo)); 1558 1559 pattern->currentDeflineId = NULL; 1560 pattern->currentId = NULL; 1561 pattern->lastSeqPart = NULL; 1562 pattern->hasFullLength = FALSE; 1563 pattern->isFirstId = TRUE; 1564 pattern->isFirstGroup = TRUE; 1565 pattern->maybesFound = FALSE; 1566 1567 pattern->foundInfo.dataType = ALI_UNKNOWN; 1568 pattern->foundInfo.contigOrInter = ALI_UNKNOWN; 1569 pattern->foundInfo.idCount = 0; 1570 pattern->foundInfo.seqLength = 0; 1571 1572 /* Match the sequences up with the IDS */ 1573 1574 currentRow = rowList; 1575 
isValidPattern = TRUE; 1576 1577 while (currentRow != NULL) 1578 { 1579 1580 /* Process sequence lines */ 1581 1582 if (currentRow->choice == ALI_SEQLINE) 1583 { 1584 1585 seqLinePtr = (SeqLineInfoPtr) currentRow->data.ptrvalue; 1586 1587 /* If we already have a sequence equal in */ 1588 /* in length to those that came before, */ 1589 /* then this line may actually be an */ 1590 /* ID. */ 1591 1592 1593 if ((pattern->hasFullLength == TRUE) && 1594 (seqLinePtr->id == NULL)) 1595 { 1596 Ali_ChangeRowToOther (currentRow); 1597 continue; 1598 } 1599 1600 /* Process the line as a sequence */ 1601 1602 if ((seqLinePtr->maybe == FALSE) || 1603 (seqLinePtr->maybe == TRUE) && (s_configInfo.useMaybes == TRUE)) 1604 { 1605 if (seqLinePtr->id != NULL) 1606 { 1607 isValidPattern = s_ProcessContigId (seqLinePtr->id, 1608 pattern, 1609 fileInfoPtr); 1610 if (isValidPattern == FALSE) 1611 break; 1612 } 1613 1614 if (seqLinePtr->sequence != NULL) 1615 { 1616 isValidPattern = s_ProcessContigSeq (seqLinePtr->sequence, 1617 pattern, 1618 fileInfoPtr); 1619 if (isValidPattern == FALSE) 1620 break; 1621 } 1622 pattern->lastRow = currentRow; 1623 } 1624 else 1625 pattern->maybesFound = TRUE; 1626 } 1627 1628 /* Process Definition lines */ 1629 1630 else if (currentRow->choice == ALI_DEFLINE) 1631 { 1632 defLinePtr = (DefLineInfoPtr) currentRow->data.ptrvalue; 1633 if (defLinePtr->id != NULL) 1634 { 1635 isValidPattern = s_ProcessContigId (defLinePtr->id, 1636 pattern, 1637 fileInfoPtr); 1638 if (isValidPattern == FALSE) 1639 break; 1640 } 1641 1642 if (defLinePtr->definitions != NULL) 1643 { 1644 if (firstDefline) 1645 { 1646 firstDefline = FALSE; 1647 pattern->currentDeflineId = fileInfoPtr->sequences; 1648 } 1649 else 1650 pattern->currentDeflineId = 1651 pattern->currentDeflineId->next; 1652 1653 if (pattern->currentDeflineId == NULL) 1654 { 1655 errPtr = Ali_AddError (&(fileInfoPtr->errors), 1656 ERR_DEFLINE_WITH_NO_ID, 1657 defLinePtr->definitions); 1658 errPtr->rowNum = 
defLinePtr->rowNum; 1659 isValidPattern = FALSE; 1660 break; 1661 } 1662 else 1663 { 1664 pattern->currentDeflineId->defline = 1665 defLinePtr->definitions; 1666 } 1667 } 1668 pattern->lastRow = currentRow; 1669 } 1670 1671 /* Process Other lines */ 1672 1673 else if (currentRow->choice == ALI_OTHERLINE) 1674 { 1675 otherLinePtr = (OtherLineInfoPtr) currentRow->data.ptrvalue; 1676 if (otherLinePtr->id != NULL) 1677 { 1678 isValidPattern = s_ProcessContigId (otherLinePtr->id, 1679 pattern, 1680 fileInfoPtr); 1681 if (isValidPattern == FALSE) 1682 break; 1683 } 1684 pattern->lastRow = currentRow; 1685 } 1686 1687 currentRow = currentRow->next; 1688 } 1689 1690 /* If the last sequence is too short, mark */ 1691 /* it as a maybe. */ 1692 1693 if (pattern->lastRow->choice == ALI_SEQLINE) 1694 { 1695 if (s_configInfo.useMaybes == FALSE) 1696 { 1697 pattern->maybesFound = TRUE; 1698 if (pattern->currentId->length < pattern->foundInfo.seqLength) 1699 { 1700 seqLinePtr = (SeqLineInfoPtr)pattern->lastRow->data.ptrvalue; 1701 seqLinePtr->maybe = TRUE; 1702 nextToLastId = NULL; 1703 lastId = fileInfoPtr->sequences; 1704 while (lastId->next != NULL) 1705 { 1706 nextToLastId = lastId; 1707 lastId = lastId->next; 1708 } 1709 MemFree(lastId); 1710 if (nextToLastId == NULL) 1711 fileInfoPtr->sequences = NULL; 1712 else 1713 nextToLastId->next = NULL; 1714 } 1715 } 1716 else 1717 { 1718 if (pattern->currentId->length < pattern->foundInfo.seqLength) 1719 { 1720 errPtr = Ali_AddError (&(fileInfoPtr->errors), 1721 ERR_SEQUENCE_TOO_SHORT, 1722 pattern->currentId->id, 1723 pattern->foundInfo.seqLength, 1724 pattern->currentId->length); 1725 isValidPattern = FALSE; 1726 } 1727 } 1728 } 1729 1730 /* If we found one defline, then */ 1731 /* make sure they were all there */ 1732 1733 if (firstDefline == FALSE) 1734 { 1735 lastId = fileInfoPtr->sequences; 1736 if (lastId != NULL) 1737 { 1738 while (lastId->next != NULL) 1739 lastId = lastId->next; 1740 if (lastId->defline == NULL) 1741 { 1742 
errPtr = Ali_AddError (&(fileInfoPtr->errors), 1743 ERR_ID_WITH_NO_DEFLINE, 1744 lastId->id); 1745 isValidPattern = FALSE; 1746 } 1747 } 1748 } 1749 1750 /* If pattern not found, return failure */ 1751 1752 if (!isValidPattern) 1753 { 1754 MemFree (pattern); 1755 return FALSE; 1756 } 1757 1758 /* Check for inconsistant declarations ... */ 1759 1760 /* ... of file type */ 1761 1762 if (s_configInfo.declaredInfo.contigOrInter == ALI_INTERLEAVED) 1763 { 1764 errPtr = Ali_AddError (&(fileInfoPtr->errors), ERR_NOT_INTERLEAVED); 1765 errPtr->level = LEVEL_WARNING; 1766 } 1767 1768 /* ... of number of sequences */ 1769 1770 if ((s_configInfo.declaredInfo.idCount != 0) && 1771 (s_configInfo.declaredInfo.idCount != pattern->foundInfo.idCount)) 1772 { 1773 errPtr = Ali_AddError (&(fileInfoPtr->errors), ERR_ID_COUNT_MISMATCH, 1774 pattern->foundInfo.idCount, 1775 s_configInfo.declaredInfo.idCount); 1776 errPtr->level = LEVEL_WARNING; 1777 } 1778 1779 /* ... of sequence length */ 1780 1781 if ((s_configInfo.declaredInfo.seqLength != 0) && 1782 (s_configInfo.declaredInfo.seqLength != pattern->foundInfo.seqLength)) 1783 { 1784 errPtr = Ali_AddError (&(fileInfoPtr->errors),ERR_SEQ_LENGTH_MISMATCH, 1785 pattern->foundInfo.seqLength, 1786 s_configInfo.declaredInfo.seqLength); 1787 errPtr->level = LEVEL_WARNING; 1788 } 1789 1790 /* If we have some possibly bad sequences that */ 1791 /* weren't used, process them seperately. 
*/ 1792 1793 if (pattern->maybesFound == TRUE) 1794 fileInfoPtr->maybes = s_ProcessMaybes (rowList); 1795 1796 /* Clean up and return successfully */ 1797 1798 if (pattern->currentId != NULL) 1799 { 1800 MemFree (pattern); 1801 return FALSE; 1802 } 1803 else 1804 { 1805 MemFree (pattern); 1806 return TRUE; 1807 } 1808 } 1809 1810 /*=========================================================================*/ 1811 /* */ 1812 /* Ali_AddError () */ 1813 /* */ 1814 /*=========================================================================*/ 1815 1816 ErrInfoPtr Ali_AddError (ErrInfoPtr PNTR errorListPtr, 1817 Int4 iError, 1818 ...) 1819 { 1820 ErrInfoPtr newError; 1821 ErrInfoPtr lastError; 1822 va_list argPtr; 1823 CharPtr seqId; 1824 CharPtr seqStr; 1825 Int4 seqLength; 1826 Int4 prevSeqLength; 1827 CharPtr defLineStr; 1828 Int4 foundCount; 1829 Int4 declaredCount; 1830 Int4 foundLen; 1831 Int4 declaredLen; 1832 Int4 sequenceCount; 1833 Int4 errorCount; 1834 Int4 invalidChar; 1835 1836 static Int4 count = 0; 1837 1838 count++; 1839 1840 /* Create a new error record */ 1841 1842 newError = (ErrInfoPtr) MemNew (sizeof(ErrInfo)); 1843 newError->errNum = iError; 1844 newError->level = LEVEL_ERROR; 1845 newError->rowNum = 0; 1846 newError->extraInfo = NULL; 1847 newError->next = NULL; 1848 1849 /* Build the error message text */ 1850 1851 va_start (argPtr, iError); 1852 1853 switch (iError) 1854 { 1855 case ERR_ID_WITHOUT_SEQ : 1856 seqId = va_arg (argPtr, CharPtr); 1857 newError->info = (CharPtr) MemNew (strlen (seqId) + 80); 1858 sprintf (newError->info, "Unable to match ID %s to any sequence", seqId); 1859 break; 1860 case ERR_SEQ_WITHOUT_ID : 1861 seqStr = va_arg (argPtr, CharPtr); 1862 newError->info = (CharPtr) MemNew (strlen (seqStr) + 80); 1863 sprintf (newError->info, "There is no ID for the sequence:\n%s", seqStr); 1864 break; 1865 case ERR_DUPLICATE_IDS : 1866 seqId = va_arg (argPtr, CharPtr); 1867 newError->info = (CharPtr) MemNew (strlen (seqId) + 80); 1868 
sprintf (newError->info, "Duplicate ID: %s is used more than once", 1869 seqId); 1870 break; 1871 case ERR_SEQUENCE_TOO_SHORT : 1872 seqId = va_arg (argPtr, CharPtr); 1873 prevSeqLength = va_arg (argPtr, Int4); 1874 seqLength = va_arg (argPtr, Int4); 1875 newError->info = (CharPtr) MemNew (strlen (seqId) + 256); 1876 sprintf (newError->info, 1877 "Sequence %s is shorter (%d characters) than the preceding" 1878 " sequences (%d characters)", seqId, seqLength, prevSeqLength); 1879 break; 1880 case ERR_SEQUENCE_TOO_LONG : 1881 seqId = va_arg (argPtr, CharPtr); 1882 prevSeqLength = va_arg (argPtr, Int4); 1883 seqLength = va_arg (argPtr, Int4); 1884 newError->info = (CharPtr) MemNew (strlen (seqId) + 256); 1885 sprintf (newError->info, 1886 "Sequence %s is longer (%d characters) than the preceding" 1887 " sequences (%d characters)", seqId, seqLength, prevSeqLength); 1888 break; 1889 case ERR_OUT_OF_MEMORY : 1890 newError->info = (CharPtr) MemNew (80); 1891 sprintf (newError->info, "Out of memory -- memory allocation failed"); 1892 break; 1893 case ERR_ID_NO_PRECEDING_SEQ : 1894 seqId = va_arg (argPtr, CharPtr); 1895 newError->info = (CharPtr) MemNew (strlen (seqId) + 100); 1896 sprintf (newError->info, 1897 "ID %s is probably invalid -- it is not immediately" 1898 " preceded by a sequence", seqId); 1899 break; 1900 case ERR_NOT_INTERLEAVED : 1901 newError->info = (CharPtr) MemNew (80); 1902 sprintf (newError->info, "File is declared to be interleaved," 1903 " but is contiguous"); 1904 break; 1905 case ERR_NOT_CONTIGUOUS : 1906 newError->info = (CharPtr) MemNew (80); 1907 sprintf (newError->info, "File is declared to be contiguous," 1908 " but is interleaved"); 1909 break; 1910 case ERR_NO_SEQUENCES_FOUND : 1911 newError->info = (CharPtr) MemNew (80); 1912 sprintf (newError->info, "No sequences were found in the file"); 1913 break; 1914 case ERR_ID_COUNT_MISMATCH : 1915 foundCount = va_arg (argPtr, Int4); 1916 declaredCount = va_arg (argPtr, Int4); 1917 newError->info = 
(CharPtr) MemNew (128); 1918 sprintf (newError->info, "The number of sequences found (%d) doesn't" 1919 " match the number declared (%d)", foundCount, declaredCount); 1920 break; 1921 case ERR_SEQ_LENGTH_MISMATCH : 1922 foundLen = va_arg (argPtr, Int4); 1923 declaredLen = va_arg (argPtr, Int4); 1924 newError->info = (CharPtr) MemNew (128); 1925 sprintf (newError->info, "The length (%d) of the sequences found doesn't" 1926 " match the declared length (%d)", foundLen, declaredLen); 1927 break; 1928 case ERR_DEFLINE_WITH_NO_ID : 1929 defLineStr = va_arg (argPtr, CharPtr); 1930 newError->info = (CharPtr) MemNew (strlen (defLineStr) + 100); 1931 sprintf (newError->info, "Unable to match the following definition" 1932 " line to any sequence :\n%s", defLineStr); 1933 break; 1934 case ERR_ID_WITH_NO_DEFLINE : 1935 seqId = va_arg (argPtr, CharPtr); 1936 newError->info = (CharPtr) MemNew (strlen (seqId) + 80); 1937 sprintf (newError->info, "Could not find a defline for the following" 1938 " sequence :\n%s", seqId); 1939 break; 1940 case ERR_INVALID_DEFLINE : 1941 defLineStr = va_arg (argPtr, CharPtr); 1942 invalidChar = va_arg (argPtr, Int4); 1943 newError->info = (CharPtr) MemNew (strlen (defLineStr) + 100); 1944 sprintf (newError->info, "Invalid definitions line (illegal char '%c'):\n%s", 1945 (Char) invalidChar, defLineStr); 1946 break; 1947 case ERR_DEFLINE_NODEFS : 1948 defLineStr = va_arg (argPtr, CharPtr); 1949 newError->info = (CharPtr) MemNew (strlen (defLineStr) + 100); 1950 sprintf (newError->info, "There is no source info enclosed by" 1951 " brackets on the definition line :\n%s", defLineStr); 1952 break; 1953 case ERR_GLOBAL_DEFLINE_NODEFS : 1954 sequenceCount = va_arg (argPtr, Int4); 1955 newError->info = (CharPtr) MemNew (128); 1956 sprintf (newError->info, "All %d of the file's definition lines are" 1957 " missing source info enclosed in [] brackets", sequenceCount); 1958 newError->level = LEVEL_MULTI; 1959 break; 1960 case ERR_MULTI_DEFLINE_NODEFS : 1961 
errorCount = va_arg (argPtr, Int4); 1962 newError->info = (CharPtr) MemNew (128); 1963 sprintf (newError->info, "%d of the file's definition lines are" 1964 " missing source info enclosed in [] brackets", errorCount); 1965 newError->level = LEVEL_MULTI; 1966 break; 1967 default: 1968 newError->info = (CharPtr) MemNew (32); 1969 sprintf (newError->info, "Unknown Error"); 1970 break; 1971 } 1972 1973 va_end (argPtr); 1974 1975 /* Add it to the end of the linked list */ 1976 1977 if (*errorListPtr == NULL) 1978 *errorListPtr = newError; 1979 else 1980 { 1981 lastError = *errorListPtr; 1982 while (lastError->next != NULL) 1983 lastError = lastError->next; 1984 lastError->next = newError; 1985 } 1986 1987 /* Return a pointer to new record for easy access */ 1988 1989 return newError; 1990 } 1991 1992 /*=========================================================================*/ 1993 /* */ 1994 /* s_AnalyzeContents () - */ 1995 /* */ 1996 /*=========================================================================*/ 1997 1998 static Boolean s_AnalyzeContents (ValNodePtr rowList, 1999 AlignFileDataPtr fileInfoPtr) 2000 { 2001 Int2 idCount; 2002 Boolean result; 2003 2004 if (s_IsInterleaved (rowList, &idCount)) 2005 { 2006 fileInfoPtr->info->contigOrInter = ALI_INTERLEAVED; 2007 result = s_AnalyzeInterleaved (rowList, fileInfoPtr, idCount); 2008 } 2009 else 2010 { 2011 fileInfoPtr->info->contigOrInter = ALI_CONTIGUOUS; 2012 result = s_AnalyzeContiguous (rowList, fileInfoPtr); 2013 } 2014 2015 return result; 2016 } 2017 2018 /*=========================================================================*/ 2019 /* */ 2020 /* SeqLineReEval () - Re-evaluate a line after forcing the first 'word' */ 2021 /* to be an ID. 
*/
/*                                                                         */
/*=========================================================================*/

/*
 * Split a sequence line that currently holds a single string (either its
 * sequence or its id, not both) into an ID made of the first
 * firstWordLen characters and a sequence made of the remainder.
 *
 * The input line is not modified.  On success a newly allocated
 * SeqLineInfo with newly allocated id and sequence strings is returned;
 * the caller owns all three.  Returns NULL when seqLinePtr is NULL, when
 * the line is already split or empty, when the first word covers the whole
 * string (nothing would be left for the sequence), or when memory cannot
 * be allocated.
 */
SeqLineInfoPtr SeqLineReEval (SeqLineInfoPtr seqLinePtr)
{
  CharPtr        seqStr;
  CharPtr        idStr;
  CharPtr        oldStr;
  Int4           oldLen;
  Int4           wordLen;
  SeqLineInfoPtr newSeqLinePtr;

  if (seqLinePtr == NULL)
    return NULL;

  /* If the line is already split up, then this won't work */

  if ((seqLinePtr->sequence != NULL) && (seqLinePtr->id != NULL))
    return NULL;

  /* Determine the string that we're splitting up */

  if (seqLinePtr->sequence != NULL)
    oldStr = seqLinePtr->sequence;
  else if (seqLinePtr->id != NULL)
    oldStr = seqLinePtr->id;
  else
    return NULL;

  /* If there's only one 'word' then we can't split it.  A word   */
  /* length outside the string is rejected too, since it would    */
  /* make the sizes and copy offsets below invalid.               */

  oldLen  = (Int4) StringLen (oldStr);
  wordLen = (Int4) seqLinePtr->firstWordLen;

  if ((wordLen < 0) || (wordLen >= oldLen))
    return NULL;

  /* Allocate mem for the new strings and the new record, */
  /* releasing whatever was obtained if any of them fails */

  seqStr        = (CharPtr) MemNew (oldLen - wordLen + 1);
  idStr         = (CharPtr) MemNew (wordLen + 1);
  newSeqLinePtr = (SeqLineInfoPtr) MemNew (sizeof (SeqLineInfo));

  if ((seqStr == NULL) || (idStr == NULL) || (newSeqLinePtr == NULL))
  {
    if (seqStr != NULL)
      MemFree (seqStr);
    if (idStr != NULL)
      MemFree (idStr);
    if (newSeqLinePtr != NULL)
      MemFree (newSeqLinePtr);
    return NULL;
  }

  /* Break up the existing string */

  StringNCpy (idStr, oldStr, wordLen);
  idStr[wordLen] = '\0';
  StringCpy (seqStr, oldStr + wordLen);

  /* Fill in the new record and return it */

  newSeqLinePtr->sequence     = seqStr;
  newSeqLinePtr->id           = idStr;
  newSeqLinePtr->rowNum       = seqLinePtr->rowNum;
  newSeqLinePtr->type         = Ali_SeqLineGetType (seqStr, &s_configInfo);
  newSeqLinePtr->maybe        = seqLinePtr->maybe;
  newSeqLinePtr->firstWordLen = seqLinePtr->firstWordLen;

  return newSeqLinePtr;
}

/*=========================================================================*/
/*                                                                         */
/* s_IsExistingId () -- Determine if the given ID is one that has
already */ 2088 /* been added to the linked list of IDs. */ 2089 /* */ 2090 /*=========================================================================*/ 2091 2092 static Boolean s_IsExistingId (AlignFileDataPtr fileInfoPtr, 2093 CharPtr testIdStr) 2094 { 2095 IdInfoPtr idListPtr = NULL; 2096 2097 /* See if this ID already exists */ 2098 2099 idListPtr = fileInfoPtr->sequences; 2100 while (idListPtr != NULL) 2101 { 2102 if (StringCmp(idListPtr->id,testIdStr) == 0) 2103 return TRUE; 2104 idListPtr = idListPtr->next; 2105 } 2106 2107 /* If we made it to here, then */ 2108 /* the ID wasn't found. */ 2109 2110 return FALSE; 2111 } 2112 2113 /*=========================================================================*/ 2114 /* */ 2115 /* s_CheckContext () */ 2116 /* */ 2117 /*=========================================================================*/ 2118 2119 static Boolean s_CheckContext (ValNodePtr rowList, 2120 AlignFileDataPtr fileInfoPtr) 2121 { 2122 ValNodePtr currentRow; 2123 ValNodePtr lastRow; 2124 CharPtr idStr; 2125 SeqLineInfoPtr seqLinePtr; 2126 SeqLineInfoPtr reEvalSeqPtr; 2127 SeqLineInfoPtr prevSeqLinePtr = NULL; 2128 DefLineInfoPtr defLinePtr; 2129 OtherLineInfoPtr otherLinePtr; 2130 Int2 patternSeqType; 2131 ErrInfoPtr errPtr; 2132 Boolean changesMade; 2133 Int4 currLen; 2134 Int4 prevLen; 2135 2136 do /* Until no changes are made */ 2137 { 2138 currentRow = rowList; 2139 lastRow = NULL; 2140 patternSeqType = ALI_AMBIGUOUS; 2141 2142 changesMade = FALSE; 2143 while (currentRow != NULL) 2144 { 2145 if (currentRow->choice == ALI_SEQLINE) 2146 { 2147 seqLinePtr = (SeqLineInfoPtr) currentRow->data.ptrvalue; 2148 2149 if ((seqLinePtr->maybe == FALSE) || 2150 (seqLinePtr->maybe == TRUE) && 2151 (s_configInfo.useMaybes == TRUE)) 2152 { 2153 /* If there is an ID, make sure that it */ 2154 /* immediately precedes a sequence line. 
*/ 2155 2156 if (seqLinePtr->id != NULL) 2157 { 2158 if ((seqLinePtr->sequence == NULL) && 2159 (s_GetRowSeqString(currentRow->next) == NULL)) 2160 { 2161 Ali_ChangeRowToOther (currentRow); 2162 changesMade = TRUE; 2163 continue; 2164 } 2165 } 2166 2167 /* Check for an ID that was accidentally lumped */ 2168 /* in with a sequence due to being composed */ 2169 /* entirely of sequence characters. */ 2170 2171 if (prevSeqLinePtr != NULL) 2172 { 2173 currLen = StringLen (seqLinePtr->sequence); 2174 prevLen = StringLen (prevSeqLinePtr->sequence); 2175 2176 if ((currLen > prevLen) && 2177 (seqLinePtr->id == NULL) && 2178 (prevSeqLinePtr->id != NULL)) 2179 { 2180 reEvalSeqPtr = SeqLineReEval (seqLinePtr); 2181 2182 if (reEvalSeqPtr != NULL) 2183 { 2184 currLen = StringLen (reEvalSeqPtr->sequence); 2185 2186 /* If the new seqline fits better, use it */ 2187 2188 if (currLen == prevLen) 2189 { 2190 MemFree(seqLinePtr->sequence); 2191 MemFree(seqLinePtr->id); 2192 MemFree(seqLinePtr); 2193 currentRow->data.ptrvalue = reEvalSeqPtr; 2194 continue; 2195 } 2196 else 2197 { 2198 MemFree(reEvalSeqPtr->sequence); 2199 MemFree(reEvalSeqPtr->id); 2200 MemFree(reEvalSeqPtr); 2201 } 2202 } 2203 } 2204 } 2205 2206 /* If there's an established pattern of sequence */ 2207 /* type, then match the current line against it. */ 2208 /* Otherwise, set the pattern. 
*/ 2209 2210 if (seqLinePtr->type != ALI_AMBIGUOUS) 2211 { 2212 if (patternSeqType != ALI_AMBIGUOUS) 2213 { 2214 if (patternSeqType != seqLinePtr->type) 2215 { 2216 reEvalSeqPtr = SeqLineReEval (seqLinePtr); 2217 if ((reEvalSeqPtr == NULL) || 2218 ((reEvalSeqPtr != NULL) && 2219 (patternSeqType != reEvalSeqPtr->type))) 2220 { 2221 if (reEvalSeqPtr != NULL) 2222 { 2223 MemFree(reEvalSeqPtr->sequence); 2224 MemFree(reEvalSeqPtr->id); 2225 MemFree(reEvalSeqPtr); 2226 } 2227 Ali_ChangeRowToOther (currentRow); 2228 changesMade = TRUE; 2229 continue; 2230 } 2231 else 2232 { 2233 MemFree(seqLinePtr->sequence); 2234 MemFree(seqLinePtr->id); 2235 MemFree(seqLinePtr); 2236 currentRow->data.ptrvalue = reEvalSeqPtr; 2237 continue; 2238 } 2239 } 2240 } 2241 else 2242 patternSeqType = seqLinePtr->type; 2243 } 2244 2245 /* */ 2246 2247 prevSeqLinePtr = seqLinePtr; 2248 2249 } 2250 lastRow = currentRow; 2251 } 2252 2253 else if (currentRow->choice == ALI_DEFLINE) 2254 { 2255 /* If there is an ID, make sure that it */ 2256 /* immediately precedes a sequence line. */ 2257 2258 defLinePtr = (DefLineInfoPtr) currentRow->data.ptrvalue; 2259 if ((defLinePtr->id != NULL) && 2260 (s_IsExistingId(fileInfoPtr, defLinePtr->id) == FALSE) && 2261 (s_GetRowSeqString(currentRow->next) == NULL)) 2262 { 2263 Ali_ChangeRowToOther (currentRow); 2264 changesMade = TRUE; 2265 continue; 2266 } 2267 lastRow = currentRow; 2268 } 2269 2270 else if (currentRow->choice == ALI_OTHERLINE) 2271 { 2272 /* If there is an ID, make sure that it */ 2273 /* immediately precedes a sequence line. 
*/ 2274 2275 otherLinePtr = (OtherLineInfoPtr) currentRow->data.ptrvalue; 2276 if (otherLinePtr->id != NULL) 2277 { 2278 if (s_GetRowSeqString(currentRow->next) == NULL) 2279 { 2280 otherLinePtr->other = otherLinePtr->id; 2281 otherLinePtr->id = NULL; 2282 changesMade = TRUE; 2283 } 2284 } 2285 lastRow = currentRow; 2286 } 2287 2288 currentRow = currentRow->next; 2289 } 2290 } while (changesMade == TRUE); 2291 2292 /* Check for a dangling ID */ 2293 2294 if ((lastRow != NULL) && 2295 ((idStr = s_GetRowIdString (lastRow)) != NULL) && 2296 (s_GetRowSeqString (lastRow) == NULL)) 2297 { 2298 errPtr = Ali_AddError (&(fileInfoPtr->errors), 2299 ERR_ID_WITHOUT_SEQ, 2300 idStr); 2301 return FALSE; 2302 } 2303 2304 /* Return successfully */ 2305 2306 return TRUE; 2307 } 2308 2309 /*=========================================================================*/ 2310 /* */ 2311 /* s_SortErrors () -- Sort errors by level, so that the most severe appear */ 2312 /* first. */ 2313 /* */ 2314 /* NOTE : Does a lame bubblesort, which nevertheless should be fast */ 2315 /* enough for the relatively small linked lists we're dealing */ 2316 /* with here. 
*/
/*                                                                         */
/*=========================================================================*/

static void s_SortErrors (AlignFileDataPtr fileInfoPtr)
{
  Boolean    swapMade;
  ErrInfoPtr prevPtr;
  ErrInfoPtr nextPtr;
  ErrInfoPtr errPtr;

  /* Nothing to sort; also keeps the inner loop from */
  /* dereferencing a NULL list head.                 */

  if ((fileInfoPtr == NULL) || (fileInfoPtr->errors == NULL))
    return;

  /* Repeat full passes over the list until a pass */
  /* completes without exchanging any neighbours.  */

  do
    {
      swapMade = FALSE;
      prevPtr  = NULL;
      errPtr   = fileInfoPtr->errors;

      while (errPtr->next != NULL)
        {
          nextPtr = errPtr->next;

          if (errPtr->level > nextPtr->level)
            {
              swapMade = TRUE;

              /* Unlink the current error from the list */

              if (errPtr == fileInfoPtr->errors)
                fileInfoPtr->errors = nextPtr;
              else
                prevPtr->next = nextPtr;

              /* Re-insert it right after its old successor */

              errPtr->next  = nextPtr->next;
              nextPtr->next = errPtr;

              /* The old successor now precedes the current   */
              /* error; errPtr itself stays put so that it    */
              /* keeps moving towards the tail while needed.  */

              prevPtr = nextPtr;
            }
          else
            {
              prevPtr = errPtr;
              errPtr  = errPtr->next;
            }
        }
    }
  while (swapMade == TRUE);

  return;
}

/*=========================================================================*/
/*                                                                         */
/*  s_ReplaceUWithT () -- Replace all the Us in a nucleotide sequence with */
/*                        Ts.
*/ 2369 /* */ 2370 /*=========================================================================*/ 2371 2372 static void s_ReplaceUWithT (AlignFileDataPtr fileInfoPtr) 2373 { 2374 IdInfoPtr seqPtr = NULL; 2375 SeqPartPtr seqPart = NULL; 2376 CharPtr seqString; 2377 Int4 i; 2378 2379 seqPtr = fileInfoPtr->sequences; 2380 while (seqPtr != NULL) 2381 { 2382 seqPart = seqPtr->sequence; 2383 while (seqPart != NULL) 2384 { 2385 seqString = seqPart->sequence; 2386 for (i = 0; seqString[i] != '\0'; i++) 2387 if (seqString[i] == 'U') 2388 seqString[i] = 'T'; 2389 else if (seqString[i] == 'u') 2390 seqString[i] = 't'; 2391 seqPart = seqPart->next; 2392 } 2393 seqPtr = seqPtr->next; 2394 } 2395 2396 } 2397 2398 /*=========================================================================*/ 2399 /* */ 2400 /* s_AnalyzeErrors () -- Look for patterns in the errors that can be used */ 2401 /* to create more general, higher-level errors */ 2402 /* instead. */ 2403 /* */ 2404 /*=========================================================================*/ 2405 2406 static void s_AnalyzeErrors (AlignFileDataPtr fileInfoPtr) 2407 { 2408 Int4 seqCount = 0; 2409 Int4 defCount = 0; 2410 Int4 errCount = 0; 2411 IdInfoPtr seqPtr = NULL; 2412 ErrInfoPtr errPtr = NULL; 2413 ErrInfoPtr prevErrPtr = NULL; 2414 ErrInfoPtr nextErrPtr = NULL; 2415 2416 if (fileInfoPtr->errors == NULL) 2417 return; 2418 2419 /* Get counts of sequences and deflines */ 2420 2421 seqPtr = fileInfoPtr->sequences; 2422 while (seqPtr != NULL) 2423 { 2424 seqCount++; 2425 if (seqPtr->defline != NULL) 2426 defCount++; 2427 seqPtr = seqPtr->next; 2428 } 2429 2430 /* Check for "missing bracket" defline errors */ 2431 2432 errPtr = fileInfoPtr->errors; 2433 while (errPtr != NULL) 2434 { 2435 if (errPtr->errNum == ERR_DEFLINE_NODEFS) 2436 errCount++; 2437 errPtr = errPtr->next; 2438 } 2439 2440 /* If ALL deflines have missing bracket errors */ 2441 /* then replace the msgs with one global msg */ 2442 2443 if (errCount == seqCount) 
2444 { 2445 if (s_configInfo.errExpandLevel != ALI_ERRMSG_EXPAND_ALL) 2446 { 2447 errPtr = fileInfoPtr->errors; 2448 while (errPtr != NULL) 2449 { 2450 nextErrPtr = errPtr->next; 2451 if (errPtr->errNum == ERR_DEFLINE_NODEFS) 2452 { 2453 if (errPtr == fileInfoPtr->errors) 2454 { 2455 fileInfoPtr->errors = fileInfoPtr->errors->next; 2456 s_FreeErrorNode (errPtr); 2457 errPtr = NULL; 2458 } 2459 else 2460 { 2461 prevErrPtr->next = nextErrPtr; 2462 s_FreeErrorNode (errPtr); 2463 errPtr = NULL; 2464 } 2465 } 2466 else 2467 prevErrPtr = errPtr; 2468 errPtr = nextErrPtr; 2469 } 2470 } 2471 Ali_AddError (&(fileInfoPtr->errors), ERR_GLOBAL_DEFLINE_NODEFS, 2472 seqCount); 2473 } 2474 2475 /* If SOME deflines have missing bracket errors */ 2476 /* then replace the msgs with one global msg */ 2477 2478 else if (errCount > 1) 2479 { 2480 if (s_configInfo.errExpandLevel == ALI_ERRMSG_EXPAND_NONE) 2481 { 2482 errPtr = fileInfoPtr->errors; 2483 while (errPtr != NULL) 2484 { 2485 nextErrPtr = errPtr->next; 2486 if (errPtr->errNum == ERR_DEFLINE_NODEFS) 2487 { 2488 if (errPtr == fileInfoPtr->errors) 2489 { 2490 fileInfoPtr->errors = fileInfoPtr->errors->next; 2491 s_FreeErrorNode (errPtr); 2492 errPtr = NULL; 2493 } 2494 else 2495 { 2496 prevErrPtr->next = nextErrPtr; 2497 s_FreeErrorNode (errPtr); 2498 errPtr = NULL; 2499 } 2500 } 2501 else 2502 prevErrPtr = errPtr; 2503 errPtr = nextErrPtr; 2504 } 2505 } 2506 Ali_AddError (&(fileInfoPtr->errors), ERR_MULTI_DEFLINE_NODEFS, 2507 errCount); 2508 } 2509 2510 /* Finally, sort the errors by type */ 2511 2512 s_SortErrors (fileInfoPtr); 2513 2514 return; 2515 2516 } 2517 2518 /*=========================================================================*/ 2519 /* */ 2520 /* Ali_Read () */ 2521 /* */ 2522 /*=========================================================================*/ 2523 2524 AlignFileDataPtr Ali_Read (FILE PNTR alignFilePtr) 2525 { 2526 ValNodePtr rowList = NULL; 2527 AlignFileDataPtr fileInfoPtr; 2528 ErrInfoPtr errorList 
= NULL; 2529 2530 /* Check parameters */ 2531 2532 if (alignFilePtr == NULL) 2533 return FALSE; 2534 2535 /* Initialize */ 2536 2537 fileInfoPtr = (AlignFileDataPtr) MemNew (sizeof(AlignFileData)); 2538 fileInfoPtr->sequences = NULL; 2539 fileInfoPtr->maybes = NULL; 2540 fileInfoPtr->errors = NULL; 2541 fileInfoPtr->info = (ParsedInfoPtr) MemNew (sizeof (ParsedInfo)); 2542 if (fileInfoPtr->info == NULL) 2543 { 2544 Ali_AddError (&(fileInfoPtr->errors), ERR_OUT_OF_MEMORY); 2545 Ali_Free (fileInfoPtr); 2546 return NULL; 2547 } 2548 fileInfoPtr->info->missingChar = NULL; 2549 fileInfoPtr->info->gapChar = NULL; 2550 fileInfoPtr->info->unalignedChar = NULL; 2551 2552 if (s_configurationSet == FALSE) 2553 Ali_SetConfig (NULL, ALI_SET_DEFAULTS); 2554 2555 /* Read in and parse each row */ 2556 2557 rowList = Ali_ReadLines (alignFilePtr, &errorList, &s_configInfo, fileInfoPtr); 2558 fileInfoPtr->errors = errorList; 2559 2560 if (rowList == NULL) 2561 return fileInfoPtr; 2562 2563 /* Make first pass to adjust the rows based on context */ 2564 2565 if (s_CheckContext(rowList, fileInfoPtr) != TRUE) 2566 return fileInfoPtr; 2567 2568 s_DisplayRowList (rowList, s_configInfo.debugLevel); 2569 2570 /* Analyze the IDs and sequences for consistancy */ 2571 2572 s_AnalyzeContents (rowList, fileInfoPtr); 2573 2574 if (fileInfoPtr->sequences == NULL) 2575 Ali_AddError (&(fileInfoPtr->errors), ERR_NO_SEQUENCES_FOUND); 2576 2577 /* Analyze the errors to see if they can be */ 2578 /* combined into more general global errors */ 2579 2580 if (fileInfoPtr->errors != NULL) 2581 s_AnalyzeErrors (fileInfoPtr); 2582 2583 /* Return the missing, gap, and unaligned chars used */ 2584 2585 fileInfoPtr->info->missingChar = (CharPtr) MemNew(16); 2586 StringCpy (fileInfoPtr->info->missingChar, s_configInfo.missingChar); 2587 2588 fileInfoPtr->info->gapChar = (CharPtr) MemNew(16); 2589 StringCpy (fileInfoPtr->info->gapChar, s_configInfo.gapChar); 2590 2591 fileInfoPtr->info->unalignedChar = (CharPtr) 
MemNew(16); 2592 StringCpy (fileInfoPtr->info->unalignedChar, s_configInfo.unalignedChar); 2593 2594 /* If these are nucleotide sequences, then */ 2595 /* replace all 'U's with 'T's. */ 2596 2597 s_ReplaceUWithT (fileInfoPtr); 2598 2599 /* Clean up and return successfully */ 2600 2601 s_FreeRowList_Safe (rowList); 2602 return fileInfoPtr; 2603 } 2604 2605 /*************************************************************************** 2606 * 2607 * section to convert AlignFileDataPtr content into seqalign/seqentry 2608 * structures 2609 * 2610 ***************************************************************************/ 2611 typedef struct tinyinfo { 2612 Int4 n; 2613 struct tinyinfo PNTR next; 2614 } ALI_TinyInfo, PNTR ALI_TinyInfoPtr; 2615 2616 2617 static Boolean is_gap_char(Char c, CharPtr gapChar) 2618 { 2619 if (StrChr(gapChar, c) != NULL) 2620 return TRUE; 2621 return FALSE; 2622 } 2623 2624 static int LIBCALLBACK ALI_SortTips(VoidPtr ptr1, VoidPtr ptr2) 2625 { 2626 ALI_TinyInfoPtr tip1; 2627 ALI_TinyInfoPtr tip2; 2628 2629 tip1 = *((ALI_TinyInfoPtr PNTR)ptr1); 2630 tip2 = *((ALI_TinyInfoPtr PNTR)ptr2); 2631 if (tip1->n > tip2->n) 2632 return 1; 2633 if (tip1->n < tip2->n) 2634 return -1; 2635 return 0; 2636 } 2637 2638 static Boolean is_valid_seq(Char c, CharPtr missingChar, CharPtr gapChar) 2639 { 2640 if (StrChr("\0", c)) 2641 return FALSE; 2642 if (StrChr(missingChar, c) != NULL) 2643 return TRUE; 2644 if (StrChr(gapChar, c) != NULL) 2645 return TRUE; 2646 if (IS_ALPHA(c)) 2647 return TRUE; 2648 if (c == '-') 2649 return TRUE; 2650 if (c == '?') 2651 return TRUE; 2652 return FALSE; 2653 } 2654 2655 static Boolean is_missing(Char c, CharPtr missingChar) 2656 { 2657 if (StrChr(missingChar, c) != NULL) 2658 return TRUE; 2659 else 2660 return FALSE; 2661 } 2662 2663 static SeqAlignPtr ALI_MakeSeqAlign(AlignFileDataPtr afp, CharPtr PNTR PNTR stringsptr, Int4Ptr numseq, CharPtr PNTR PNTR deflineptr) 2664 { 2665 Int4 alnlen; 2666 CharPtr buf; 2667 CharPtr c; 2668 
Int4 ctr; 2669 Int4 ctr_prev; 2670 CharPtr PNTR deflines; 2671 DenseSegPtr dsp; 2672 Int4 i; 2673 IdInfoPtr id_head; 2674 IdInfoPtr iip; 2675 Boolean ingap; 2676 Boolean isgap; 2677 Int4 j; 2678 Int4 last; 2679 Int4 len; 2680 Int4 maxlen; 2681 Int4 numtips; 2682 SeqAlignPtr sap; 2683 SeqPartPtr seq; 2684 SeqIdPtr sip; 2685 SeqIdPtr sip_prev; 2686 CharPtr PNTR strings; 2687 Char text[100]; 2688 ALI_TinyInfoPtr tip; 2689 ALI_TinyInfoPtr tip_head; 2690 ALI_TinyInfoPtr tip_prev; 2691 ALI_TinyInfoPtr PNTR tiparray; 2692 2693 if (afp->info == NULL) 2694 { 2695 ErrPostEx(SEV_ERROR, 0, 0, "NULL afp->info -- alignment not read correctly\n"); 2696 return NULL; 2697 } 2698 i = 0; 2699 id_head = afp->sequences; 2700 iip = id_head; 2701 while (iip != NULL) 2702 { 2703 i++; 2704 if (iip->id == NULL) 2705 { 2706 sprintf(text, "No id read for sequence %d\n", i); 2707 ErrPostEx(SEV_ERROR, 0, 0, text); 2708 return NULL; 2709 } 2710 iip = iip->next; 2711 } 2712 sap = SeqAlignNew(); 2713 sap->type = SAT_PARTIAL; 2714 sap->segtype = SAS_DENSEG; 2715 sap->dim = i; 2716 dsp = DenseSegNew(); 2717 dsp->dim = i; 2718 strings = (CharPtr PNTR)MemNew(i*sizeof(CharPtr)); 2719 deflines = (CharPtr PNTR)MemNew(i*sizeof(CharPtr)); 2720 tip_head = tip_prev = NULL; 2721 iip = id_head; 2722 maxlen = 0; 2723 tip_head = tip_prev = NULL; 2724 numtips = 0; 2725 alnlen = 0; 2726 i = 1; 2727 while (iip != NULL) 2728 { 2729 len = 0; 2730 ctr = 0; 2731 seq = iip->sequence; 2732 if (seq == NULL || seq->sequence == NULL) 2733 { 2734 sprintf(text, "Error in reading sequence %d -- no sequence characters read\n", i); 2735 ErrPostEx(SEV_ERROR, 0, 0, text); 2736 return NULL; 2737 } 2738 c = seq->sequence; 2739 if (is_gap_char(*c, afp->info->gapChar)) 2740 ingap = TRUE; 2741 else 2742 ingap = FALSE; 2743 while (seq != NULL) 2744 { 2745 c = seq->sequence; 2746 if (c == NULL) 2747 { 2748 sprintf(text, "Error in reading sequence %d -- no sequence characters read\n", i); 2749 ErrPostEx(SEV_ERROR, 0, 0, text); 2750 return 
NULL; 2751 } 2752 while (is_valid_seq(*c, afp->info->missingChar, afp->info->gapChar)) 2753 { 2754 if (is_gap_char(*c, afp->info->gapChar) && !ingap) 2755 { 2756 tip = (ALI_TinyInfoPtr)MemNew(sizeof(ALI_TinyInfo)); 2757 tip->n = ctr; 2758 if (tip_head != NULL) 2759 { 2760 tip_prev->next = tip; 2761 tip_prev = tip; 2762 } else 2763 tip_head = tip_prev = tip; 2764 ingap = TRUE; 2765 numtips++; 2766 } else if (!is_gap_char(*c, afp->info->gapChar) && ingap) 2767 { 2768 tip = (ALI_TinyInfoPtr)MemNew(sizeof(ALI_TinyInfo)); 2769 tip->n = ctr; 2770 if (tip_head != NULL) 2771 { 2772 tip_prev->next = tip; 2773 tip_prev = tip; 2774 } else 2775 tip_head = tip_prev = tip; 2776 ingap = FALSE; 2777 numtips++; 2778 } 2779 if (!is_gap_char(*c, afp->info->gapChar)) 2780 len++; 2781 ctr++; 2782 c++; 2783 } 2784 seq = seq->next; 2785 } 2786 if (ctr > alnlen) 2787 alnlen = ctr; 2788 if (len > maxlen) 2789 maxlen = len; 2790 iip = iip->next; 2791 i++; 2792 } 2793 if (tip_head == NULL) /* this is a gapless alignment */ 2794 { 2795 dsp->numseg = 1; 2796 dsp->starts = (Int4Ptr)MemNew((dsp->dim)*sizeof(Int4)); 2797 dsp->lens = (Int4Ptr)MemNew(sizeof(Int4)); 2798 dsp->strands = (Uint1Ptr)MemNew((dsp->dim)*sizeof(Uint1)); 2799 for (i=0; i<dsp->dim; i++) 2800 { 2801 dsp->strands[i] = Seq_strand_plus; 2802 } 2803 dsp->lens[0] = id_head->length; 2804 /* all the starts are 0 anyway, just leave them and get the ids & seqs */ 2805 iip = id_head; 2806 sip_prev = NULL; 2807 buf = (CharPtr)MemNew((maxlen+1)*sizeof(Char)); 2808 i = 0; 2809 while (iip != NULL) 2810 { 2811 sip = MakeSeqID(iip->id); 2812 deflines[i] = StringSave(iip->defline); 2813 if (sip_prev != NULL) 2814 { 2815 sip_prev->next = sip; 2816 sip_prev = sip; 2817 } else 2818 dsp->ids = sip_prev = sip; 2819 seq = iip->sequence; 2820 for (ctr = 0; ctr<(maxlen+1); ctr++) 2821 { 2822 buf[ctr] = '\0'; 2823 } 2824 ctr = 0; 2825 while (seq != NULL) 2826 { 2827 c = seq->sequence; 2828 while (is_valid_seq(*c, afp->info->missingChar, 
afp->info->gapChar)) 2829 { 2830 if (is_missing(*c, afp->info->missingChar)) 2831 buf[ctr] = 'N'; 2832 else 2833 buf[ctr] = *c; 2834 ctr++; 2835 c++; 2836 } 2837 seq = seq->next; 2838 } 2839 strings[i] = StringSave(buf); 2840 iip = iip->next; 2841 i++; 2842 } 2843 sap->segs = (Pointer)dsp; 2844 MemFree(buf); 2845 *numseq = dsp->dim; 2846 *stringsptr = strings; 2847 *deflineptr = deflines; 2848 return sap; 2849 } 2850 /* now all the segment boundaries have been collected, so sort them */ 2851 tiparray = (ALI_TinyInfoPtr PNTR)MemNew(numtips*sizeof(ALI_TinyInfoPtr)); 2852 i = 0; 2853 tip = tip_head; 2854 while (tip != NULL) 2855 { 2856 tiparray[i] = tip; 2857 i++; 2858 tip = tip->next; 2859 } 2860 HeapSort(tiparray, numtips, sizeof(ALI_TinyInfoPtr), ALI_SortTips); 2861 dsp->numseg = 2; /* one for the first, one for the last */ 2862 for (i=1; i<numtips; i++) 2863 { 2864 if (tiparray[i]->n != tiparray[i-1]->n) 2865 dsp->numseg++; 2866 } 2867 dsp->starts = (Int4Ptr)MemNew((dsp->dim)*(dsp->numseg)*sizeof(Int4)); 2868 dsp->lens = (Int4Ptr)MemNew((dsp->numseg)*sizeof(Int4)); 2869 last = 0; 2870 j=0; 2871 dsp->lens[0] = tiparray[0]->n; 2872 last = tiparray[0]->n; 2873 j++; 2874 for (i=1; i<numtips; i++) 2875 { 2876 if (tiparray[i]->n != tiparray[i-1]->n) 2877 { 2878 dsp->lens[j] = tiparray[i]->n-last; 2879 last = tiparray[i]->n; 2880 j++; 2881 } 2882 } 2883 dsp->lens[j] = alnlen - last; 2884 dsp->strands = (Uint1Ptr)MemNew((dsp->dim)*(dsp->numseg)*sizeof(Uint1)); 2885 /* do we have any strand info to the contrary? 
*/ 2886 for (i=0; i<(dsp->dim)*(dsp->numseg); i++) 2887 { 2888 dsp->strands[i] = Seq_strand_plus; 2889 } 2890 iip = id_head; 2891 i = 0; 2892 buf = (CharPtr)MemNew((maxlen+1)*sizeof(Char)); 2893 sip_prev = NULL; 2894 while (iip != NULL) 2895 { 2896 j = 0; 2897 for (ctr = 0; ctr<(maxlen+1); ctr++) 2898 { 2899 buf[ctr] = '\0'; 2900 } 2901 sip = MakeSeqID(iip->id); 2902 SeqIdSetFree(sip->next); 2903 sip->next = NULL; 2904 deflines[i] = StringSave(iip->defline); 2905 if (sip_prev != NULL) 2906 { 2907 sip_prev->next = sip; 2908 sip_prev = sip; 2909 } else 2910 dsp->ids = sip_prev = sip; 2911 ctr = 0; 2912 ctr_prev = 0; 2913 len = 0; 2914 seq = iip->sequence; 2915 while (seq != NULL) 2916 { 2917 c = seq->sequence; 2918 while (is_valid_seq(*c, afp->info->missingChar, afp->info->gapChar)) 2919 { 2920 isgap = is_gap_char(*c, afp->info->gapChar); 2921 if (!isgap) 2922 { 2923 if (is_missing(*c, afp->info->missingChar)) 2924 buf[ctr] = 'N'; 2925 else 2926 buf[ctr] = *c; 2927 ctr++; 2928 } 2929 len++; 2930 if (len == dsp->lens[j]) 2931 { 2932 if (isgap) 2933 dsp->starts[dsp->dim*j+i] = -1; 2934 else 2935 { 2936 dsp->starts[dsp->dim*j+i] = ctr_prev; 2937 ctr_prev = ctr; 2938 } 2939 j++; 2940 len = 0; 2941 } 2942 if (*(c+1) == '\0' && seq->next == NULL && j < dsp->numseg) 2943 { 2944 if (isgap) 2945 dsp->starts[dsp->dim*j+i] = -1; 2946 else 2947 dsp->starts[dsp->dim*j+i] = ctr_prev; 2948 } 2949 c++; 2950 } 2951 seq = seq->next; 2952 } 2953 strings[i] = StringSave(buf); 2954 iip = iip->next; 2955 i++; 2956 } 2957 sap->segs = (Pointer)dsp; 2958 MemFree(buf); 2959 for (i=0; i<numtips; i++) 2960 { 2961 MemFree(tiparray[i]); 2962 } 2963 MemFree(tiparray); 2964 *numseq = dsp->dim; 2965 *stringsptr = strings; 2966 *deflineptr = deflines; 2967 return sap; 2968 } 2969 2970 static SeqEntryPtr ALI_make_seqentry_for_seqentry (SeqEntryPtr sep) 2971 { 2972 BioseqPtr bsp; 2973 BioseqSetPtr bssp; 2974 SeqEntryPtr sep_new; 2975 SeqEntryPtr sep_tmp; 2976 2977 if (IS_Bioseq(sep) || 
IS_Bioseq_set(sep)) 2978 { 2979 if (sep->next) 2980 { 2981 bssp = BioseqSetNew (); 2982 bssp->_class = 14; 2983 bssp->seq_set = sep; 2984 sep_new = SeqEntryNew (); 2985 sep_new->choice = 2; 2986 sep_new->data.ptrvalue = bssp; 2987 SeqMgrLinkSeqEntry (sep_new, 0, NULL); 2988 sep_tmp = bssp->seq_set; 2989 while (sep_tmp != NULL) 2990 { 2991 if (IS_Bioseq(sep_tmp)) 2992 { 2993 bsp = (BioseqPtr)sep_tmp->data.ptrvalue; 2994 ObjMgrConnect (OBJ_BIOSEQ, (Pointer) bsp, OBJ_BIOSEQSET, (Pointer) bssp); 2995 } 2996 sep_tmp = sep_tmp->next; 2997 } 2998 } else 2999 return sep; 3000 } 3001 return sep_new; 3002 } 3003 3004 static Uint1 ALI_GuessMoltype(CharPtr string) 3005 { 3006 CharPtr c; 3007 3008 c = string; 3009 while (*c != '\0') 3010 { 3011 if (StringChr("EFIJLOPQUXZefijlopquxz", *c) != NULL) /* protein */ 3012 return Seq_mol_aa; 3013 c++; 3014 } 3015 return Seq_mol_na; 3016 } 3017 static Int4 SPI_MapRowCoords(SeqAlignPtr sap, Int4 from, Int4 to, Int4 row, Uint1 direction) 3018 { 3019 Int4 pos; 3020 3021 if (direction == 1) 3022 { 3023 pos = AlnMgrMapRowCoords(sap, from, row, NULL); 3024 from++; 3025 while (pos < 0 && from <= to) 3026 { 3027 pos = AlnMgrMapRowCoords(sap, from, row, NULL); 3028 from++; 3029 } 3030 } else 3031 { 3032 pos = AlnMgrMapRowCoords(sap, to, row, NULL); 3033 to--; 3034 while (pos < 0 && to >= from) 3035 { 3036 pos = AlnMgrMapRowCoords(sap, to, row, NULL); 3037 to--; 3038 } 3039 } 3040 if (pos < 0) 3041 return -1; 3042 return pos; 3043 } 3044 3045 static CharPtr SPI_WriteAlnLine(Int4 row, Int4 from, Int4 to, SeqAlignPtr sap) 3046 { 3047 AlnMsgPtr amp; 3048 BioseqPtr bsp; 3049 Uint1 buf[65+2]; 3050 Int4 ctr; 3051 Int4 i; 3052 Boolean more; 3053 Int4 n; 3054 SeqIdPtr sip; 3055 SeqPortPtr spp; 3056 CharPtr string; 3057 3058 n = AlnMgrGetNumRows(sap); 3059 if (row > n || row < 1) 3060 return NULL; 3061 string = (CharPtr)MemNew((65+2)*sizeof(Char)); 3062 for (n=0; n<(65+2); n++) 3063 { 3064 string[n] = '\0'; 3065 } 3066 sip = AlnMgrGetNthSeqIdPtr(sap, 
row); 3067 bsp = BioseqLockById(sip); 3068 amp = AlnMsgNew(); 3069 amp->row_num = row; 3070 amp->from_m = from; 3071 amp->to_m = to; 3072 if (amp->to_m < 0) 3073 amp->to_m = -1; 3074 n = 0; 3075 while ((more = AlnMgrGetNextAlnBit(sap, amp)) == TRUE) 3076 { 3077 if (amp->to_b - amp->from_b > amp->to_m - amp->from_m) /* kludge */ 3078 { 3079 if (amp->strand == Seq_strand_minus) 3080 amp->from_b = amp->to_b - (amp->to_m - amp->from_m); 3081 else 3082 amp->to_b = amp->from_b + (amp->to_m - amp->from_m); 3083 } 3084 if (amp->gap == 0) 3085 { 3086 spp = SeqPortNew(bsp, amp->from_b, amp->to_b, amp->strand, Seq_code_iupacna); 3087 ctr = SeqPortRead(spp, buf, (amp->to_b - amp->from_b + 1)); 3088 SeqPortFree(spp); 3089 for (i=n; i<n+ctr; i++) 3090 { 3091 string[i] = buf[i-n]; 3092 } 3093 n += ctr; 3094 } else 3095 { 3096 for (i=n; i<(n+amp->to_b-amp->from_b+1); i++) 3097 { 3098 string[i] = '-'; 3099 } 3100 n += amp->to_b-amp->from_b+1; 3101 } 3102 } 3103 AlnMsgFree(amp); 3104 SeqIdFree(sip); 3105 return string; 3106 } 3107 static Int4 spi_get_num_places(Int4 num) 3108 { 3109 FloatHi f; 3110 Int4 i; 3111 Int4 x; 3112 3113 x = 10; 3114 for (i=1; i<21; i++) 3115 { 3116 f = (FloatHi)num/(FloatHi)x; 3117 if (f < 1) 3118 { 3119 if (num < 0) 3120 return (i+1); 3121 else 3122 return i; 3123 } 3124 x = x*10; 3125 } 3126 if (num < 0) 3127 i++; 3128 return i; 3129 } 3130 static void PrintOutMultAlign(SeqAlignPtr sap) 3131 { 3132 Int4 c; 3133 Int4Ptr coord; 3134 Int4 ctr; 3135 Int4 d; 3136 Int4 j; 3137 Int4 len; 3138 Int4 n; 3139 Int4 spacer; 3140 CharPtr PNTR stringptr; 3141 3142 spacer = 12; 3143 AlnMgrIndexSingleChildSeqAlign(sap); 3144 n = AlnMgrGetNumRows(sap); 3145 stringptr = (CharPtr PNTR)MemNew(n*sizeof(CharPtr)); 3146 coord = (Int4Ptr)MemNew(n*sizeof(Int4)); 3147 len = AlnMgrGetAlnLength(sap, FALSE); 3148 for (c=0; c<len; c+=65-10) 3149 { 3150 for (j=0; j<n; j++) 3151 { 3152 stringptr[j] = SPI_WriteAlnLine(j+1, c, MIN(c+65-10-1, len-1), sap); 3153 coord[j] = 
SPI_MapRowCoords(sap, c, MIN(c+65-10-1, len-1), j+1, 1); 3154 if (coord[j] >= 0) 3155 coord[j]++; 3156 } 3157 for (j=0; j<n; j++) 3158 { 3159 printf("%d", coord[j]); 3160 d = spi_get_num_places(coord[j]); 3161 for (d; d<spacer; d++) 3162 { 3163 printf(" "); 3164 } 3165 if (j == 0) 3166 printf("%s", stringptr[j]); 3167 else 3168 { 3169 for (ctr=0; ctr<MIN(65-10, len-c); ctr++) 3170 { 3171 if (stringptr[j][ctr] == stringptr[0][ctr]) 3172 printf("."); 3173 else 3174 printf("%c", stringptr[j][ctr]); 3175 } 3176 } 3177 printf("\n"); 3178 MemFree(stringptr[j]); 3179 } 3180 if (c+65-10 < len) 3181 printf("\n"); 3182 } 3183 fflush(stdout); 3184 } 3185 3186 static void PrintOutSegs(SeqAlignPtr sap) 3187 { 3188 DenseSegPtr dsp; 3189 Int4 i; 3190 Int4 j; 3191 3192 dsp = (DenseSegPtr)(sap->segs); 3193 printf("nums:\t"); 3194 for (i=0; i<dsp->numseg; i++) 3195 { 3196 printf("%d\t", i+1); 3197 } 3198 printf("\n"); 3199 printf("lens:\t"); 3200 for (i=0; i<dsp->numseg; i++) 3201 { 3202 printf("%d\t", dsp->lens[i]); 3203 } 3204 printf("\n"); 3205 for (i=0; i<dsp->dim; i++) 3206 { 3207 printf("row %d\t", i+1); 3208 for (j=0; j<dsp->numseg; j++) 3209 { 3210 printf("%d\t", dsp->starts[(dsp->dim)*j+i]); 3211 } 3212 printf("\n"); 3213 } 3214 fflush(stdout); 3215 } 3216 3217 NLM_EXTERN SeqEntryPtr ALI_ConvertToNCBIData(AlignFileDataPtr afp) 3218 { 3219 BioseqPtr bsp; 3220 CharPtr PNTR deflines; 3221 Int4 i; 3222 Int4 len; 3223 Uint1 moltype; 3224 Int4 numseq; 3225 SeqAnnotPtr sanp; 3226 SeqAlignPtr sap; 3227 SeqDescrPtr sdp; 3228 SeqEntryPtr sep; 3229 SeqEntryPtr sep_head; 3230 SeqEntryPtr sep_prev; 3231 SeqIdPtr sip; 3232 CharPtr str; 3233 CharPtr PNTR strings; 3234 3235 if (afp == NULL || afp->sequences == NULL) 3236 { 3237 ErrPostEx(SEV_ERROR, 0, 0, "NULL Data Passed to ConvertToNCBIData"); 3238 return NULL; 3239 } 3240 sap = ALI_MakeSeqAlign(afp, &strings, &numseq, &deflines); 3241 if (sap == NULL) 3242 { 3243 ErrPostEx(SEV_ERROR, 0, 0, "Unable to create seqentry\n"); 3244 return 
NULL; 3245 } 3246 sanp = SeqAnnotNew(); 3247 sanp->type = 2; 3248 sanp->data = (Pointer)sap; 3249 moltype = ALI_GuessMoltype(strings[0]); 3250 sip = ((DenseSegPtr)(sap->segs))->ids; 3251 sep_head = sep_prev = NULL; 3252 for (i=0; i<numseq; i++) 3253 { 3254 len = StringLen(strings[i]); 3255 sep = StringToSeqEntry (strings[i], sip, len, moltype); 3256 if (sep != NULL) { 3257 bsp = (BioseqPtr)(sep->data.ptrvalue); 3258 if (! StringHasNoText (deflines[i])) { 3259 str = deflines[i]; 3260 sdp = SeqDescrAddPointer(&(bsp->descr), Seq_descr_title, str); 3261 } 3262 if (sep != NULL) 3263 { 3264 if (sep_head != NULL) 3265 { 3266 sep_prev->next = sep; 3267 sep_prev = sep; 3268 } else 3269 sep_head = sep_prev = sep; 3270 } 3271 sip = sip->next; 3272 MemFree(strings[i]); 3273 } 3274 } 3275 sep_head = ALI_make_seqentry_for_seqentry (sep_head); 3276 SeqAlignAddInSeqEntry (sep_head, sanp); 3277 MemFree(strings); 3278 MemFree(deflines); 3279 return sep_head; 3280 } 3281
This page was automatically generated by the
LXR engine.
Visit the LXR main site for more information. |