src/gui/widgets/aln_multiple/demo/make_score_method.cpp

Go to the documentation of this file.
00001 /*  $Id: make_score_method.cpp 14515 2007-05-04 17:18:18Z kazimird $
00002  * ===========================================================================
00003  *
00004  *                            PUBLIC DOMAIN NOTICE
00005  *               National Center for Biotechnology Information
00006  *
00007  *  This software/database is a "United States Government Work" under the
00008  *  terms of the United States Copyright Act.  It was written as part of
00009  *  the author's official duties as a United States Government employee and
00010  *  thus cannot be copyrighted.  This software/database is freely available
00011  *  to the public for use. The National Library of Medicine and the U.S.
00012  *  Government have not placed any restriction on its use or reproduction.
00013  *
00014  *  Although all reasonable efforts have been taken to ensure the accuracy
00015  *  and reliability of the software and data, the NLM and the U.S.
00016  *  Government do not and cannot warrant the performance or results that
00017  *  may be obtained by using this software or data. The NLM and the U.S.
00018  *  Government disclaim all warranties, express or implied, including
00019  *  warranties of performance, merchantability or fitness for any particular
00020  *  purpose.
00021  *
00022  *  Please cite the author in any work or product based on this material.
00023  *
00024  * ===========================================================================
00025  *
00026  * Authors:  Robert Smith
00027  *
00028  * File Description:
00029  *   Create Alignment score method files from:
00030  *   1. Builtin scoring tables (-sm)
00031  *   2. aaindex tables and matrixes from
00032  *      http://www.genome.ad.jp/dbget/aaindex.html
00033  *
00034  */
00035 
00036 #include <ncbi_pch.hpp>
00037 #include <corelib/ncbiapp.hpp>
00038 #include <util/tables/raw_scoremat.h>
00039 #include <corelib/ncbireg.hpp>
00040 #include <corelib/ncbi_limits.hpp>
00041 
00042 #include <ctype.h>
00043 #include <iostream>
00044 #include <iomanip>
00045 #include <algorithm>
00046 
00047 
00048 BEGIN_NCBI_SCOPE
00049 
00050 class CMakeScoreMethodApp : public CNcbiApplication
00051 {
00052 private:
00053     void Init(void);
00054     int  Run(void);
00055 };
00056 
00057 
00058 void CMakeScoreMethodApp::Init(void)
00059 {
00060     auto_ptr<CArgDescriptions> arg_desc(new CArgDescriptions);
00061     arg_desc->SetUsageContext(GetArguments().GetProgramBasename(),
00062                               "Make alignment scoring method files");
00063 
00064     arg_desc->AddDefaultKey("out", "output", "File name for scoring method",
00065                 CArgDescriptions::eOutputFile, "-");
00066 
00067     arg_desc->AddOptionalKey("sm", "matrix", "name of score matrix to use",
00068                             CArgDescriptions::eString);
00069     arg_desc->SetConstraint
00070         ("sm", &(*new CArgAllow_Strings,
00071                  "blosum45", "blosum62", "blosum80", "pam30", "pam70", "pam250"));
00072 
00073     arg_desc->AddOptionalKey("aa", "accession",
00074         "Amino Acid Index Database accession number",
00075         CArgDescriptions::eString);
00076     arg_desc->AddDefaultKey("in", "aaindex_file",
00077         "Amino Acid Index Database input file.",
00078         CArgDescriptions::eInputFile, "-");
00079 
00080     SetupArgDescriptions(arg_desc.release());
00081 }
00082 
00083 
00084 inline static string s_FormatAA(int aa) {
00085     return isprint((unsigned char) aa) ? string(1, (char) aa) : NStr::IntToString(aa);
00086 }
00087 
00088 
00089 static void s_BuiltInSM(const string& sm, map<string, string>& out_map)
00090 {
00091     const SNCBIPackedScoreMatrix*  psm = NULL;
00092     string desc;
00093 
00094     if        (sm == "blosum45") {
00095         psm = &NCBISM_Blosum45;
00096         desc =
00097         "    Matrix made by matblas from blosum45.iij\\\n"
00098         "    BLOSUM Clustered Scoring Matrix in 1/3 Bit Units\\\n"
00099         "    Blocks Database = /data/blocks_5.0/blocks.dat\\\n"
00100         "    Cluster Percentage: >= 45\\\n"
00101         "    Entropy =   0.3795, Expected =  -0.2789";
00102     } else if (sm == "blosum62") {
00103         psm = &NCBISM_Blosum62;
00104         desc =
00105         "    Matrix made by matblas from blosum62.iij\\\n"
00106         "    BLOSUM Clustered Scoring Matrix in 1/2 Bit Units\\\n"
00107         "    Blocks Database = /data/blocks_5.0/blocks.dat\\\n"
00108         "    Cluster Percentage: >= 62\\\n"
00109         "    Entropy =   0.6979, Expected =  -0.5209";
00110     } else if (sm == "blosum80") {
00111         psm = &NCBISM_Blosum80;
00112         desc =
00113         "    Matrix made by matblas from blosum80.iij\\\n"
00114         "    BLOSUM Clustered Scoring Matrix in 1/2 Bit Units\\\n"
00115         "    Blocks Database = /data/blocks_5.0/blocks.dat\\\n"
00116         "    Cluster Percentage: >= 80\\\n"
00117         "    Entropy =   0.9868, Expected =  -0.7442";
00118     } else if (sm == "pam30") {
00119         psm = &NCBISM_Pam30;
00120         desc =
00121         "    This matrix was produced by \\\"pam\\\" Version 1.0.6 [28-Jul-93]\\\n"
00122         "    PAM 30 substitution matrix, scale = ln(2)/2 = 0.346574\\\n"
00123         "    Expected score = -5.06, Entropy = 2.57 bits\\\n"
00124         "    Lowest score = -17, Highest score = 13";
00125     } else if (sm == "pam70") {
00126         psm = &NCBISM_Pam70;
00127         desc =
00128         "    This matrix was produced by \\\"pam\\\" Version 1.0.6 [28-Jul-93]\\\n"
00129         "    PAM 70 substitution matrix, scale = ln(2)/2 = 0.346574\\\n"
00130         "    Expected score = -2.77, Entropy = 1.60 bits\\\n"
00131         "    Lowest score = -11, Highest score = 13";
00132     } else if (sm == "pam250") {
00133         psm = &NCBISM_Pam250;
00134         desc =
00135         "    This matrix was produced by \\\"pam\\\" Version 1.0.7 [01-Feb-98]\\\n"
00136         "    using Dayhoff et al. (1978) mutability data.\\\n"
00137         "    PAM 250 substitution matrix, scale = ln(2)/3 = 0.231049\\\n"
00138         "    Expected score = -0.844, Entropy = 0.354 bits\\\n"
00139         "    Lowest score = -8, Highest score = 17";
00140     } else {
00141         _TROUBLE;
00142     }
00143 
00144     out_map["Name"] = sm;
00145     out_map["Builtin"] = sm;
00146     out_map["Description"] = desc.empty() ? sm : desc;
00147 
00148 
00149     int maxscore = numeric_limits<int>::min();
00150     int minscore = numeric_limits<int>::max();
00151 
00152     int l = strlen(psm->symbols);
00153 
00154     // find max and min.
00155     for (int i = 0;  i < l;  ++i) {
00156         if (isupper((unsigned char) psm->symbols[i])) {
00157             for (int j = 0;  j < l;  ++j) {
00158                 int value = psm->scores[i * l + j];
00159                 maxscore = max(value, maxscore);
00160                 minscore = min(value, minscore);
00161             }
00162         }
00163     }
00164     out_map["MinimumValue"] = NStr::IntToString(minscore);
00165     out_map["MaximumValue"] = NStr::IntToString(maxscore);
00166 }
00167 
00168 
00169 static void sWriteLine(CNcbiOstream& out, map<string, string>& out_map, const string& name)
00170 {
00171     if ( ! out_map[name].empty()) {
00172         out << name << " = " << out_map[name] << endl;
00173     }
00174 }
00175 
00176 
00177 static bool sReadAA_M(
00178     CNcbiIstream& in,
00179     const string& m_line,
00180     map<string, string>& out_map )
00181 {
00182     out_map["Method"] = "MatrixScore";
00183 
00184     // parse the rest of the M line.
00185     // look for '='.  The token after the first is the rows.
00186     // the token after the second '=' is the columns.
00187     list<string> toks;
00188     NStr::Split(m_line, " ,", toks);
00189     list<string>::iterator tok_it;
00190     tok_it = find(toks.begin(), toks.end(), "=");
00191     if (tok_it == toks.end())
00192         return false;
00193     ++tok_it;
00194     string row_bases(*tok_it);
00195 
00196     tok_it = find(++tok_it, toks.end(), "=");
00197     if (tok_it == toks.end())
00198         return false;
00199     ++tok_it;
00200     string col_bases(*tok_it);
00201 
00202     // make the Columns line.
00203     int cols = col_bases.size();
00204     string symbols(1, col_bases[0]);
00205     for (int i = 1;  i < cols;  ++i) {
00206         char c = col_bases[i];
00207         if (isupper((unsigned char) c)  ||  c == '-') {
00208             symbols += "       ";
00209             symbols += c;
00210         }
00211     }
00212     out_map["Columns"] = symbols;
00213 
00214     // Make all the TableRows lines.
00215     string line;
00216     string out_rows;
00217     int rows = row_bases.size();
00218     for (int r = 0; r < rows; ++r) {
00219         if (! getline(in, line))
00220             break;
00221         out_rows += row_bases[r];
00222         out_rows += " =";
00223         out_rows += line;
00224         out_rows += '\n';
00225     }
00226     out_map["TableRows"] = out_rows;
00227     return true;
00228 }
00229 
00230 // file aaindex1, with one score per amino acid,
00231 // always has the same amino acid order.  So here it is:
00232 static const string kAAIndexOrder("ARNDCQEGHILKMFPSTWYV");
00233 
00234 static bool sReadAA_I(CNcbiIstream& in, map<string, string>& out_map)
00235 {
00236     out_map["Method"] = "ColumnScore";
00237 
00238     string line1, line2;
00239     string out_rows;
00240     getline(in, line1);
00241     getline(in, line2);
00242     list<string> scores;
00243     NStr::Split(line1 + line2, " ", scores);
00244 
00245     int r = 0;
00246     ITERATE(list<string>, score_it, scores) {
00247         out_rows += kAAIndexOrder[r];
00248         out_rows += " = ";
00249         out_rows += *score_it;
00250         out_rows += '\n';
00251         ++r;
00252     }
00253     out_map["TableRows"] = out_rows;
00254     return true;
00255 }
00256 
00257 
00258 bool s_ReadAAIndex(const string& accession, CNcbiIstream& in, map<string, string>& out_map)
00259 {
00260     string line;
00261     // skip till we find the accession.
00262     while (getline(in, line)) {
00263         if (line == "H " + accession)
00264             break;
00265     }
00266     if ( ! in.good()) {
00267         cerr << "Accession \"" << accession << "\" not found." << endl;
00268         return false;
00269     }
00270 
00271     out_map["Name"] = NStr::TruncateSpaces(line.substr(2));
00272 
00273     string current_key;
00274     while (getline(in, line)) {
00275         if (line.empty())
00276             continue;
00277         char command = line[0];
00278         line = NStr::TruncateSpaces(line.substr(2));
00279         switch(command) {
00280         case 'D':   // Data description
00281             // assume that 'D' will come before the following.
00282             current_key = "Description";
00283             out_map[current_key] = line;
00284             break;
00285         case 'R':   // LITDB entry number
00286         case 'A':   // Author(s)
00287         case 'T':   // Title of the article
00288         case 'J':   // Journal reference
00289             // tack all of these on to the description.
00290             current_key = "Description";
00291             out_map[current_key] += "\\\n    " + line;
00292             break;
00293         case ' ':   // continuation lines.
00294             if ( ! current_key.empty()) {
00295                 out_map[current_key] += "\\\n    " +line;
00296             }
00297             break;
00298         case '*':   // Comment or missing
00299         case 'C':   // Accession numbers of similar entries
00300             // ignore these.
00301             current_key.erase();
00302             break;
00303         case 'M':   // Matrix data
00304             if (! sReadAA_M(in, line, out_map)) {
00305                 cerr << "Bad format in M section at accession \""
00306                     << accession << "\"" << endl;
00307                 return false;
00308             }
00309             return true;
00310         case 'I':   // Amino acid index data
00311             if (! sReadAA_I(in, out_map)) {
00312                 cerr << "Bad format in I section at accession \""
00313                     << accession << "\"" << endl;
00314                 return false;
00315             }
00316             return true;
00317         case '/':
00318             return false;
00319         }
00320     }
00321     cerr << "No I or M section at accession \""
00322         << accession << "\"" << endl;
00323     return false;
00324 }
00325 
00326 
00327 int CMakeScoreMethodApp::Run(void)
00328 {
00329     CArgs                          args = GetArgs();
00330     CNcbiOstream& out = args["out"].AsOutputFile();
00331 
00332     if (args["sm"]  &&  args["aa"] ) {
00333         string msg = "Options -sm and -aa are mutually exclusive.\n";
00334         cerr << GetArgDescriptions()->PrintUsage(msg);
00335         return 1;
00336     }
00337 
00338     // We could use a CNcbiRegistry to store and write out the lines
00339     // instead of this map.
00340     // But then we couldn't specify the order the lines in the file
00341     // and basically the resulting files would work just as well
00342     // but would look a lot worse.
00343 
00344     map<string, string> out_line;
00345 
00346     out_line["Method"] = "MatrixScore";
00347     out_line["Type"] = "Protein";
00348     out_line["MinimumColor"] = "yellow3";
00349     out_line["MaximumColor"] = "royal blue";
00350 
00351     if (args["sm"]) {
00352         s_BuiltInSM(args["sm"].AsString(), out_line);
00353     } else if (args["aa"]) {
00354         if (! s_ReadAAIndex(args["aa"].AsString(), args["in"].AsInputFile(), out_line)) {
00355             return 1;
00356         }
00357     }
00358 
00359     // specify order of the output lines here.
00360     out << "[Info]" << endl;
00361     sWriteLine(out, out_line, "Name");
00362     sWriteLine(out, out_line, "Description");
00363     sWriteLine(out, out_line, "Method");
00364     sWriteLine(out, out_line, "Type");
00365 
00366     out << "[Table]" << endl;
00367     sWriteLine(out, out_line, "MinimumValue");
00368     sWriteLine(out, out_line, "MinimumColor");
00369     sWriteLine(out, out_line, "MaximumValue");
00370     sWriteLine(out, out_line, "MaximumColor");
00371     if (args["sm"]) {
00372         sWriteLine(out, out_line, "Builtin");
00373     } else {
00374         sWriteLine(out, out_line, "Columns");
00375 
00376         out << "[TableRows]" << endl;
00377         out << out_line["TableRows"];
00378     }
00379 
00380     return 0;
00381 }
00382 
00383 END_NCBI_SCOPE
00384 USING_NCBI_SCOPE;
00385 
00386 int main(int argc, const char* argv[])
00387 {
00388     // Execute main application function
00389     return CMakeScoreMethodApp().AppMain(argc, argv, 0, eDS_Default, 0);
00390 }
00391 
00392 

Generated on Wed Mar 11 20:08:21 2009 for NCBI C++ ToolKit by  doxygen 1.4.6
Modified on Wed Mar 11 23:15:48 2009 by modify_doxy.py rev. 117643