00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036 #include <ncbi_pch.hpp>
00037 #include <corelib/ncbiapp.hpp>
00038 #include <util/tables/raw_scoremat.h>
00039 #include <corelib/ncbireg.hpp>
00040 #include <corelib/ncbi_limits.hpp>
00041
00042 #include <ctype.h>
00043 #include <iostream>
00044 #include <iomanip>
00045 #include <algorithm>
00046
00047
00048 BEGIN_NCBI_SCOPE
00049
00050 class CMakeScoreMethodApp : public CNcbiApplication
00051 {
00052 private:
00053 void Init(void);
00054 int Run(void);
00055 };
00056
00057
00058 void CMakeScoreMethodApp::Init(void)
00059 {
00060 auto_ptr<CArgDescriptions> arg_desc(new CArgDescriptions);
00061 arg_desc->SetUsageContext(GetArguments().GetProgramBasename(),
00062 "Make alignment scoring method files");
00063
00064 arg_desc->AddDefaultKey("out", "output", "File name for scoring method",
00065 CArgDescriptions::eOutputFile, "-");
00066
00067 arg_desc->AddOptionalKey("sm", "matrix", "name of score matrix to use",
00068 CArgDescriptions::eString);
00069 arg_desc->SetConstraint
00070 ("sm", &(*new CArgAllow_Strings,
00071 "blosum45", "blosum62", "blosum80", "pam30", "pam70", "pam250"));
00072
00073 arg_desc->AddOptionalKey("aa", "accession",
00074 "Amino Acid Index Database accession number",
00075 CArgDescriptions::eString);
00076 arg_desc->AddDefaultKey("in", "aaindex_file",
00077 "Amino Acid Index Database input file.",
00078 CArgDescriptions::eInputFile, "-");
00079
00080 SetupArgDescriptions(arg_desc.release());
00081 }
00082
00083
00084 inline static string s_FormatAA(int aa) {
00085 return isprint((unsigned char) aa) ? string(1, (char) aa) : NStr::IntToString(aa);
00086 }
00087
00088
00089 static void s_BuiltInSM(const string& sm, map<string, string>& out_map)
00090 {
00091 const SNCBIPackedScoreMatrix* psm = NULL;
00092 string desc;
00093
00094 if (sm == "blosum45") {
00095 psm = &NCBISM_Blosum45;
00096 desc =
00097 " Matrix made by matblas from blosum45.iij\\\n"
00098 " BLOSUM Clustered Scoring Matrix in 1/3 Bit Units\\\n"
00099 " Blocks Database = /data/blocks_5.0/blocks.dat\\\n"
00100 " Cluster Percentage: >= 45\\\n"
00101 " Entropy = 0.3795, Expected = -0.2789";
00102 } else if (sm == "blosum62") {
00103 psm = &NCBISM_Blosum62;
00104 desc =
00105 " Matrix made by matblas from blosum62.iij\\\n"
00106 " BLOSUM Clustered Scoring Matrix in 1/2 Bit Units\\\n"
00107 " Blocks Database = /data/blocks_5.0/blocks.dat\\\n"
00108 " Cluster Percentage: >= 62\\\n"
00109 " Entropy = 0.6979, Expected = -0.5209";
00110 } else if (sm == "blosum80") {
00111 psm = &NCBISM_Blosum80;
00112 desc =
00113 " Matrix made by matblas from blosum80.iij\\\n"
00114 " BLOSUM Clustered Scoring Matrix in 1/2 Bit Units\\\n"
00115 " Blocks Database = /data/blocks_5.0/blocks.dat\\\n"
00116 " Cluster Percentage: >= 80\\\n"
00117 " Entropy = 0.9868, Expected = -0.7442";
00118 } else if (sm == "pam30") {
00119 psm = &NCBISM_Pam30;
00120 desc =
00121 " This matrix was produced by \\\"pam\\\" Version 1.0.6 [28-Jul-93]\\\n"
00122 " PAM 30 substitution matrix, scale = ln(2)/2 = 0.346574\\\n"
00123 " Expected score = -5.06, Entropy = 2.57 bits\\\n"
00124 " Lowest score = -17, Highest score = 13";
00125 } else if (sm == "pam70") {
00126 psm = &NCBISM_Pam70;
00127 desc =
00128 " This matrix was produced by \\\"pam\\\" Version 1.0.6 [28-Jul-93]\\\n"
00129 " PAM 70 substitution matrix, scale = ln(2)/2 = 0.346574\\\n"
00130 " Expected score = -2.77, Entropy = 1.60 bits\\\n"
00131 " Lowest score = -11, Highest score = 13";
00132 } else if (sm == "pam250") {
00133 psm = &NCBISM_Pam250;
00134 desc =
00135 " This matrix was produced by \\\"pam\\\" Version 1.0.7 [01-Feb-98]\\\n"
00136 " using Dayhoff et al. (1978) mutability data.\\\n"
00137 " PAM 250 substitution matrix, scale = ln(2)/3 = 0.231049\\\n"
00138 " Expected score = -0.844, Entropy = 0.354 bits\\\n"
00139 " Lowest score = -8, Highest score = 17";
00140 } else {
00141 _TROUBLE;
00142 }
00143
00144 out_map["Name"] = sm;
00145 out_map["Builtin"] = sm;
00146 out_map["Description"] = desc.empty() ? sm : desc;
00147
00148
00149 int maxscore = numeric_limits<int>::min();
00150 int minscore = numeric_limits<int>::max();
00151
00152 int l = strlen(psm->symbols);
00153
00154
00155 for (int i = 0; i < l; ++i) {
00156 if (isupper((unsigned char) psm->symbols[i])) {
00157 for (int j = 0; j < l; ++j) {
00158 int value = psm->scores[i * l + j];
00159 maxscore = max(value, maxscore);
00160 minscore = min(value, minscore);
00161 }
00162 }
00163 }
00164 out_map["MinimumValue"] = NStr::IntToString(minscore);
00165 out_map["MaximumValue"] = NStr::IntToString(maxscore);
00166 }
00167
00168
00169 static void sWriteLine(CNcbiOstream& out, map<string, string>& out_map, const string& name)
00170 {
00171 if ( ! out_map[name].empty()) {
00172 out << name << " = " << out_map[name] << endl;
00173 }
00174 }
00175
00176
00177 static bool sReadAA_M(
00178 CNcbiIstream& in,
00179 const string& m_line,
00180 map<string, string>& out_map )
00181 {
00182 out_map["Method"] = "MatrixScore";
00183
00184
00185
00186
00187 list<string> toks;
00188 NStr::Split(m_line, " ,", toks);
00189 list<string>::iterator tok_it;
00190 tok_it = find(toks.begin(), toks.end(), "=");
00191 if (tok_it == toks.end())
00192 return false;
00193 ++tok_it;
00194 string row_bases(*tok_it);
00195
00196 tok_it = find(++tok_it, toks.end(), "=");
00197 if (tok_it == toks.end())
00198 return false;
00199 ++tok_it;
00200 string col_bases(*tok_it);
00201
00202
00203 int cols = col_bases.size();
00204 string symbols(1, col_bases[0]);
00205 for (int i = 1; i < cols; ++i) {
00206 char c = col_bases[i];
00207 if (isupper((unsigned char) c) || c == '-') {
00208 symbols += " ";
00209 symbols += c;
00210 }
00211 }
00212 out_map["Columns"] = symbols;
00213
00214
00215 string line;
00216 string out_rows;
00217 int rows = row_bases.size();
00218 for (int r = 0; r < rows; ++r) {
00219 if (! getline(in, line))
00220 break;
00221 out_rows += row_bases[r];
00222 out_rows += " =";
00223 out_rows += line;
00224 out_rows += '\n';
00225 }
00226 out_map["TableRows"] = out_rows;
00227 return true;
00228 }
00229
00230
00231
00232 static const string kAAIndexOrder("ARNDCQEGHILKMFPSTWYV");
00233
00234 static bool sReadAA_I(CNcbiIstream& in, map<string, string>& out_map)
00235 {
00236 out_map["Method"] = "ColumnScore";
00237
00238 string line1, line2;
00239 string out_rows;
00240 getline(in, line1);
00241 getline(in, line2);
00242 list<string> scores;
00243 NStr::Split(line1 + line2, " ", scores);
00244
00245 int r = 0;
00246 ITERATE(list<string>, score_it, scores) {
00247 out_rows += kAAIndexOrder[r];
00248 out_rows += " = ";
00249 out_rows += *score_it;
00250 out_rows += '\n';
00251 ++r;
00252 }
00253 out_map["TableRows"] = out_rows;
00254 return true;
00255 }
00256
00257
00258 bool s_ReadAAIndex(const string& accession, CNcbiIstream& in, map<string, string>& out_map)
00259 {
00260 string line;
00261
00262 while (getline(in, line)) {
00263 if (line == "H " + accession)
00264 break;
00265 }
00266 if ( ! in.good()) {
00267 cerr << "Accession \"" << accession << "\" not found." << endl;
00268 return false;
00269 }
00270
00271 out_map["Name"] = NStr::TruncateSpaces(line.substr(2));
00272
00273 string current_key;
00274 while (getline(in, line)) {
00275 if (line.empty())
00276 continue;
00277 char command = line[0];
00278 line = NStr::TruncateSpaces(line.substr(2));
00279 switch(command) {
00280 case 'D':
00281
00282 current_key = "Description";
00283 out_map[current_key] = line;
00284 break;
00285 case 'R':
00286 case 'A':
00287 case 'T':
00288 case 'J':
00289
00290 current_key = "Description";
00291 out_map[current_key] += "\\\n " + line;
00292 break;
00293 case ' ':
00294 if ( ! current_key.empty()) {
00295 out_map[current_key] += "\\\n " +line;
00296 }
00297 break;
00298 case '*':
00299 case 'C':
00300
00301 current_key.erase();
00302 break;
00303 case 'M':
00304 if (! sReadAA_M(in, line, out_map)) {
00305 cerr << "Bad format in M section at accession \""
00306 << accession << "\"" << endl;
00307 return false;
00308 }
00309 return true;
00310 case 'I':
00311 if (! sReadAA_I(in, out_map)) {
00312 cerr << "Bad format in I section at accession \""
00313 << accession << "\"" << endl;
00314 return false;
00315 }
00316 return true;
00317 case '/':
00318 return false;
00319 }
00320 }
00321 cerr << "No I or M section at accession \""
00322 << accession << "\"" << endl;
00323 return false;
00324 }
00325
00326
00327 int CMakeScoreMethodApp::Run(void)
00328 {
00329 CArgs args = GetArgs();
00330 CNcbiOstream& out = args["out"].AsOutputFile();
00331
00332 if (args["sm"] && args["aa"] ) {
00333 string msg = "Options -sm and -aa are mutually exclusive.\n";
00334 cerr << GetArgDescriptions()->PrintUsage(msg);
00335 return 1;
00336 }
00337
00338
00339
00340
00341
00342
00343
00344 map<string, string> out_line;
00345
00346 out_line["Method"] = "MatrixScore";
00347 out_line["Type"] = "Protein";
00348 out_line["MinimumColor"] = "yellow3";
00349 out_line["MaximumColor"] = "royal blue";
00350
00351 if (args["sm"]) {
00352 s_BuiltInSM(args["sm"].AsString(), out_line);
00353 } else if (args["aa"]) {
00354 if (! s_ReadAAIndex(args["aa"].AsString(), args["in"].AsInputFile(), out_line)) {
00355 return 1;
00356 }
00357 }
00358
00359
00360 out << "[Info]" << endl;
00361 sWriteLine(out, out_line, "Name");
00362 sWriteLine(out, out_line, "Description");
00363 sWriteLine(out, out_line, "Method");
00364 sWriteLine(out, out_line, "Type");
00365
00366 out << "[Table]" << endl;
00367 sWriteLine(out, out_line, "MinimumValue");
00368 sWriteLine(out, out_line, "MinimumColor");
00369 sWriteLine(out, out_line, "MaximumValue");
00370 sWriteLine(out, out_line, "MaximumColor");
00371 if (args["sm"]) {
00372 sWriteLine(out, out_line, "Builtin");
00373 } else {
00374 sWriteLine(out, out_line, "Columns");
00375
00376 out << "[TableRows]" << endl;
00377 out << out_line["TableRows"];
00378 }
00379
00380 return 0;
00381 }
00382
00383 END_NCBI_SCOPE
00384 USING_NCBI_SCOPE;
00385
00386 int main(int argc, const char* argv[])
00387 {
00388
00389 return CMakeScoreMethodApp().AppMain(argc, argv, 0, eDS_Default, 0);
00390 }
00391
00392