NCBI Home IEB Home C++ Toolkit docs C Toolkit source browser C Toolkit source browser (2) |
NCBI C++ Toolkit Cross ReferenceC++/include/util/dictionary_util.hpp |
source navigation diff markup identifier search freetext search file search |
1 #ifndef UTIL___DICTIONARY_UTIL__HPP 2 #define UTIL___DICTIONARY_UTIL__HPP 3 4 /* $Id: dictionary_util.hpp 103491 2007-05-04 17:18:18Z kazimird $ 5 * =========================================================================== 6 * 7 * PUBLIC DOMAIN NOTICE 8 * National Center for Biotechnology Information 9 * 10 * This software/database is a "United States Government Work" under the 11 * terms of the United States Copyright Act. It was written as part of 12 * the author's official duties as a United States Government employee and 13 * thus cannot be copyrighted. This software/database is freely available 14 * to the public for use. The National Library of Medicine and the U.S. 15 * Government have not placed any restriction on its use or reproduction. 16 * 17 * Although all reasonable efforts have been taken to ensure the accuracy 18 * and reliability of the software and data, the NLM and the U.S. 19 * Government do not and cannot warrant the performance or results that 20 * may be obtained by using this software or data. The NLM and the U.S. 21 * Government disclaim all warranties, express or implied, including 22 * warranties of performance, merchantability or fitness for any particular 23 * purpose. 24 * 25 * Please cite the author in any work or product based on this material. 26 * 27 * =========================================================================== 28 * 29 * Authors: Mike DiCuccio 30 * 31 * File Description: 32 * 33 */ 34 35 #include <corelib/ncbistd.hpp> 36 37 38 BEGIN_NCBI_SCOPE 39 40 41 /// 42 /// Standard dictionary utility functions 43 /// 44 class NCBI_XUTIL_EXPORT CDictionaryUtil 45 { 46 public: 47 enum { 48 eMaxSoundex = 4, 49 eMaxMetaphone = 4 50 }; 51 52 /// Compute the Soundex key for a given word 53 /// The Soundex key is defined as: 54 /// - the first leter of the word, capitalized 55 /// - convert the remaining letters based on simple substitutions and 56 /// groupings of similar letters 57 /// - remove duplicates 58 /// - pad with '0' to the maximum number of characters 59 /// 60 /// The final step is non-standard; the usual pad is ' ' 61 static void GetSoundex(const string& in, string* out, 62 size_t max_chars = eMaxSoundex, 63 char pad_char = '0'); 64 65 /// Compute the Metaphone key for a given word 66 /// Metaphone is a more advanced algorithm than Soundex; instead of 67 /// matching simple letters, Metaphone matches diphthongs. The rules are 68 /// complex, and try to match how languages are pronounced. The 69 /// implementation here borrows some options from Double Metaphone; the 70 /// modifications from the traditional Metaphone algorithm include: 71 /// - all leading vowels are rendered as 'a' (from Double Metaphone) 72 /// - rules regarding substitution of dge/dgi/dgy -> j were loosened a bit 73 /// to permit such substitutions at the end of the word 74 /// - 'y' is treated as a vowel if surrounded by consonants 75 static void GetMetaphone(const string& in, string* out, 76 size_t max_chars = eMaxMetaphone); 77 78 /// Compute the Porter stem for a given word. 79 /// Porter's stemming algorithm is one of many automated stemming 80 /// algorithms; unlike most, Porter's stemming algorithm is a widely 81 /// accepted standard algorithm for generating word stems. 82 /// 83 /// A description of the algorithm is available at 84 /// 85 /// http://www.tartarus.org/~martin/PorterStemmer/def.txt 86 /// 87 /// The essence of the algorithm is to repeatedly strip likely word 88 /// suffixes such as -ed, -es, -s, -ess, -ness, -ability, -ly, and so 89 /// forth, leaving a residue of a word that can be compared with other stem 90 /// sources. The goal is to permit comparison of socuh words as: 91 /// 92 /// compare 93 /// comparable 94 /// comparability 95 /// comparably 96 /// 97 /// since they all contain approximately the same meaning. 98 /// 99 /// This algorithm assumes that word case has already been adjusted to 100 /// lower case. 101 static void Stem(const string& in_str, string* out_str); 102 103 /// Return the Levenshtein edit distance between two words. Two possible 104 /// methods of computation are supported - an exact method with quadratic 105 /// complexity and a method suitable for similar words with a near-linear 106 /// complexity. The similar algorithm is suitable for almost all words we 107 /// would encounter; it will render inaccuracies if the number of 108 /// consecutive differences is greater than three. 109 110 enum EDistanceMethod { 111 /// This method performs an exhausive search, and has an 112 /// algorithmic complexity of O(n x m), where n = length of str1 and 113 /// m = length of str2 114 eEditDistance_Exact, 115 116 /// This method performs a simpler search, looking for the distance 117 /// between similar words. Words with more than two consecutively 118 /// different characters will be scored incorrectly. 119 eEditDistance_Similar 120 }; 121 static size_t GetEditDistance(const string& str1, const string& str2, 122 EDistanceMethod method = eEditDistance_Exact); 123 124 /// Compute a nearness score for two different words or phrases 125 static int Score(const string& word1, const string& word2, 126 size_t max_metaphone = eMaxMetaphone); 127 128 /// Compute a nearness score based on metaphone as well as raw distance 129 static int Score(const string& word1, const string& meta1, 130 const string& word2, const string& meta2, 131 EDistanceMethod method = eEditDistance_Similar); 132 133 }; 134 135 136 137 138 END_NCBI_SCOPE 139 140 #endif // UTIL___DICTIONARY_UTIL__HPP 141
This page was automatically generated by the
LXR engine.
Visit the LXR main site for more information. |