NCBI C++ Toolkit Cross Reference

C++/include/util/dictionary_util.hpp

  1 #ifndef UTIL___DICTIONARY_UTIL__HPP
  2 #define UTIL___DICTIONARY_UTIL__HPP
  3 
  4 /*  $Id: dictionary_util.hpp 103491 2007-05-04 17:18:18Z kazimird $
  5  * ===========================================================================
  6  *
  7  *                            PUBLIC DOMAIN NOTICE
  8  *               National Center for Biotechnology Information
  9  *
 10  *  This software/database is a "United States Government Work" under the
 11  *  terms of the United States Copyright Act.  It was written as part of
 12  *  the author's official duties as a United States Government employee and
 13  *  thus cannot be copyrighted.  This software/database is freely available
 14  *  to the public for use. The National Library of Medicine and the U.S.
 15  *  Government have not placed any restriction on its use or reproduction.
 16  *
 17  *  Although all reasonable efforts have been taken to ensure the accuracy
 18  *  and reliability of the software and data, the NLM and the U.S.
 19  *  Government do not and cannot warrant the performance or results that
 20  *  may be obtained by using this software or data. The NLM and the U.S.
 21  *  Government disclaim all warranties, express or implied, including
 22  *  warranties of performance, merchantability or fitness for any particular
 23  *  purpose.
 24  *
 25  *  Please cite the author in any work or product based on this material.
 26  *
 27  * ===========================================================================
 28  *
 29  * Authors:  Mike DiCuccio
 30  *
 31  * File Description:
 32  *
 33  */
 34 
 35 #include <corelib/ncbistd.hpp>
 36 
 37 
 38 BEGIN_NCBI_SCOPE
 39 
 40 
 41 ///
 42 /// Standard dictionary utility functions
 43 ///
 44 class NCBI_XUTIL_EXPORT CDictionaryUtil
 45 {
 46 public:
 47     enum {
 48         eMaxSoundex = 4,
 49         eMaxMetaphone = 4
 50     };
 51 
 52     /// Compute the Soundex key for a given word
 53     /// The Soundex key is defined as:
 54     ///   - the first leter of the word, capitalized
 55     ///   - convert the remaining letters based on simple substitutions and
 56     ///     groupings of similar letters
 57     ///   - remove duplicates
 58     ///   - pad with '0' to the maximum number of characters
 59     ///
 60     /// The final step is non-standard; the usual pad is ' '
 61     static void GetSoundex(const string& in, string* out,
 62                            size_t max_chars = eMaxSoundex,
 63                            char pad_char = '0');
 64 
 65     /// Compute the Metaphone key for a given word
 66     /// Metaphone is a more advanced algorithm than Soundex; instead of
 67     /// matching simple letters, Metaphone matches diphthongs.  The rules are
 68     /// complex, and try to match how languages are pronounced.  The
 69     /// implementation here borrows some options from Double Metaphone; the
 70     /// modifications from the traditional Metaphone algorithm include:
 71     ///  - all leading vowels are rendered as 'a' (from Double Metaphone)
 72     ///  - rules regarding substitution of dge/dgi/dgy -> j were loosened a bit
 73     ///  to permit such substitutions at the end of the word
 74     ///  - 'y' is treated as a vowel if surrounded by consonants
 75     static void GetMetaphone(const string& in, string* out,
 76                              size_t max_chars = eMaxMetaphone);
 77 
 78     /// Compute the Porter stem for a given word.
 79     /// Porter's stemming algorithm is one of many automated stemming
 80     /// algorithms; unlike most, Porter's stemming algorithm is a widely
 81     /// accepted standard algorithm for generating word stems.
 82     ///
 83     /// A description of the algorithm is available at
 84     ///
 85     ///   http://www.tartarus.org/~martin/PorterStemmer/def.txt
 86     ///
 87     /// The essence of the algorithm is to repeatedly strip likely word
 88     /// suffixes such as -ed, -es, -s, -ess, -ness, -ability, -ly, and so
 89     /// forth, leaving a residue of a word that can be compared with other stem
 90     /// sources.  The goal is to permit comparison of socuh words as:
 91     ///
 92     ///    compare
 93     ///    comparable
 94     ///    comparability
 95     ///    comparably
 96     ///
 97     /// since they all contain approximately the same meaning.
 98     ///
 99     /// This algorithm assumes that word case has already been adjusted to
100     /// lower case.
101     static void Stem(const string& in_str, string* out_str);
102 
103     /// Return the Levenshtein edit distance between two words.  Two possible
104     /// methods of computation are supported - an exact method with quadratic
105     /// complexity and a method suitable for similar words with a near-linear
106     /// complexity.  The similar algorithm is suitable for almost all words we
107     /// would encounter; it will render inaccuracies if the number of
108     /// consecutive differences is greater than three.
109 
110     enum EDistanceMethod {
111         /// This method performs an exhausive search, and has an
112         /// algorithmic complexity of O(n x m), where n = length of str1 and 
113         /// m = length of str2
114         eEditDistance_Exact,
115 
116         /// This method performs a simpler search, looking for the distance
117         /// between similar words.  Words with more than two consecutively
118         /// different characters will be scored incorrectly.
119         eEditDistance_Similar
120     };
121     static size_t GetEditDistance(const string& str1, const string& str2,
122                                   EDistanceMethod method = eEditDistance_Exact);
123 
124     /// Compute a nearness score for two different words or phrases
125     static int Score(const string& word1, const string& word2,
126                      size_t max_metaphone = eMaxMetaphone);
127 
128     /// Compute a nearness score based on metaphone as well as raw distance
129     static int Score(const string& word1, const string& meta1,
130                      const string& word2, const string& meta2,
131                      EDistanceMethod method = eEditDistance_Similar);
132 
133 };
134 
135 
136 
137 
138 END_NCBI_SCOPE
139 
140 #endif  // UTIL___DICTIONARY_UTIL__HPP
141
[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ]
This page was automatically generated by the LXR engine.
Visit the LXR main site for more information.