Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
word_list_lang_model.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: word_list_lang_model.cpp
3  * Description: Implementation of the Word List Language Model Class
4  * Author: Ahmad Abdulkader
5  * Created: 2008
6  *
7  * (C) Copyright 2008, Google Inc.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  **********************************************************************/
19 
20 #include <string>
21 #include <vector>
22 #include "word_list_lang_model.h"
23 #include "cube_utils.h"
24 
25 #include "ratngs.h"
26 #include "trie.h"
27 
28 namespace tesseract {
30  cntxt_ = cntxt;
31  dawg_ = NULL;
32  init_ = false;
33 }
34 
36  Cleanup();
37 }
38 
39 // Cleanup
40 void WordListLangModel::Cleanup() {
41  if (dawg_ != NULL) {
42  delete dawg_;
43  dawg_ = NULL;
44  }
45  init_ = false;
46 }
47 
48 // Initialize the language model
49 bool WordListLangModel::Init() {
50  if (init_ == true) {
51  return true;
52  }
53  // The last parameter to the Trie constructor (the debug level) is set to
54  // false for now, until Cube has a way to express its preferred debug level.
55  dawg_ = new Trie(DAWG_TYPE_WORD, "", NO_PERM,
56  WordListLangModel::kMaxDawgEdges,
57  cntxt_->CharacterSet()->ClassCount(), false);
58  if (dawg_ == NULL) {
59  return false;
60  }
61  init_ = true;
62  return true;
63 }
64 
65 // return a pointer to the root
67  return NULL;
68 }
69 
70 // return the edges emerging from the current state
72  LangModEdge *edge,
73  int *edge_cnt) {
74  // initialize if necessary
75  if (init_ == false) {
76  if (Init() == false) {
77  return false;
78  }
79  }
80 
81  (*edge_cnt) = 0;
82 
83  EDGE_REF edge_ref;
84 
85  TessLangModEdge *tess_lm_edge = reinterpret_cast<TessLangModEdge *>(edge);
86 
87  if (tess_lm_edge == NULL) {
88  edge_ref = 0;
89  } else {
90  edge_ref = tess_lm_edge->EndEdge();
91 
92  // advance node
93  edge_ref = dawg_->next_node(edge_ref);
94  if (edge_ref == 0) {
95  return 0;
96  }
97  }
98 
99  // allocate memory for edges
100  LangModEdge **edge_array = new LangModEdge *[kMaxEdge];
101  if (edge_array == NULL) {
102  return NULL;
103  }
104 
105  // now get all the emerging edges
106  (*edge_cnt) += TessLangModEdge::CreateChildren(cntxt_, dawg_, edge_ref,
107  edge_array + (*edge_cnt));
108 
109  return edge_array;
110 }
111 
112 // returns true if the char_32 is supported by the language model
113 // TODO(ahmadab): currently not implemented; always returns false.
115  bool terminal, LangModEdge **edges) {
116  return false;
117 }
118 
119 // Recursive helper function for WordVariants().
120 void WordListLangModel::WordVariants(const CharSet &char_set,
121  string_32 prefix_str32,
122  WERD_CHOICE *word_so_far,
123  string_32 str32,
124  vector<WERD_CHOICE *> *word_variants) {
125  int str_len = str32.length();
126  if (str_len == 0) {
127  if (word_so_far->length() > 0) {
128  word_variants->push_back(new WERD_CHOICE(*word_so_far));
129  }
130  } else {
131  // Try out all the possible prefixes of the str32.
132  for (int len = 1; len <= str_len; len++) {
133  // Check if prefix is supported in character set.
134  string_32 str_pref32 = str32.substr(0, len);
135  int class_id = char_set.ClassID(reinterpret_cast<const char_32 *>(
136  str_pref32.c_str()));
137  if (class_id <= 0) {
138  continue;
139  } else {
140  string_32 new_prefix_str32 = prefix_str32 + str_pref32;
141  string_32 new_str32 = str32.substr(len);
142  word_so_far->append_unichar_id(class_id, 1, 0.0, 0.0);
143  WordVariants(char_set, new_prefix_str32, word_so_far, new_str32,
144  word_variants);
145  word_so_far->remove_last_unichar_id();
146  }
147  }
148  }
149 }
150 
151 // Compute all the variants of a 32-bit string in terms of the class-ids
152 // This is needed for languages that have ligatures. A word can then have more
153 // than one spelling in terms of the class-ids
155  const UNICHARSET *uchset, string_32 str32,
156  vector<WERD_CHOICE *> *word_variants) {
157  for (int i = 0; i < word_variants->size(); i++) {
158  delete (*word_variants)[i];
159  }
160  word_variants->clear();
161  string_32 prefix_str32;
162  WERD_CHOICE word_so_far(uchset);
163  WordVariants(char_set, prefix_str32, &word_so_far, str32, word_variants);
164 }
165 
166 // add a new UTF-8 string to the lang model
167 bool WordListLangModel::AddString(const char *char_ptr) {
168  if (!init_ && !Init()) { // initialize if necessary
169  return false;
170  }
171 
172  string_32 str32;
173  CubeUtils::UTF8ToUTF32(char_ptr, &str32);
174  if (str32.length() < 1) {
175  return false;
176  }
177  return AddString32(str32.c_str());
178 }
179 
180 // add a new UTF-32 string to the lang model
181 bool WordListLangModel::AddString32(const char_32 *char_32_ptr) {
182  if (char_32_ptr == NULL) {
183  return false;
184  }
185  // get all the word variants
186  vector<WERD_CHOICE *> word_variants;
187  WordVariants(*(cntxt_->CharacterSet()), cntxt_->TessUnicharset(),
188  char_32_ptr, &word_variants);
189 
190  if (word_variants.size() > 0) {
191  // find the shortest variant
192  int shortest_word = 0;
193  for (int word = 1; word < word_variants.size(); word++) {
194  if (word_variants[shortest_word]->length() >
195  word_variants[word]->length()) {
196  shortest_word = word;
197  }
198  }
199  // only add the shortest grapheme interpretation of string to the word list
200  dawg_->add_word_to_dawg(*word_variants[shortest_word]);
201  }
202  for (int i = 0; i < word_variants.size(); i++) { delete word_variants[i]; }
203  return true;
204 }
205 
206 }