Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
reject.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: reject.cpp (Formerly reject.c)
3  * Description: Rejection functions used in tessedit
4  * Author: Phil Cheatle
5  * Created: Wed Sep 23 16:50:21 BST 1992
6  *
7  * (C) Copyright 1992, Hewlett-Packard Ltd.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  **********************************************************************/
19 
20 #ifdef _MSC_VER
21 #pragma warning(disable:4244) // Conversion warnings
22 #pragma warning(disable:4305) // int/float warnings
23 #endif
24 
25 #include "mfcpch.h"
26 
27 #include "tessvars.h"
28 #ifdef __UNIX__
29 #include <assert.h>
30 #include <errno.h>
31 #endif
32 #include "scanutils.h"
33 #include <ctype.h>
34 #include <string.h>
35 #include "memry.h"
36 #include "reject.h"
37 #include "tfacep.h"
38 #include "imgs.h"
39 #include "control.h"
40 #include "docqual.h"
41 #include "secname.h"
42 #include "globals.h"
43 #include "helpers.h"
44 
45 /* #define SECURE_NAMES done in secnames.h when necessary */
46 
47 #include "tesseractclass.h"
48 #include "notdll.h"
49 
50 // Include automatically generated configuration file if running autoconf.
51 #ifdef HAVE_CONFIG_H
52 #include "config_auto.h"
53 #endif
54 
56 
57 /*************************************************************************
58  * set_done()
59  *
60  * Set the done flag based on the word acceptability criteria
61  *************************************************************************/
62 
63 namespace tesseract {
64 void Tesseract::set_done( //set done flag
65  WERD_RES *word,
66  inT16 pass) {
67  /*
68  0: Original heuristic used in Tesseract and Ray's prototype Resaljet
69  */
70  if (tessedit_ok_mode == 0) {
71  /* NOTE - done even if word contains some or all spaces !!! */
72  word->done = word->tess_accepted;
73  }
74  /*
75  1: Reject words containing blanks and on pass 1 reject I/l/1 conflicts
76  */
77  else if (tessedit_ok_mode == 1) {
78  word->done = word->tess_accepted &&
79  (strchr (word->best_choice->unichar_string().string (), ' ') == NULL);
80 
81  if (word->done && (pass == 1) && one_ell_conflict (word, FALSE))
82  word->done = FALSE;
83  }
84  /*
85  2: as 1 + only accept dict words or numerics in pass 1
86  */
87  else if (tessedit_ok_mode == 2) {
88  word->done = word->tess_accepted &&
89  (strchr (word->best_choice->unichar_string().string (), ' ') == NULL);
90 
91  if (word->done && (pass == 1) && one_ell_conflict (word, FALSE))
92  word->done = FALSE;
93 
94  if (word->done &&
95  (pass == 1) &&
96  (word->best_choice->permuter () != SYSTEM_DAWG_PERM) &&
97  (word->best_choice->permuter () != FREQ_DAWG_PERM) &&
98  (word->best_choice->permuter () != USER_DAWG_PERM) &&
99  (word->best_choice->permuter () != NUMBER_PERM)) {
100  #ifndef SECURE_NAMES
101  if (tessedit_rejection_debug)
102  tprintf ("\nVETO Tess accepting poor word \"%s\"\n",
103  word->best_choice->unichar_string().string ());
104  #endif
105  word->done = FALSE;
106  }
107  }
108  /*
109  3: as 2 + only accept dict words or numerics in pass 2 as well
110  */
111  else if (tessedit_ok_mode == 3) {
112  word->done = word->tess_accepted &&
113  (strchr (word->best_choice->unichar_string().string (), ' ') == NULL);
114 
115  if (word->done && (pass == 1) && one_ell_conflict (word, FALSE))
116  word->done = FALSE;
117 
118  if (word->done &&
119  (word->best_choice->permuter () != SYSTEM_DAWG_PERM) &&
120  (word->best_choice->permuter () != FREQ_DAWG_PERM) &&
121  (word->best_choice->permuter () != USER_DAWG_PERM) &&
122  (word->best_choice->permuter () != NUMBER_PERM)) {
123  #ifndef SECURE_NAMES
124  if (tessedit_rejection_debug)
125  tprintf ("\nVETO Tess accepting poor word \"%s\"\n",
126  word->best_choice->unichar_string().string ());
127  #endif
128  word->done = FALSE;
129  }
130  }
131  /*
132  4: as 2 + reject dict ambigs in pass 1
133  */
134  else if (tessedit_ok_mode == 4) {
135  word->done = word->tess_accepted &&
136  (strchr (word->best_choice->unichar_string().string (), ' ') == NULL);
137 
138  if (word->done && (pass == 1) && one_ell_conflict (word, FALSE))
139  word->done = FALSE;
140 
141  if (word->done &&
142  (pass == 1) &&
143  (((word->best_choice->permuter () != SYSTEM_DAWG_PERM) &&
144  (word->best_choice->permuter () != FREQ_DAWG_PERM) &&
145  (word->best_choice->permuter () != USER_DAWG_PERM) &&
146  (word->best_choice->permuter () != NUMBER_PERM)) ||
147  (test_ambig_word (word)))) {
148  #ifndef SECURE_NAMES
149  if (tessedit_rejection_debug)
150  tprintf ("\nVETO Tess accepting poor word \"%s\"\n",
151  word->best_choice->unichar_string().string ());
152  #endif
153  word->done = FALSE;
154  }
155  }
156  /*
157  5: as 3 + reject dict ambigs in both passes
158  */
159  else if (tessedit_ok_mode == 5) {
160  word->done = word->tess_accepted &&
161  (strchr (word->best_choice->unichar_string().string (), ' ') == NULL);
162 
163  if (word->done && (pass == 1) && one_ell_conflict (word, FALSE))
164  word->done = FALSE;
165 
166  if (word->done &&
167  (((word->best_choice->permuter () != SYSTEM_DAWG_PERM) &&
168  (word->best_choice->permuter () != FREQ_DAWG_PERM) &&
169  (word->best_choice->permuter () != USER_DAWG_PERM) &&
170  (word->best_choice->permuter () != NUMBER_PERM)) ||
171  (test_ambig_word (word)))) {
172  #ifndef SECURE_NAMES
173  if (tessedit_rejection_debug)
174  tprintf ("\nVETO Tess accepting poor word \"%s\"\n",
175  word->best_choice->unichar_string().string ());
176  #endif
177  word->done = FALSE;
178  }
179  }
180 
181  else {
182  tprintf ("BAD tessedit_ok_mode\n");
183  err_exit();
184  }
185 }
186 
187 
188 /*************************************************************************
189  * make_reject_map()
190  *
191  * Sets the done flag to indicate whether the result is acceptable.
192  *
193  * Sets a reject map for the word.
194  *************************************************************************/
195 void Tesseract::make_reject_map( //make rej map for wd //detailed results
196  WERD_RES *word,
197  BLOB_CHOICE_LIST_CLIST *blob_choices,
198  ROW *row,
199  inT16 pass //1st or 2nd?
200  ) {
201  int i;
202  int offset;
203 
204  flip_0O(word);
205  check_debug_pt(word, -1); // For trap only
206  set_done(word, pass); // Set acceptance
208  reject_blanks(word);
209  /*
210  0: Rays original heuristic - the baseline
211  */
212  if (tessedit_reject_mode == 0) {
213  if (!word->done)
214  reject_poor_matches(word, blob_choices);
215  } else if (tessedit_reject_mode == 5) {
216  /*
217  5: Reject I/1/l from words where there is no strong contextual confirmation;
218  the whole of any unacceptable words (incl PERM rej of dubious 1/I/ls);
219  and the whole of any words which are very small
220  */
221  if (kBlnXHeight / word->denorm.y_scale() <= min_sane_x_ht_pixels) {
223  } else {
224  one_ell_conflict(word, TRUE);
225  /*
226  Originally the code here just used the done flag. Now I have duplicated
227  and unpacked the conditions for setting the done flag so that each
228  mechanism can be turned on or off independently. This works WITHOUT
229  affecting the done flag setting.
230  */
231  if (rej_use_tess_accepted && !word->tess_accepted)
233 
234  if (rej_use_tess_blanks &&
235  (strchr (word->best_choice->unichar_string().string (), ' ') != NULL))
237 
238  WERD_CHOICE* best_choice = word->best_choice;
239  if (rej_use_good_perm) {
240  if ((best_choice->permuter() == SYSTEM_DAWG_PERM ||
241  best_choice->permuter() == FREQ_DAWG_PERM ||
242  best_choice->permuter() == USER_DAWG_PERM) &&
243  (!rej_use_sensible_wd ||
244  acceptable_word_string(*word->uch_set,
245  best_choice->unichar_string().string(),
246  best_choice->unichar_lengths().string()) !=
247  AC_UNACCEPTABLE)) {
248  // PASSED TEST
249  } else if (best_choice->permuter() == NUMBER_PERM) {
250  if (rej_alphas_in_number_perm) {
251  for (i = 0, offset = 0;
252  best_choice->unichar_string()[offset] != '\0';
253  offset += best_choice->unichar_lengths()[i++]) {
254  if (word->reject_map[i].accepted() &&
255  word->uch_set->get_isalpha(
256  best_choice->unichar_string().string() + offset,
257  best_choice->unichar_lengths()[i]))
258  word->reject_map[i].setrej_bad_permuter();
259  // rej alpha
260  }
261  }
262  } else {
264  }
265  }
266  /* Ambig word rejection was here once !!*/
267  }
268  } else {
269  tprintf("BAD tessedit_reject_mode\n");
270  err_exit();
271  }
272 
273  if (tessedit_image_border > -1)
274  reject_edge_blobs(word);
275 
276  check_debug_pt (word, 10);
277  if (tessedit_rejection_debug) {
278  tprintf("Permuter Type = %d\n", word->best_choice->permuter ());
279  tprintf("Certainty: %f Rating: %f\n",
280  word->best_choice->certainty (), word->best_choice->rating ());
281  tprintf("Dict word: %d\n", dict_word(*(word->best_choice)));
282  }
283 
284  flip_hyphens(word);
285  check_debug_pt(word, 20);
286 }
287 } // namespace tesseract
288 
289 
290 void reject_blanks(WERD_RES *word) {
291  inT16 i;
292  inT16 offset;
293 
294  for (i = 0, offset = 0; word->best_choice->unichar_string()[offset] != '\0';
295  offset += word->best_choice->unichar_lengths()[i], i += 1) {
296  if (word->best_choice->unichar_string()[offset] == ' ')
297  //rej unrecognised blobs
298  word->reject_map[i].setrej_tess_failure ();
299  }
300 }
301 
302 namespace tesseract {
304  inT16 i;
305  inT16 offset;
306 
307  for (i = 0, offset = 0; word->best_choice->unichar_string()[offset] != '\0';
308  offset += word->best_choice->unichar_lengths()[i], i += 1) {
310  contains (word->best_choice->unichar_string()[offset])) {
311  //rej 1Il conflict
312  word->reject_map[i].setrej_1Il_conflict ();
313  }
314  }
315 }
316 } // namespace tesseract
317 
318 
319 void reject_poor_matches( //detailed results
320  WERD_RES *word,
321  BLOB_CHOICE_LIST_CLIST *blob_choices) {
322  float threshold;
323  inT16 i = 0;
324  inT16 offset = 0;
325  //super iterator
326  BLOB_CHOICE_LIST_C_IT list_it = blob_choices;
327  BLOB_CHOICE_IT choice_it; //real iterator
328 
329  #ifndef SECURE_NAMES
330  if (strlen(word->best_choice->unichar_lengths().string()) !=
331  list_it.length()) {
332  tprintf
333  ("ASSERT FAIL string:\"%s\"; strlen=%d; choices len=%d; blob len=%d\n",
334  word->best_choice->unichar_string().string(),
335  strlen (word->best_choice->unichar_lengths().string()), list_it.length(),
336  word->box_word->length());
337  }
338  #endif
339  ASSERT_HOST (strlen (word->best_choice->unichar_lengths().string ()) ==
340  list_it.length ());
341  ASSERT_HOST(word->box_word->length() == list_it.length());
342  threshold = compute_reject_threshold (blob_choices);
343 
344  for (list_it.mark_cycle_pt ();
345  !list_it.cycled_list (); list_it.forward (), i++,
346  offset += word->best_choice->unichar_lengths()[i]) {
347  /* NB - only compares the threshold against the TOP choice char in the
348  choices list for a blob !! - the selected one may be below the threshold
349  */
350  choice_it.set_to_list (list_it.data ());
351  if ((word->best_choice->unichar_string()[offset] == ' ') ||
352  (choice_it.length () == 0))
353  //rej unrecognised blobs
354  word->reject_map[i].setrej_tess_failure ();
355  else if (choice_it.data ()->certainty () < threshold)
356  //rej poor score blob
357  word->reject_map[i].setrej_poor_match ();
358  }
359 }
360 
361 
362 /**********************************************************************
363  * compute_reject_threshold
364  *
365  * Set a rejection threshold for this word.
366  * Initially this is a trivial function which looks for the largest
367  * gap in the certainty value.
368  **********************************************************************/
369 
370 float compute_reject_threshold( //compute threshold //detailed results
371  BLOB_CHOICE_LIST_CLIST *blob_choices) {
372  inT16 index; //to ratings
373  inT16 blob_count; //no of blobs in word
374  inT16 ok_blob_count = 0; //non TESS rej blobs in word
375  float *ratings; //array of confidences
376  float threshold; //rejection threshold
377  float bestgap; //biggest gap
378  float gapstart; //bottom of gap
379  //super iterator
380  BLOB_CHOICE_LIST_C_IT list_it = blob_choices;
381  BLOB_CHOICE_IT choice_it; //real iterator
382 
383  blob_count = blob_choices->length ();
384  ratings = (float *) alloc_mem (blob_count * sizeof (float));
385  for (list_it.mark_cycle_pt (), index = 0;
386  !list_it.cycled_list (); list_it.forward (), index++) {
387  choice_it.set_to_list (list_it.data ());
388  if (choice_it.length () > 0) {
389  ratings[ok_blob_count] = choice_it.data ()->certainty ();
390  //get in an array
391  // tprintf("Rating[%d]=%c %g %g\n",
392  // index,choice_it.data()->char_class(),
393  // choice_it.data()->rating(),choice_it.data()->certainty());
394  ok_blob_count++;
395  }
396  }
397  ASSERT_HOST (index == blob_count);
398  qsort (ratings, ok_blob_count, sizeof (float), sort_floats);
399  //sort them
400  bestgap = 0;
401  gapstart = ratings[0] - 1; //all reject if none better
402  if (ok_blob_count >= 3) {
403  for (index = 0; index < ok_blob_count - 1; index++) {
404  if (ratings[index + 1] - ratings[index] > bestgap) {
405  bestgap = ratings[index + 1] - ratings[index];
406  //find biggest
407  gapstart = ratings[index];
408  }
409  }
410  }
411  threshold = gapstart + bestgap / 2;
412  // tprintf("First=%g, last=%g, gap=%g, threshold=%g\n",
413  // ratings[0],ratings[index],bestgap,threshold);
414 
415  free_mem(ratings);
416  return threshold;
417 }
418 
419 
420 /*************************************************************************
421  * reject_edge_blobs()
422  *
423  * If the word is perilously close to the edge of the image, reject those blobs
424  * in the word which are too close to the edge as they could be clipped.
425  *************************************************************************/
426 namespace tesseract {
428  TBOX word_box = word->word->bounding_box();
429  // Use the box_word as it is already denormed back to image coordinates.
430  int blobcount = word->box_word->length();
431 
432  if (word_box.left() < tessedit_image_border ||
433  word_box.bottom() < tessedit_image_border ||
434  word_box.right() + tessedit_image_border > ImageWidth() - 1 ||
435  word_box.top() + tessedit_image_border > ImageHeight() - 1) {
436  ASSERT_HOST(word->reject_map.length() == blobcount);
437  for (int blobindex = 0; blobindex < blobcount; blobindex++) {
438  TBOX blob_box = word->box_word->BlobBox(blobindex);
439  if (blob_box.left() < tessedit_image_border ||
440  blob_box.bottom() < tessedit_image_border ||
441  blob_box.right() + tessedit_image_border > ImageWidth() - 1 ||
442  blob_box.top() + tessedit_image_border > ImageHeight() - 1) {
443  word->reject_map[blobindex].setrej_edge_char();
444  // Close to edge
445  }
446  }
447  }
448 }
449 
450 /**********************************************************************
451  * one_ell_conflict()
452  *
453  * Identify words where there is a potential I/l/1 error.
454  * - A bundle of contextual heuristics!
455  **********************************************************************/
457  const char *word;
458  const char *lengths;
459  inT16 word_len; //its length
460  inT16 first_alphanum_index_;
461  inT16 first_alphanum_offset_;
462  inT16 i;
463  inT16 offset;
464  BOOL8 non_conflict_set_char; //non conf set a/n?
465  BOOL8 conflict = FALSE;
466  BOOL8 allow_1s;
467  ACCEPTABLE_WERD_TYPE word_type;
468  BOOL8 dict_perm_type;
469  BOOL8 dict_word_ok;
470  int dict_word_type;
471 
472  word = word_res->best_choice->unichar_string().string ();
473  lengths = word_res->best_choice->unichar_lengths().string();
474  word_len = strlen (lengths);
475  /*
476  If there are no occurrences of the conflict set characters then the word
477  is OK.
478  */
479  if (strpbrk (word, conflict_set_I_l_1.string ()) == NULL)
480  return FALSE;
481 
482  /*
483  There is a conflict if there are NO other (confirmed) alphanumerics apart
484  from those in the conflict set.
485  */
486 
487  for (i = 0, offset = 0, non_conflict_set_char = FALSE;
488  (i < word_len) && !non_conflict_set_char; offset += lengths[i++])
489  non_conflict_set_char =
490  (word_res->uch_set->get_isalpha(word + offset, lengths[i]) ||
491  word_res->uch_set->get_isdigit(word + offset, lengths[i])) &&
492  !STRING (conflict_set_I_l_1).contains (word[offset]);
493  if (!non_conflict_set_char) {
494  if (update_map)
495  reject_I_1_L(word_res);
496  return TRUE;
497  }
498 
499  /*
500  If the word is accepted by a dawg permuter, and the first alpha character
501  is "I" or "l", check to see if the alternative is also a dawg word. If it
502  is, then there is a potential error otherwise the word is ok.
503  */
504 
505  dict_perm_type = (word_res->best_choice->permuter () == SYSTEM_DAWG_PERM) ||
506  (word_res->best_choice->permuter () == USER_DAWG_PERM) ||
508  (word_res->best_choice->permuter () == DOC_DAWG_PERM)) ||
509  (word_res->best_choice->permuter () == FREQ_DAWG_PERM);
510  dict_word_type = dict_word(*(word_res->best_choice));
511  dict_word_ok = (dict_word_type > 0) &&
512  (rej_trust_doc_dawg || (dict_word_type != DOC_DAWG_PERM));
513 
514  if ((rej_1Il_use_dict_word && dict_word_ok) ||
515  (rej_1Il_trust_permuter_type && dict_perm_type) ||
516  (dict_perm_type && dict_word_ok)) {
517  first_alphanum_index_ = first_alphanum_index (word, lengths);
518  first_alphanum_offset_ = first_alphanum_offset (word, lengths);
519  if (lengths[first_alphanum_index_] == 1 &&
520  word[first_alphanum_offset_] == 'I') {
521  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
522  if (safe_dict_word(word_res) > 0) {
523  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
524  if (update_map)
525  word_res->reject_map[first_alphanum_index_].
526  setrej_1Il_conflict();
527  return TRUE;
528  }
529  else {
530  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
531  return FALSE;
532  }
533  }
534 
535  if (lengths[first_alphanum_index_] == 1 &&
536  word[first_alphanum_offset_] == 'l') {
537  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
538  if (safe_dict_word(word_res) > 0) {
539  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
540  if (update_map)
541  word_res->reject_map[first_alphanum_index_].
542  setrej_1Il_conflict();
543  return TRUE;
544  }
545  else {
546  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
547  return FALSE;
548  }
549  }
550  return FALSE;
551  }
552 
553  /*
554  NEW 1Il code. The old code relied on permuter types too much. In fact,
555  tess will use TOP_CHOICE permute for good things like "palette".
556  In this code the string is examined independently to see if it looks like
557  a well formed word.
558  */
559 
560  /*
561  REGARDLESS OF PERMUTER, see if flipping a leading I/l generates a
562  dictionary word.
563  */
564  first_alphanum_index_ = first_alphanum_index (word, lengths);
565  first_alphanum_offset_ = first_alphanum_offset (word, lengths);
566  if (lengths[first_alphanum_index_] == 1 &&
567  word[first_alphanum_offset_] == 'l') {
568  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
569  if (safe_dict_word(word_res) > 0)
570  return FALSE;
571  else
572  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
573  }
574  else if (lengths[first_alphanum_index_] == 1 &&
575  word[first_alphanum_offset_] == 'I') {
576  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
577  if (safe_dict_word(word_res) > 0)
578  return FALSE;
579  else
580  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
581  }
582  /*
583  For strings containing digits:
584  If there are no alphas OR the numeric permuter liked the word,
585  reject any non 1 conflict chs
586  Else reject all conflict chs
587  */
588  if (word_contains_non_1_digit (word, lengths)) {
589  allow_1s = (alpha_count (word, lengths) == 0) ||
590  (word_res->best_choice->permuter () == NUMBER_PERM);
591 
592  inT16 offset;
593  conflict = FALSE;
594  for (i = 0, offset = 0; word[offset] != '\0';
595  offset += word_res->best_choice->unichar_lengths()[i++]) {
596  if ((!allow_1s || (word[offset] != '1')) &&
597  STRING (conflict_set_I_l_1).contains (word[offset])) {
598  if (update_map)
599  word_res->reject_map[i].setrej_1Il_conflict ();
600  conflict = TRUE;
601  }
602  }
603  return conflict;
604  }
605  /*
606  For anything else. See if it conforms to an acceptable word type. If so,
607  treat accordingly.
608  */
609  word_type = acceptable_word_string(*word_res->uch_set, word, lengths);
610  if ((word_type == AC_LOWER_CASE) || (word_type == AC_INITIAL_CAP)) {
611  first_alphanum_index_ = first_alphanum_index (word, lengths);
612  first_alphanum_offset_ = first_alphanum_offset (word, lengths);
613  if (STRING (conflict_set_I_l_1).contains (word[first_alphanum_offset_])) {
614  if (update_map)
615  word_res->reject_map[first_alphanum_index_].
616  setrej_1Il_conflict ();
617  return TRUE;
618  }
619  else
620  return FALSE;
621  }
622  else if (word_type == AC_UPPER_CASE) {
623  return FALSE;
624  }
625  else {
626  if (update_map)
627  reject_I_1_L(word_res);
628  return TRUE;
629  }
630 }
631 
632 
634  const char *word_lengths) {
635  inT16 i;
636  inT16 offset;
637 
638  for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
639  if (unicharset.get_isalpha(word + offset, word_lengths[i]) ||
640  unicharset.get_isdigit(word + offset, word_lengths[i]))
641  return i;
642  }
643  return -1;
644 }
645 
647  const char *word_lengths) {
648  inT16 i;
649  inT16 offset;
650 
651  for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
652  if (unicharset.get_isalpha(word + offset, word_lengths[i]) ||
653  unicharset.get_isdigit(word + offset, word_lengths[i]))
654  return offset;
655  }
656  return -1;
657 }
658 
659 inT16 Tesseract::alpha_count(const char *word,
660  const char *word_lengths) {
661  inT16 i;
662  inT16 offset;
663  inT16 count = 0;
664 
665  for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
666  if (unicharset.get_isalpha (word + offset, word_lengths[i]))
667  count++;
668  }
669  return count;
670 }
671 
672 
674  const char *word_lengths) {
675  inT16 i;
676  inT16 offset;
677 
678  for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
679  if (unicharset.get_isdigit (word + offset, word_lengths[i]) &&
680  (word_lengths[i] != 1 || word[offset] != '1'))
681  return TRUE;
682  }
683  return FALSE;
684 }
685 
686 
687 BOOL8 Tesseract::test_ambig_word( //test for ambiguity
688  WERD_RES *word) {
689  BOOL8 ambig = FALSE;
690 
691  if ((word->best_choice->permuter () == SYSTEM_DAWG_PERM) ||
692  (word->best_choice->permuter () == FREQ_DAWG_PERM) ||
693  (word->best_choice->permuter () == USER_DAWG_PERM)) {
694  ambig = !getDict().NoDangerousAmbig(
695  word->best_choice, NULL, false, NULL, NULL);
696  }
697  return ambig;
698 }
699 
700 
701 /*************************************************************************
702  * dont_allow_1Il()
703  * Dont unreject LONE accepted 1Il conflict set chars
704  *************************************************************************/
706  int i = 0;
707  int offset;
708  int word_len = word->reject_map.length();
709  const char *s = word->best_choice->unichar_string().string();
710  const char *lengths = word->best_choice->unichar_lengths().string();
711  BOOL8 accepted_1Il = FALSE;
712 
713  for (i = 0, offset = 0; i < word_len;
714  offset += word->best_choice->unichar_lengths()[i++]) {
715  if (word->reject_map[i].accepted()) {
716  if (STRING(conflict_set_I_l_1).contains(s[offset])) {
717  accepted_1Il = TRUE;
718  } else {
719  if (word->uch_set->get_isalpha(s + offset, lengths[i]) ||
720  word->uch_set->get_isdigit(s + offset, lengths[i]))
721  return; // >=1 non 1Il ch accepted
722  }
723  }
724  }
725  if (!accepted_1Il)
726  return; //Nothing to worry about
727 
728  for (i = 0, offset = 0; i < word_len;
729  offset += word->best_choice->unichar_lengths()[i++]) {
730  if (STRING(conflict_set_I_l_1).contains(s[offset]) &&
731  word->reject_map[i].accepted())
732  word->reject_map[i].setrej_postNN_1Il();
733  }
734 }
735 
736 
738  int count = 0;
739  const WERD_CHOICE *best_choice = word_res->best_choice;
740  for (int i = 0; i < word_res->reject_map.length(); ++i) {
741  if ((word_res->reject_map[i].accepted()) &&
742  (word_res->uch_set->get_isalpha(best_choice->unichar_id(i)) ||
743  word_res->uch_set->get_isdigit(best_choice->unichar_id(i)))) {
744  count++;
745  }
746  }
747  return count;
748 }
749 
750 
751 // reject all if most rejected.
753  /* Reject the whole of the word if the fraction of rejects exceeds a limit */
754 
755  if ((float) word->reject_map.reject_count() / word->reject_map.length() >=
758 }
759 
760 
762  inT16 char_quality;
763  inT16 accepted_char_quality;
764 
765  if (word->best_choice->unichar_lengths().length() <= 1)
766  return FALSE;
767 
769  contains(word->best_choice->unichar_string()[0]))
770  return FALSE;
771 
772  UNICHAR_ID uch_id = word->best_choice->unichar_id(0);
773  for (int i = 1; i < word->best_choice->length(); ++i) {
774  if (word->best_choice->unichar_id(i) != uch_id) return FALSE;
775  }
776 
777  word_char_quality(word, row, &char_quality, &accepted_char_quality);
778 
779  if ((word->best_choice->unichar_lengths().length () == char_quality) &&
780  (char_quality == accepted_char_quality))
781  return TRUE;
782  else
783  return FALSE;
784 }
785 
787  const WERD_CHOICE &word = *werd_res->best_choice;
788  int dict_word_type = werd_res->tesseract->dict_word(word);
789  return dict_word_type == DOC_DAWG_PERM ? 0 : dict_word_type;
790 }
791 
792 // Note: After running this function word_res->best_choice->blob_choices()
793 // might not contain the right BLOB_CHOICE corresponding to each character
794 // in word_res->best_choice. However, the length of blob_choices and
795 // word_res->best_choice will remain the same.
797  WERD_CHOICE *best_choice = word_res->best_choice;
798  int i;
799  int prev_right = -9999;
800  int next_left;
801  TBOX out_box;
802  float aspect_ratio;
803 
805  return;
806 
807  TBLOB* blob = word_res->rebuild_word->blobs;
808  UNICHAR_ID unichar_dash = word_res->uch_set->unichar_to_id("-");
809  bool modified = false;
810  for (i = 0; i < best_choice->length() && blob != NULL; ++i,
811  blob = blob->next) {
812  out_box = blob->bounding_box();
813  if (blob->next == NULL)
814  next_left = 9999;
815  else
816  next_left = blob->next->bounding_box().left();
817  // Dont touch small or touching blobs - it is too dangerous.
818  if ((out_box.width() > 8 * word_res->denorm.x_scale()) &&
819  (out_box.left() > prev_right) && (out_box.right() < next_left)) {
820  aspect_ratio = out_box.width() / (float) out_box.height();
821  if (word_res->uch_set->eq(best_choice->unichar_id(i), ".")) {
822  if (aspect_ratio >= tessedit_upper_flip_hyphen &&
823  word_res->uch_set->contains_unichar_id(unichar_dash) &&
824  word_res->uch_set->get_enabled(unichar_dash)) {
825  /* Certain HYPHEN */
826  best_choice->set_unichar_id(unichar_dash, i);
827  modified = true;
828  if (word_res->reject_map[i].rejected())
829  word_res->reject_map[i].setrej_hyphen_accept();
830  }
831  if ((aspect_ratio > tessedit_lower_flip_hyphen) &&
832  word_res->reject_map[i].accepted())
833  //Suspected HYPHEN
834  word_res->reject_map[i].setrej_hyphen ();
835  }
836  else if (best_choice->unichar_id(i) == unichar_dash) {
837  if ((aspect_ratio >= tessedit_upper_flip_hyphen) &&
838  (word_res->reject_map[i].rejected()))
839  word_res->reject_map[i].setrej_hyphen_accept();
840  //Certain HYPHEN
841 
842  if ((aspect_ratio <= tessedit_lower_flip_hyphen) &&
843  (word_res->reject_map[i].accepted()))
844  //Suspected HYPHEN
845  word_res->reject_map[i].setrej_hyphen();
846  }
847  }
848  prev_right = out_box.right();
849  }
850 }
851 
852 // Note: After running this function word_res->best_choice->blob_choices()
853 // might not contain the right BLOB_CHOICE coresponding to each character
854 // in word_res->best_choice. However, the length of blob_choices and
855 // word_res->best_choice will remain the same.
856 void Tesseract::flip_0O(WERD_RES *word_res) {
857  WERD_CHOICE *best_choice = word_res->best_choice;
858  int i;
859  TBOX out_box;
860 
861  if (!tessedit_flip_0O)
862  return;
863 
864  TBLOB* blob = word_res->rebuild_word->blobs;
865  for (i = 0; i < best_choice->length() && blob != NULL; ++i,
866  blob = blob->next) {
867  if (word_res->uch_set->get_isupper(best_choice->unichar_id(i)) ||
868  word_res->uch_set->get_isdigit(best_choice->unichar_id(i))) {
869  out_box = blob->bounding_box();
870  if ((out_box.top() < kBlnBaselineOffset + kBlnXHeight) ||
871  (out_box.bottom() > kBlnBaselineOffset + kBlnXHeight / 4))
872  return; //Beware words with sub/superscripts
873  }
874  }
875  UNICHAR_ID unichar_0 = word_res->uch_set->unichar_to_id("0");
876  UNICHAR_ID unichar_O = word_res->uch_set->unichar_to_id("O");
877  if (unichar_0 == INVALID_UNICHAR_ID ||
878  !word_res->uch_set->get_enabled(unichar_0) ||
879  unichar_O == INVALID_UNICHAR_ID ||
880  !word_res->uch_set->get_enabled(unichar_O)) {
881  return; // 0 or O are not present/enabled in unicharset
882  }
883  bool modified = false;
884  for (i = 1; i < best_choice->length(); ++i) {
885  if (best_choice->unichar_id(i) == unichar_0 ||
886  best_choice->unichar_id(i) == unichar_O) {
887  /* A0A */
888  if ((i+1) < best_choice->length() &&
889  non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
890  non_O_upper(*word_res->uch_set, best_choice->unichar_id(i+1))) {
891  best_choice->set_unichar_id(unichar_O, i);
892  modified = true;
893  }
894  /* A00A */
895  if (non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
896  (i+1) < best_choice->length() &&
897  (best_choice->unichar_id(i+1) == unichar_0 ||
898  best_choice->unichar_id(i+1) == unichar_O) &&
899  (i+2) < best_choice->length() &&
900  non_O_upper(*word_res->uch_set, best_choice->unichar_id(i+2))) {
901  best_choice->set_unichar_id(unichar_O, i);
902  modified = true;
903  i++;
904  }
905  /* AA0<non digit or end of word> */
906  if ((i > 1) &&
907  non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-2)) &&
908  non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
909  (((i+1) < best_choice->length() &&
910  !word_res->uch_set->get_isdigit(best_choice->unichar_id(i+1)) &&
911  !word_res->uch_set->eq(best_choice->unichar_id(i+1), "l") &&
912  !word_res->uch_set->eq(best_choice->unichar_id(i+1), "I")) ||
913  (i == best_choice->length() - 1))) {
914  best_choice->set_unichar_id(unichar_O, i);
915  modified = true;
916  }
917  /* 9O9 */
918  if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
919  (i+1) < best_choice->length() &&
920  non_0_digit(*word_res->uch_set, best_choice->unichar_id(i+1))) {
921  best_choice->set_unichar_id(unichar_0, i);
922  modified = true;
923  }
924  /* 9OOO */
925  if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
926  (i+2) < best_choice->length() &&
927  (best_choice->unichar_id(i+1) == unichar_0 ||
928  best_choice->unichar_id(i+1) == unichar_O) &&
929  (best_choice->unichar_id(i+2) == unichar_0 ||
930  best_choice->unichar_id(i+2) == unichar_O)) {
931  best_choice->set_unichar_id(unichar_0, i);
932  best_choice->set_unichar_id(unichar_0, i+1);
933  best_choice->set_unichar_id(unichar_0, i+2);
934  modified = true;
935  i += 2;
936  }
937  /* 9OO<non upper> */
938  if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
939  (i+2) < best_choice->length() &&
940  (best_choice->unichar_id(i+1) == unichar_0 ||
941  best_choice->unichar_id(i+1) == unichar_O) &&
942  !word_res->uch_set->get_isupper(best_choice->unichar_id(i+2))) {
943  best_choice->set_unichar_id(unichar_0, i);
944  best_choice->set_unichar_id(unichar_0, i+1);
945  modified = true;
946  i++;
947  }
948  /* 9O<non upper> */
949  if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
950  (i+1) < best_choice->length() &&
951  !word_res->uch_set->get_isupper(best_choice->unichar_id(i+1))) {
952  best_choice->set_unichar_id(unichar_0, i);
953  }
954  /* 9[.,]OOO.. */
955  if ((i > 1) &&
956  (word_res->uch_set->eq(best_choice->unichar_id(i-1), ".") ||
957  word_res->uch_set->eq(best_choice->unichar_id(i-1), ",")) &&
958  (word_res->uch_set->get_isdigit(best_choice->unichar_id(i-2)) ||
959  best_choice->unichar_id(i-2) == unichar_O)) {
960  if (best_choice->unichar_id(i-2) == unichar_O) {
961  best_choice->set_unichar_id(unichar_0, i-2);
962  modified = true;
963  }
964  while (i < best_choice->length() &&
965  (best_choice->unichar_id(i) == unichar_O ||
966  best_choice->unichar_id(i) == unichar_0)) {
967  best_choice->set_unichar_id(unichar_0, i);
968  modified = true;
969  i++;
970  }
971  i--;
972  }
973  }
974  }
975 }
976 
977 BOOL8 Tesseract::non_O_upper(const UNICHARSET& ch_set, UNICHAR_ID unichar_id) {
978  return ch_set.get_isupper(unichar_id) && !ch_set.eq(unichar_id, "O");
979 }
980 
981 BOOL8 Tesseract::non_0_digit(const UNICHARSET& ch_set, UNICHAR_ID unichar_id) {
982  return ch_set.get_isdigit(unichar_id) && !ch_set.eq(unichar_id, "0");
983 }
984 } // namespace tesseract