tesseract  3.04.00
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
applybox.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: applybox.cpp (Formerly applybox.c)
3  * Description: Re segment rows according to box file data
4  * Author: Phil Cheatle
5  * Created: Wed Nov 24 09:11:23 GMT 1993
6  *
7  * (C) Copyright 1993, Hewlett-Packard Ltd.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  **********************************************************************/
19 
20 #ifdef _MSC_VER
21 #pragma warning(disable:4244) // Conversion warnings
22 #endif
23 
24 #include <ctype.h>
25 #include <string.h>
26 #ifdef __UNIX__
27 #include <assert.h>
28 #include <errno.h>
29 #endif
30 #include "allheaders.h"
31 #include "boxread.h"
32 #include "chopper.h"
33 #include "pageres.h"
34 #include "unichar.h"
35 #include "unicharset.h"
36 #include "tesseractclass.h"
37 #include "genericvector.h"
38 
// Upper bound on how many adjacent blobs FindSegmentation classifies as a
// single group.
const int kMaxGroupSize = 4;
// A row's x-height may differ from the page median by at most this fraction
// of the median; rows outside that band are reset to the median.
const double kMaxXHeightDeviationFraction = 0.125;
44 
45 /*************************************************************************
46  * The box file is assumed to contain box definitions, one per line, of the
47  * following format for blob-level boxes:
48  * <UTF8 str> <left> <bottom> <right> <top> <page id>
49  * and for word/line-level boxes:
50  * WordStr <left> <bottom> <right> <top> <page id> #<space-delimited word str>
51  * NOTES:
52  * The boxes use tesseract coordinates, i.e. 0,0 is at BOTTOM-LEFT.
53  *
54  * <page id> is 0-based, and the page number is used for multipage input (tiff).
55  *
56  * In the blob-level form, each line represents a recognizable unit, which may
57  * be several UTF-8 bytes, but there is a bounding box around each recognizable
58  * unit, and no classifier is needed to train in this mode (bootstrapping.)
59  *
60  * In the word/line-level form, the line begins with the literal "WordStr", and
61  * the bounding box bounds either a whole line or a whole word. The recognizable
62  * units in the word/line are listed after the # at the end of the line and
63  * are space delimited, ignoring any original spaces on the line.
64  * Eg.
65  * word -> #w o r d
66  * multi word line -> #m u l t i w o r d l i n e
67  * The recognizable units must be space-delimited in order to allow multiple
68  * unicodes to be used for a single recognizable unit, eg Hindi.
69  * In this mode, the classifier must have been pre-trained with the desired
70  * character set, or it will not be able to find the character segmentations.
71  *************************************************************************/
72 
73 namespace tesseract {
74 
75 static void clear_any_old_text(BLOCK_LIST *block_list) {
76  BLOCK_IT block_it(block_list);
77  for (block_it.mark_cycle_pt();
78  !block_it.cycled_list(); block_it.forward()) {
79  ROW_IT row_it(block_it.data()->row_list());
80  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
81  WERD_IT word_it(row_it.data()->word_list());
82  for (word_it.mark_cycle_pt();
83  !word_it.cycled_list(); word_it.forward()) {
84  word_it.data()->set_text("");
85  }
86  }
87  }
88 }
89 
90 // Applies the box file based on the image name fname, and resegments
91 // the words in the block_list (page), with:
92 // blob-mode: one blob per line in the box file, words as input.
93 // word/line-mode: one blob per space-delimited unit after the #, and one word
94 // per line in the box file. (See comment above for box file format.)
95 // If find_segmentation is true, (word/line mode) then the classifier is used
96 // to re-segment words/lines to match the space-delimited truth string for
97 // each box. In this case, the input box may be for a word or even a whole
98 // text line, and the output words will contain multiple blobs corresponding
99 // to the space-delimited input string.
100 // With find_segmentation false, no classifier is needed, but the chopper
101 // can still be used to correctly segment touching characters with the help
102 // of the input boxes.
103 // In the returned PAGE_RES, the WERD_RES are setup as they would be returned
104 // from normal classification, ie. with a word, chopped_word, rebuild_word,
105 // seam_array, denorm, box_word, and best_state, but NO best_choice or
106 // raw_choice, as they would require a UNICHARSET, which we aim to avoid.
107 // Instead, the correct_text member of WERD_RES is set, and this may be later
108 // converted to a best_choice using CorrectClassifyWords. CorrectClassifyWords
109 // is not required before calling ApplyBoxTraining.
111  bool find_segmentation,
112  BLOCK_LIST *block_list) {
113  GenericVector<TBOX> boxes;
114  GenericVector<STRING> texts, full_texts;
115  if (!ReadAllBoxes(applybox_page, true, fname, &boxes, &texts, &full_texts,
116  NULL)) {
117  return NULL; // Can't do it.
118  }
119 
120  int box_count = boxes.size();
121  int box_failures = 0;
122  // Add an empty everything to the end.
123  boxes.push_back(TBOX());
124  texts.push_back(STRING());
125  full_texts.push_back(STRING());
126 
127  // In word mode, we use the boxes to make a word for each box, but
128  // in blob mode we use the existing words and maximally chop them first.
129  PAGE_RES* page_res = find_segmentation ?
130  NULL : SetupApplyBoxes(boxes, block_list);
131  clear_any_old_text(block_list);
132 
133  for (int i = 0; i < boxes.size() - 1; i++) {
134  bool foundit = false;
135  if (page_res != NULL) {
136  if (i == 0) {
137  foundit = ResegmentCharBox(page_res, NULL, boxes[i], boxes[i + 1],
138  full_texts[i].string());
139  } else {
140  foundit = ResegmentCharBox(page_res, &boxes[i-1], boxes[i],
141  boxes[i + 1], full_texts[i].string());
142  }
143  } else {
144  foundit = ResegmentWordBox(block_list, boxes[i], boxes[i + 1],
145  texts[i].string());
146  }
147  if (!foundit) {
148  box_failures++;
149  ReportFailedBox(i, boxes[i], texts[i].string(),
150  "FAILURE! Couldn't find a matching blob");
151  }
152  }
153 
154  if (page_res == NULL) {
155  // In word/line mode, we now maximally chop all the words and resegment
156  // them with the classifier.
157  page_res = SetupApplyBoxes(boxes, block_list);
158  ReSegmentByClassification(page_res);
159  }
160  if (applybox_debug > 0) {
161  tprintf("APPLY_BOXES:\n");
162  tprintf(" Boxes read from boxfile: %6d\n", box_count);
163  if (box_failures > 0)
164  tprintf(" Boxes failed resegmentation: %6d\n", box_failures);
165  }
166  TidyUp(page_res);
167  return page_res;
168 }
169 
170 // Helper computes median xheight in the image.
171 static double MedianXHeight(BLOCK_LIST *block_list) {
172  BLOCK_IT block_it(block_list);
173  STATS xheights(0, block_it.data()->bounding_box().height());
174  for (block_it.mark_cycle_pt();
175  !block_it.cycled_list(); block_it.forward()) {
176  ROW_IT row_it(block_it.data()->row_list());
177  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
178  xheights.add(IntCastRounded(row_it.data()->x_height()), 1);
179  }
180  }
181  return xheights.median();
182 }
183 
184 // Any row xheight that is significantly different from the median is set
185 // to the median.
186 void Tesseract::PreenXHeights(BLOCK_LIST *block_list) {
187  double median_xheight = MedianXHeight(block_list);
188  double max_deviation = kMaxXHeightDeviationFraction * median_xheight;
189  // Strip all fuzzy space markers to simplify the PAGE_RES.
190  BLOCK_IT b_it(block_list);
191  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
192  BLOCK* block = b_it.data();
193  ROW_IT r_it(block->row_list());
194  for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward ()) {
195  ROW* row = r_it.data();
196  float diff = fabs(row->x_height() - median_xheight);
197  if (diff > max_deviation) {
198  if (applybox_debug) {
199  tprintf("row xheight=%g, but median xheight = %g\n",
200  row->x_height(), median_xheight);
201  }
202  row->set_x_height(static_cast<float>(median_xheight));
203  }
204  }
205  }
206 }
207 
208 // Builds a PAGE_RES from the block_list in the way required for ApplyBoxes:
209 // All fuzzy spaces are removed, and all the words are maximally chopped.
211  BLOCK_LIST *block_list) {
212  PreenXHeights(block_list);
213  // Strip all fuzzy space markers to simplify the PAGE_RES.
214  BLOCK_IT b_it(block_list);
215  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
216  BLOCK* block = b_it.data();
217  ROW_IT r_it(block->row_list());
218  for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward ()) {
219  ROW* row = r_it.data();
220  WERD_IT w_it(row->word_list());
221  for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
222  WERD* word = w_it.data();
223  if (word->cblob_list()->empty()) {
224  delete w_it.extract();
225  } else {
226  word->set_flag(W_FUZZY_SP, false);
227  word->set_flag(W_FUZZY_NON, false);
228  }
229  }
230  }
231  }
232  PAGE_RES* page_res = new PAGE_RES(false, block_list, NULL);
233  PAGE_RES_IT pr_it(page_res);
234  WERD_RES* word_res;
235  while ((word_res = pr_it.word()) != NULL) {
236  MaximallyChopWord(boxes, pr_it.block()->block,
237  pr_it.row()->row, word_res);
238  pr_it.forward();
239  }
240  return page_res;
241 }
242 
243 // Tests the chopper by exhaustively running chop_one_blob.
244 // The word_res will contain filled chopped_word, seam_array, denorm,
245 // box_word and best_state for the maximally chopped word.
247  BLOCK* block, ROW* row,
248  WERD_RES* word_res) {
249  if (!word_res->SetupForRecognition(unicharset, this, BestPix(),
254  row, block)) {
255  word_res->CloneChoppedToRebuild();
256  return;
257  }
258  if (chop_debug) {
259  tprintf("Maximally chopping word at:");
260  word_res->word->bounding_box().print();
261  }
262  GenericVector<BLOB_CHOICE*> blob_choices;
263  ASSERT_HOST(!word_res->chopped_word->blobs.empty());
264  float rating = static_cast<float>(MAX_INT8);
265  for (int i = 0; i < word_res->chopped_word->NumBlobs(); ++i) {
266  // The rating and certainty are not quite arbitrary. Since
267  // select_blob_to_chop uses the worst certainty to choose, they all have
268  // to be different, so starting with MAX_INT8, subtract 1/8 for each blob
269  // in here, and then divide by e each time they are chopped, which
270  // should guarantee a set of unequal values for the whole tree of blobs
271  // produced, however much chopping is required. The chops are thus only
272  // limited by the ability of the chopper to find suitable chop points,
273  // and not by the value of the certainties.
274  BLOB_CHOICE* choice =
275  new BLOB_CHOICE(0, rating, -rating, -1, 0.0f, 0.0f, 0.0f, BCC_FAKE);
276  blob_choices.push_back(choice);
277  rating -= 0.125f;
278  }
279  const double e = exp(1.0); // The base of natural logs.
280  int blob_number;
281  int right_chop_index = 0;
283  // We only chop if the language is not fixed pitch like CJK.
284  SEAM* seam = NULL;
285  while ((seam = chop_one_blob(boxes, blob_choices, word_res,
286  &blob_number)) != NULL) {
287  word_res->InsertSeam(blob_number, seam);
288  BLOB_CHOICE* left_choice = blob_choices[blob_number];
289  rating = left_choice->rating() / e;
290  left_choice->set_rating(rating);
291  left_choice->set_certainty(-rating);
292  // combine confidence w/ serial #
293  BLOB_CHOICE* right_choice = new BLOB_CHOICE(++right_chop_index,
294  rating - 0.125f, -rating, -1,
295  0.0f, 0.0f, 0.0f, BCC_FAKE);
296  blob_choices.insert(right_choice, blob_number + 1);
297  }
298  }
299  word_res->CloneChoppedToRebuild();
300  word_res->FakeClassifyWord(blob_choices.size(), &blob_choices[0]);
301 }
302 
303 // Helper to compute the dispute resolution metric.
304 // Disputed blob resolution. The aim is to give the blob to the most
305 // appropriate boxfile box. Most of the time it is obvious, but if
306 // two boxfile boxes overlap significantly it is not. If a small boxfile
307 // box takes most of the blob, and a large boxfile box does too, then
308 // we want the small boxfile box to get it, but if the small box
309 // is much smaller than the blob, we don't want it to get it.
310 // Details of the disputed blob resolution:
311 // Given a box with area A, and a blob with area B, with overlap area C,
312 // then the miss metric is (A-C)(B-C)/(AB) and the box with minimum
313 // miss metric gets the blob.
314 static double BoxMissMetric(const TBOX& box1, const TBOX& box2) {
315  int overlap_area = box1.intersection(box2).area();
316  double miss_metric = box1.area()- overlap_area;
317  miss_metric /= box1.area();
318  miss_metric *= box2.area() - overlap_area;
319  miss_metric /= box2.area();
320  return miss_metric;
321 }
322 
323 // Gather consecutive blobs that match the given box into the best_state
324 // and corresponding correct_text.
325 // Fights over which box owns which blobs are settled by pre-chopping and
326 // applying the blobs to box or next_box with the least non-overlap.
327 // Returns false if the box was in error, which can only be caused by
328 // failing to find an appropriate blob for a box.
329 // This means that occasionally, blobs may be incorrectly segmented if the
330 // chopper fails to find a suitable chop point.
331 bool Tesseract::ResegmentCharBox(PAGE_RES* page_res, const TBOX *prev_box,
332  const TBOX& box, const TBOX& next_box,
333  const char* correct_text) {
334  if (applybox_debug > 1) {
335  tprintf("\nAPPLY_BOX: in ResegmentCharBox() for %s\n", correct_text);
336  }
337  PAGE_RES_IT page_res_it(page_res);
338  WERD_RES* word_res;
339  for (word_res = page_res_it.word(); word_res != NULL;
340  word_res = page_res_it.forward()) {
341  if (!word_res->box_word->bounding_box().major_overlap(box))
342  continue;
343  if (applybox_debug > 1) {
344  tprintf("Checking word box:");
345  word_res->box_word->bounding_box().print();
346  }
347  int word_len = word_res->box_word->length();
348  for (int i = 0; i < word_len; ++i) {
349  TBOX char_box = TBOX();
350  int blob_count = 0;
351  for (blob_count = 0; i + blob_count < word_len; ++blob_count) {
352  TBOX blob_box = word_res->box_word->BlobBox(i + blob_count);
353  if (!blob_box.major_overlap(box))
354  break;
355  if (word_res->correct_text[i + blob_count].length() > 0)
356  break; // Blob is claimed already.
357  double current_box_miss_metric = BoxMissMetric(blob_box, box);
358  double next_box_miss_metric = BoxMissMetric(blob_box, next_box);
359  if (applybox_debug > 2) {
360  tprintf("Checking blob:");
361  blob_box.print();
362  tprintf("Current miss metric = %g, next = %g\n",
363  current_box_miss_metric, next_box_miss_metric);
364  }
365  if (current_box_miss_metric > next_box_miss_metric)
366  break; // Blob is a better match for next box.
367  char_box += blob_box;
368  }
369  if (blob_count > 0) {
370  if (applybox_debug > 1) {
371  tprintf("Index [%d, %d) seem good.\n", i, i + blob_count);
372  }
373  if (!char_box.almost_equal(box, 3) &&
374  (box.x_gap(next_box) < -3 ||
375  (prev_box != NULL && prev_box->x_gap(box) < -3))) {
376  return false;
377  }
378  // We refine just the box_word, best_state and correct_text here.
379  // The rebuild_word is made in TidyUp.
380  // blob_count blobs are put together to match the box. Merge the
381  // box_word boxes, save the blob_count in the state and the text.
382  word_res->box_word->MergeBoxes(i, i + blob_count);
383  word_res->best_state[i] = blob_count;
384  word_res->correct_text[i] = correct_text;
385  if (applybox_debug > 2) {
386  tprintf("%d Blobs match: blob box:", blob_count);
387  word_res->box_word->BlobBox(i).print();
388  tprintf("Matches box:");
389  box.print();
390  tprintf("With next box:");
391  next_box.print();
392  }
393  // Eliminated best_state and correct_text entries for the consumed
394  // blobs.
395  for (int j = 1; j < blob_count; ++j) {
396  word_res->best_state.remove(i + 1);
397  word_res->correct_text.remove(i + 1);
398  }
399  // Assume that no box spans multiple source words, so we are done with
400  // this box.
401  if (applybox_debug > 1) {
402  tprintf("Best state = ");
403  for (int j = 0; j < word_res->best_state.size(); ++j) {
404  tprintf("%d ", word_res->best_state[j]);
405  }
406  tprintf("\n");
407  tprintf("Correct text = [[ ");
408  for (int j = 0; j < word_res->correct_text.size(); ++j) {
409  tprintf("%s ", word_res->correct_text[j].string());
410  }
411  tprintf("]]\n");
412  }
413  return true;
414  }
415  }
416  }
417  if (applybox_debug > 0) {
418  tprintf("FAIL!\n");
419  }
420  return false; // Failure.
421 }
422 
423 // Consume all source blobs that strongly overlap the given box,
424 // putting them into a new word, with the correct_text label.
425 // Fights over which box owns which blobs are settled by
426 // applying the blobs to box or next_box with the least non-overlap.
427 // Returns false if the box was in error, which can only be caused by
428 // failing to find an overlapping blob for a box.
429 bool Tesseract::ResegmentWordBox(BLOCK_LIST *block_list,
430  const TBOX& box, const TBOX& next_box,
431  const char* correct_text) {
432  if (applybox_debug > 1) {
433  tprintf("\nAPPLY_BOX: in ResegmentWordBox() for %s\n", correct_text);
434  }
435  WERD* new_word = NULL;
436  BLOCK_IT b_it(block_list);
437  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
438  BLOCK* block = b_it.data();
439  if (!box.major_overlap(block->bounding_box()))
440  continue;
441  ROW_IT r_it(block->row_list());
442  for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) {
443  ROW* row = r_it.data();
444  if (!box.major_overlap(row->bounding_box()))
445  continue;
446  WERD_IT w_it(row->word_list());
447  for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
448  WERD* word = w_it.data();
449  if (applybox_debug > 2) {
450  tprintf("Checking word:");
451  word->bounding_box().print();
452  }
453  if (word->text() != NULL && word->text()[0] != '\0')
454  continue; // Ignore words that are already done.
455  if (!box.major_overlap(word->bounding_box()))
456  continue;
457  C_BLOB_IT blob_it(word->cblob_list());
458  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list();
459  blob_it.forward()) {
460  C_BLOB* blob = blob_it.data();
461  TBOX blob_box = blob->bounding_box();
462  if (!blob_box.major_overlap(box))
463  continue;
464  double current_box_miss_metric = BoxMissMetric(blob_box, box);
465  double next_box_miss_metric = BoxMissMetric(blob_box, next_box);
466  if (applybox_debug > 2) {
467  tprintf("Checking blob:");
468  blob_box.print();
469  tprintf("Current miss metric = %g, next = %g\n",
470  current_box_miss_metric, next_box_miss_metric);
471  }
472  if (current_box_miss_metric > next_box_miss_metric)
473  continue; // Blob is a better match for next box.
474  if (applybox_debug > 2) {
475  tprintf("Blob match: blob:");
476  blob_box.print();
477  tprintf("Matches box:");
478  box.print();
479  tprintf("With next box:");
480  next_box.print();
481  }
482  if (new_word == NULL) {
483  // Make a new word with a single blob.
484  new_word = word->shallow_copy();
485  new_word->set_text(correct_text);
486  w_it.add_to_end(new_word);
487  }
488  C_BLOB_IT new_blob_it(new_word->cblob_list());
489  new_blob_it.add_to_end(blob_it.extract());
490  }
491  }
492  }
493  }
494  if (new_word == NULL && applybox_debug > 0) tprintf("FAIL!\n");
495  return new_word != NULL;
496 }
497 
498 // Resegments the words by running the classifier in an attempt to find the
499 // correct segmentation that produces the required string.
501  PAGE_RES_IT pr_it(page_res);
502  WERD_RES* word_res;
503  for (; (word_res = pr_it.word()) != NULL; pr_it.forward()) {
504  WERD* word = word_res->word;
505  if (word->text() == NULL || word->text()[0] == '\0')
506  continue; // Ignore words that have no text.
507  // Convert the correct text to a vector of UNICHAR_ID
508  GenericVector<UNICHAR_ID> target_text;
509  if (!ConvertStringToUnichars(word->text(), &target_text)) {
510  tprintf("APPLY_BOX: FAILURE: can't find class_id for '%s'\n",
511  word->text());
512  pr_it.DeleteCurrentWord();
513  continue;
514  }
515  if (!FindSegmentation(target_text, word_res)) {
516  tprintf("APPLY_BOX: FAILURE: can't find segmentation for '%s'\n",
517  word->text());
518  pr_it.DeleteCurrentWord();
519  continue;
520  }
521  }
522 }
523 
524 // Converts the space-delimited string of utf8 text to a vector of UNICHAR_ID.
525 // Returns false if an invalid UNICHAR_ID is encountered.
527  GenericVector<UNICHAR_ID>* class_ids) {
528  for (int step = 0; *utf8 != '\0'; utf8 += step) {
529  const char* next_space = strchr(utf8, ' ');
530  if (next_space == NULL)
531  next_space = utf8 + strlen(utf8);
532  step = next_space - utf8;
533  UNICHAR_ID class_id = unicharset.unichar_to_id(utf8, step);
534  if (class_id == INVALID_UNICHAR_ID) {
535  return false;
536  }
537  while (utf8[step] == ' ')
538  ++step;
539  class_ids->push_back(class_id);
540  }
541  return true;
542 }
543 
544 // Resegments the word to achieve the target_text from the classifier.
545 // Returns false if the re-segmentation fails.
546 // Uses brute-force combination of up to kMaxGroupSize adjacent blobs, and
547 // applies a full search on the classifier results to find the best classified
548 // segmentation. As a compromise to obtain better recall, 1-1 ambiguity
549 // substitutions ARE used.
551  WERD_RES* word_res) {
552  // Classify all required combinations of blobs and save results in choices.
553  int word_length = word_res->box_word->length();
555  new GenericVector<BLOB_CHOICE_LIST*>[word_length];
556  for (int i = 0; i < word_length; ++i) {
557  for (int j = 1; j <= kMaxGroupSize && i + j <= word_length; ++j) {
558  BLOB_CHOICE_LIST* match_result = classify_piece(
559  word_res->seam_array, i, i + j - 1, "Applybox",
560  word_res->chopped_word, word_res->blamer_bundle);
561  if (applybox_debug > 2) {
562  tprintf("%d+%d:", i, j);
563  print_ratings_list("Segment:", match_result, unicharset);
564  }
565  choices[i].push_back(match_result);
566  }
567  }
568  // Search the segmentation graph for the target text. Must be an exact
569  // match. Using wildcards makes it difficult to find the correct
570  // segmentation even when it is there.
571  word_res->best_state.clear();
572  GenericVector<int> search_segmentation;
573  float best_rating = 0.0f;
574  SearchForText(choices, 0, word_length, target_text, 0, 0.0f,
575  &search_segmentation, &best_rating, &word_res->best_state);
576  for (int i = 0; i < word_length; ++i)
577  choices[i].delete_data_pointers();
578  delete [] choices;
579  if (word_res->best_state.empty()) {
580  // Build the original segmentation and if it is the same length as the
581  // truth, assume it will do.
582  int blob_count = 1;
583  for (int s = 0; s < word_res->seam_array.size(); ++s) {
584  SEAM* seam = word_res->seam_array[s];
585  if (!seam->HasAnySplits()) {
586  word_res->best_state.push_back(blob_count);
587  blob_count = 1;
588  } else {
589  ++blob_count;
590  }
591  }
592  word_res->best_state.push_back(blob_count);
593  if (word_res->best_state.size() != target_text.size()) {
594  word_res->best_state.clear(); // No good. Original segmentation bad size.
595  return false;
596  }
597  }
598  word_res->correct_text.clear();
599  for (int i = 0; i < target_text.size(); ++i) {
600  word_res->correct_text.push_back(
601  STRING(unicharset.id_to_unichar(target_text[i])));
602  }
603  return true;
604 }
605 
606 // Recursive helper to find a match to the target_text (from text_index
607 // position) in the choices (from choices_pos position).
608 // Choices is an array of GenericVectors, of length choices_length, with each
609 // element representing a starting position in the word, and the
610 // GenericVector holding classification results for a sequence of consecutive
611 // blobs, with index 0 being a single blob, index 1 being 2 blobs etc.
613  int choices_pos, int choices_length,
614  const GenericVector<UNICHAR_ID>& target_text,
615  int text_index,
616  float rating, GenericVector<int>* segmentation,
617  float* best_rating,
618  GenericVector<int>* best_segmentation) {
620  for (int length = 1; length <= choices[choices_pos].size(); ++length) {
621  // Rating of matching choice or worst choice if no match.
622  float choice_rating = 0.0f;
623  // Find the corresponding best BLOB_CHOICE.
624  BLOB_CHOICE_IT choice_it(choices[choices_pos][length - 1]);
625  for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
626  choice_it.forward()) {
627  BLOB_CHOICE* choice = choice_it.data();
628  choice_rating = choice->rating();
629  UNICHAR_ID class_id = choice->unichar_id();
630  if (class_id == target_text[text_index]) {
631  break;
632  }
633  // Search ambigs table.
634  if (class_id < table.size() && table[class_id] != NULL) {
635  AmbigSpec_IT spec_it(table[class_id]);
636  for (spec_it.mark_cycle_pt(); !spec_it.cycled_list();
637  spec_it.forward()) {
638  const AmbigSpec *ambig_spec = spec_it.data();
639  // We'll only do 1-1.
640  if (ambig_spec->wrong_ngram[1] == INVALID_UNICHAR_ID &&
641  ambig_spec->correct_ngram_id == target_text[text_index])
642  break;
643  }
644  if (!spec_it.cycled_list())
645  break; // Found an ambig.
646  }
647  }
648  if (choice_it.cycled_list())
649  continue; // No match.
650  segmentation->push_back(length);
651  if (choices_pos + length == choices_length &&
652  text_index + 1 == target_text.size()) {
653  // This is a complete match. If the rating is good record a new best.
654  if (applybox_debug > 2) {
655  tprintf("Complete match, rating = %g, best=%g, seglength=%d, best=%d\n",
656  rating + choice_rating, *best_rating, segmentation->size(),
657  best_segmentation->size());
658  }
659  if (best_segmentation->empty() || rating + choice_rating < *best_rating) {
660  *best_segmentation = *segmentation;
661  *best_rating = rating + choice_rating;
662  }
663  } else if (choices_pos + length < choices_length &&
664  text_index + 1 < target_text.size()) {
665  if (applybox_debug > 3) {
666  tprintf("Match found for %d=%s:%s, at %d+%d, recursing...\n",
667  target_text[text_index],
668  unicharset.id_to_unichar(target_text[text_index]),
669  choice_it.data()->unichar_id() == target_text[text_index]
670  ? "Match" : "Ambig",
671  choices_pos, length);
672  }
673  SearchForText(choices, choices_pos + length, choices_length, target_text,
674  text_index + 1, rating + choice_rating, segmentation,
675  best_rating, best_segmentation);
676  if (applybox_debug > 3) {
677  tprintf("End recursion for %d=%s\n", target_text[text_index],
678  unicharset.id_to_unichar(target_text[text_index]));
679  }
680  }
681  segmentation->truncate(segmentation->size() - 1);
682  }
683 }
684 
685 // Counts up the labelled words and the blobs within.
686 // Deletes all unused or emptied words, counting the unused ones.
687 // Resets W_BOL and W_EOL flags correctly.
688 // Builds the rebuild_word and rebuilds the box_word and the best_choice.
689 void Tesseract::TidyUp(PAGE_RES* page_res) {
690  int ok_blob_count = 0;
691  int bad_blob_count = 0;
692  int ok_word_count = 0;
693  int unlabelled_words = 0;
694  PAGE_RES_IT pr_it(page_res);
695  WERD_RES* word_res;
696  for (; (word_res = pr_it.word()) != NULL; pr_it.forward()) {
697  int ok_in_word = 0;
698  int blob_count = word_res->correct_text.size();
699  WERD_CHOICE* word_choice = new WERD_CHOICE(word_res->uch_set, blob_count);
700  word_choice->set_permuter(TOP_CHOICE_PERM);
701  for (int c = 0; c < blob_count; ++c) {
702  if (word_res->correct_text[c].length() > 0) {
703  ++ok_in_word;
704  }
705  // Since we only need a fake word_res->best_choice, the actual
706  // unichar_ids do not matter. Which is fortunate, since TidyUp()
707  // can be called while training Tesseract, at the stage where
708  // unicharset is not meaningful yet.
710  INVALID_UNICHAR_ID, word_res->best_state[c], 1.0f, -1.0f);
711  }
712  if (ok_in_word > 0) {
713  ok_blob_count += ok_in_word;
714  bad_blob_count += word_res->correct_text.size() - ok_in_word;
715  word_res->LogNewRawChoice(word_choice);
716  word_res->LogNewCookedChoice(1, false, word_choice);
717  } else {
718  ++unlabelled_words;
719  if (applybox_debug > 0) {
720  tprintf("APPLY_BOXES: Unlabelled word at :");
721  word_res->word->bounding_box().print();
722  }
723  pr_it.DeleteCurrentWord();
724  delete word_choice;
725  }
726  }
727  pr_it.restart_page();
728  for (; (word_res = pr_it.word()) != NULL; pr_it.forward()) {
729  // Denormalize back to a BoxWord.
730  word_res->RebuildBestState();
731  word_res->SetupBoxWord();
732  word_res->word->set_flag(W_BOL, pr_it.prev_row() != pr_it.row());
733  word_res->word->set_flag(W_EOL, pr_it.next_row() != pr_it.row());
734  }
735  if (applybox_debug > 0) {
736  tprintf(" Found %d good blobs.\n", ok_blob_count);
737  if (bad_blob_count > 0) {
738  tprintf(" Leaving %d unlabelled blobs in %d words.\n",
739  bad_blob_count, ok_word_count);
740  }
741  if (unlabelled_words > 0)
742  tprintf(" %d remaining unlabelled words deleted.\n", unlabelled_words);
743  }
744 }
745 
746 // Logs a bad box by line in the box file and box coords.
747 void Tesseract::ReportFailedBox(int boxfile_lineno, TBOX box,
748  const char *box_ch, const char *err_msg) {
749  tprintf("APPLY_BOXES: boxfile line %d/%s ((%d,%d),(%d,%d)): %s\n",
750  boxfile_lineno + 1, box_ch,
751  box.left(), box.bottom(), box.right(), box.top(), err_msg);
752 }
753 
754 // Creates a fake best_choice entry in each WERD_RES with the correct text.
756  PAGE_RES_IT pr_it(page_res);
757  for (WERD_RES *word_res = pr_it.word(); word_res != NULL;
758  word_res = pr_it.forward()) {
759  WERD_CHOICE* choice = new WERD_CHOICE(word_res->uch_set,
760  word_res->correct_text.size());
761  for (int i = 0; i < word_res->correct_text.size(); ++i) {
762  // The part before the first space is the real ground truth, and the
763  // rest is the bounding box location and page number.
764  GenericVector<STRING> tokens;
765  word_res->correct_text[i].split(' ', &tokens);
766  UNICHAR_ID char_id = unicharset.unichar_to_id(tokens[0].string());
767  choice->append_unichar_id_space_allocated(char_id,
768  word_res->best_state[i],
769  0.0f, 0.0f);
770  }
771  word_res->ClearWordChoices();
772  word_res->LogNewRawChoice(choice);
773  word_res->LogNewCookedChoice(1, false, choice);
774  }
775 }
776 
777 // Calls LearnWord to extract features for labelled blobs within each word.
778 // Features are stored in an internal buffer.
779 void Tesseract::ApplyBoxTraining(const STRING& fontname, PAGE_RES* page_res) {
780  PAGE_RES_IT pr_it(page_res);
781  int word_count = 0;
782  for (WERD_RES *word_res = pr_it.word(); word_res != NULL;
783  word_res = pr_it.forward()) {
784  LearnWord(fontname.string(), word_res);
785  ++word_count;
786  }
787  tprintf("Generated training data for %d words\n", word_count);
788 }
789 
790 
791 } // namespace tesseract
bool ResegmentCharBox(PAGE_RES *page_res, const TBOX *prev_box, const TBOX &box, const TBOX &next_box, const char *correct_text)
Definition: applybox.cpp:331
const int kMaxGroupSize
Definition: applybox.cpp:40
bool LogNewRawChoice(WERD_CHOICE *word_choice)
Definition: pageres.cpp:596
int size() const
Definition: genericvector.h:72
void CorrectClassifyWords(PAGE_RES *page_res)
Definition: applybox.cpp:755
void truncate(int size)
tesseract::BoxWord * box_word
Definition: pageres.h:250
bool classify_bln_numeric_mode
Definition: classify.h:500
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194
const double kMaxXHeightDeviationFraction
Definition: applybox.cpp:43
void RebuildBestState()
Definition: pageres.cpp:800
int length() const
Definition: genericvector.h:79
bool HasAnySplits() const
Definition: seam.h:67
bool assume_fixed_pitch_char_segment
Definition: wordrec.h:161
void set_text(const char *new_text)
Definition: werd.h:126
ROW_RES * next_row() const
Definition: pageres.h:745
int push_back(T object)
TWERD * chopped_word
Definition: pageres.h:201
bool LogNewCookedChoice(int max_num_choices, bool debug, WERD_CHOICE *word_choice)
Definition: pageres.cpp:612
void append_unichar_id_space_allocated(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
Definition: ratngs.h:449
#define tprintf(...)
Definition: tprintf.h:31
bool almost_equal(const TBOX &box, int tolerance) const
Definition: rect.cpp:258
const TBOX & BlobBox(int index) const
Definition: boxword.h:88
Definition: statistc.h:33
UNICHARSET unicharset
Definition: ccutil.h:72
void print() const
Definition: rect.h:270
void set_x_height(float new_xheight)
Definition: ocrrow.h:64
void MergeBoxes(int start, int end)
Definition: boxword.cpp:134
void set_permuter(uinT8 perm)
Definition: ratngs.h:372
void add(inT32 value, inT32 count)
Definition: statistc.cpp:104
float x_height() const
Definition: ocrrow.h:61
TBOX bounding_box() const
Definition: werd.cpp:160
GenericVector< STRING > correct_text
Definition: pageres.h:259
UNICHAR_ID wrong_ngram[MAX_AMBIG_SIZE+1]
Definition: ambigs.h:132
void DeleteCurrentWord()
Definition: pageres.cpp:1449
inT16 right() const
Definition: rect.h:75
PAGE_RES * ApplyBoxes(const STRING &fname, bool find_segmentation, BLOCK_LIST *block_list)
Definition: applybox.cpp:110
BLOCK * block
Definition: pageres.h:99
#define ASSERT_HOST(x)
Definition: errcode.h:84
Definition: ocrrow.h:32
void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings, const UNICHARSET &current_unicharset)
Definition: ratngs.cpp:819
Definition: werd.h:35
BLOCK_RES * block() const
Definition: pageres.h:739
WERD_RES * forward()
Definition: pageres.h:713
int NumBlobs() const
Definition: blobs.h:425
bool FindSegmentation(const GenericVector< UNICHAR_ID > &target_text, WERD_RES *word_res)
Definition: applybox.cpp:550
inT32 area() const
Definition: rect.h:118
float rating() const
Definition: ratngs.h:79
void TidyUp(PAGE_RES *page_res)
Definition: applybox.cpp:689
WERD_RES * restart_page()
Definition: pageres.h:680
void insert(T t, int index)
void FakeClassifyWord(int blob_count, BLOB_CHOICE **choices)
Definition: pageres.cpp:872
Definition: werd.h:36
const UnicharAmbigs & getUnicharAmbigs() const
Definition: dict.h:102
inT16 left() const
Definition: rect.h:68
void CloneChoppedToRebuild()
Definition: pageres.cpp:828
void ReportFailedBox(int boxfile_lineno, TBOX box, const char *box_ch, const char *err_msg)
Definition: applybox.cpp:747
void set_certainty(float newrat)
Definition: ratngs.h:150
const char *const id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:266
Definition: ocrblock.h:30
const UNICHARSET * uch_set
Definition: pageres.h:192
ROW_RES * row() const
Definition: pageres.h:736
TBOX bounding_box() const
Definition: ocrrow.h:85
virtual BLOB_CHOICE_LIST * classify_piece(const GenericVector< SEAM * > &seams, inT16 start, inT16 end, const char *description, TWERD *word, BlamerBundle *blamer_bundle)
Definition: pieces.cpp:57
void bounding_box(ICOORD &bottom_left, ICOORD &top_right) const
get box
Definition: pdblock.h:67
Dict & getDict()
Definition: classify.h:65
GenericVector< SEAM * > seam_array
Definition: pageres.h:203
void SearchForText(const GenericVector< BLOB_CHOICE_LIST * > *choices, int choices_pos, int choices_length, const GenericVector< UNICHAR_ID > &target_text, int text_index, float rating, GenericVector< int > *segmentation, float *best_rating, GenericVector< int > *best_segmentation)
Definition: applybox.cpp:612
const UnicharAmbigsVector & dang_ambigs() const
Definition: ambigs.h:153
int UNICHAR_ID
Definition: unichar.h:33
Definition: werd.h:60
Pix * BestPix() const
void InsertSeam(int blob_number, SEAM *seam)
Definition: pageres.cpp:409
WERD * shallow_copy()
Definition: werd.cpp:352
const char * text() const
Definition: werd.h:125
inT16 bottom() const
Definition: rect.h:61
bool major_overlap(const TBOX &box) const
Definition: rect.h:358
WERD * word
Definition: pageres.h:175
#define MAX_INT8
Definition: host.h:118
bool empty() const
Definition: genericvector.h:84
void remove(int index)
bool ResegmentWordBox(BLOCK_LIST *block_list, const TBOX &box, const TBOX &next_box, const char *correct_text)
Definition: applybox.cpp:429
TBOX intersection(const TBOX &box) const
Definition: rect.cpp:87
void ApplyBoxTraining(const STRING &fontname, PAGE_RES *page_res)
Definition: applybox.cpp:779
UNICHAR_ID correct_ngram_id
Definition: ambigs.h:134
GenericVector< TBLOB * > blobs
Definition: blobs.h:436
const int length() const
Definition: boxword.h:85
int x_gap(const TBOX &box) const
Definition: rect.h:217
int IntCastRounded(double x)
Definition: helpers.h:172
ROW * row
Definition: pageres.h:127
Definition: rect.h:30
void MaximallyChopWord(const GenericVector< TBOX > &boxes, BLOCK *block, ROW *row, WERD_RES *word_res)
Definition: applybox.cpp:246
const TBOX & bounding_box() const
Definition: boxword.h:82
bool SetupForRecognition(const UNICHARSET &unicharset_in, tesseract::Tesseract *tesseract, Pix *pix, int norm_mode, const TBOX *norm_box, bool numeric_mode, bool use_body_size, bool allow_detailed_fx, ROW *row, const BLOCK *block)
Definition: pageres.cpp:294
void SetupBoxWord()
Definition: pageres.cpp:843
TBOX bounding_box() const
Definition: stepblob.cpp:250
Definition: strngs.h:44
GenericVector< int > best_state
Definition: pageres.h:255
#define NULL
Definition: host.h:144
void set_rating(float newrat)
Definition: ratngs.h:147
void LearnWord(const char *fontname, WERD_RES *word)
Definition: adaptmatch.cpp:244
Definition: seam.h:44
PAGE_RES * SetupApplyBoxes(const GenericVector< TBOX > &boxes, BLOCK_LIST *block_list)
Definition: applybox.cpp:210
ROW_LIST * row_list()
get rows
Definition: ocrblock.h:120
void PreenXHeights(BLOCK_LIST *block_list)
Definition: applybox.cpp:186
bool ConvertStringToUnichars(const char *utf8, GenericVector< UNICHAR_ID > *class_ids)
Definition: applybox.cpp:526
const char * string() const
Definition: strngs.cpp:193
inT16 top() const
Definition: rect.h:54
ROW_RES * prev_row() const
Definition: pageres.h:727
void set_flag(WERD_FLAGS mask, BOOL8 value)
Definition: werd.h:129
UNICHAR_ID unichar_id() const
Definition: ratngs.h:76
BlamerBundle * blamer_bundle
Definition: pageres.h:230
SEAM * chop_one_blob(const GenericVector< TBOX > &boxes, const GenericVector< BLOB_CHOICE * > &blob_choices, WERD_RES *word_res, int *blob_number)
Definition: chopper.cpp:374
bool ReadAllBoxes(int target_page, bool skip_blanks, const STRING &filename, GenericVector< TBOX > *boxes, GenericVector< STRING > *texts, GenericVector< STRING > *box_texts, GenericVector< int > *pages)
Definition: boxread.cpp:51
C_BLOB_LIST * cblob_list()
Definition: werd.h:100
WERD_LIST * word_list()
Definition: ocrrow.h:52
WERD_RES * word() const
Definition: pageres.h:733
void ReSegmentByClassification(PAGE_RES *page_res)
Definition: applybox.cpp:500