Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
tordmain.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: tordmain.cpp (Formerly textordp.c)
3  * Description: C++ top level textord code.
4  * Author: Ray Smith
5  * Created: Tue Jul 28 17:12:33 BST 1992
6  *
7  * (C) Copyright 1992, Hewlett-Packard Ltd.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  **********************************************************************/
19 #include "mfcpch.h"
20 #ifdef __UNIX__
21 #include <assert.h>
22 #endif
23 #include "stderr.h"
24 #include "globaloc.h"
25 #include "blread.h"
26 #include "blobbox.h"
27 #include "ccstruct.h"
28 #include "edgblob.h"
29 #include "drawtord.h"
30 #include "makerow.h"
31 #include "wordseg.h"
32 #include "imgs.h"
33 #include "textord.h"
34 #include "tordmain.h"
35 #include "secname.h"
36 
37 // Include automatically generated configuration file if running autoconf.
38 #ifdef HAVE_CONFIG_H
39 #include "config_auto.h"
40 #endif
41 
42 #include "allheaders.h"
43 
44 const ERRCODE BLOCKLESS_BLOBS = "Warning:some blobs assigned to no block";
45 
46 #undef EXTERN
47 #define EXTERN
48 
49 #define MAX_NEAREST_DIST 600 //for block skew stats
50 
51 /**********************************************************************
52  * SetBlobStrokeWidth
53  *
54  * Set the horizontal and vertical stroke widths in the blob.
55  **********************************************************************/
56 void SetBlobStrokeWidth(Pix* pix, BLOBNBOX* blob) {
57  // Cut the blob rectangle into a Pix.
58  int pix_height = pixGetHeight(pix);
59  const TBOX& box = blob->bounding_box();
60  int width = box.width();
61  int height = box.height();
62  Box* blob_pix_box = boxCreate(box.left(), pix_height - box.top(),
63  width, height);
64  Pix* pix_blob = pixClipRectangle(pix, blob_pix_box, NULL);
65  boxDestroy(&blob_pix_box);
66  Pix* dist_pix = pixDistanceFunction(pix_blob, 4, 8, L_BOUNDARY_BG);
67  pixDestroy(&pix_blob);
68  // Compute the stroke widths.
69  uinT32* data = pixGetData(dist_pix);
70  int wpl = pixGetWpl(dist_pix);
71  // Horizontal width of stroke.
72  STATS h_stats(0, width + 1);
73  for (int y = 0; y < height; ++y) {
74  uinT32* pixels = data + y*wpl;
75  int prev_pixel = 0;
76  int pixel = GET_DATA_BYTE(pixels, 0);
77  for (int x = 1; x < width; ++x) {
78  int next_pixel = GET_DATA_BYTE(pixels, x);
79  // We are looking for a pixel that is equal to its vertical neighbours,
80  // yet greater than its left neighbour.
81  if (prev_pixel < pixel &&
82  (y == 0 || pixel == GET_DATA_BYTE(pixels - wpl, x - 1)) &&
83  (y == height - 1 || pixel == GET_DATA_BYTE(pixels + wpl, x - 1))) {
84  if (pixel > next_pixel) {
85  // Single local max, so an odd width.
86  h_stats.add(pixel * 2 - 1, 1);
87  } else if (pixel == next_pixel && x + 1 < width &&
88  pixel > GET_DATA_BYTE(pixels, x + 1)) {
89  // Double local max, so an even width.
90  h_stats.add(pixel * 2, 1);
91  }
92  }
93  prev_pixel = pixel;
94  pixel = next_pixel;
95  }
96  }
97  // Vertical width of stroke.
98  STATS v_stats(0, height + 1);
99  for (int x = 0; x < width; ++x) {
100  int prev_pixel = 0;
101  int pixel = GET_DATA_BYTE(data, x);
102  for (int y = 1; y < height; ++y) {
103  uinT32* pixels = data + y*wpl;
104  int next_pixel = GET_DATA_BYTE(pixels, x);
105  // We are looking for a pixel that is equal to its horizontal neighbours,
106  // yet greater than its upper neighbour.
107  if (prev_pixel < pixel &&
108  (x == 0 || pixel == GET_DATA_BYTE(pixels - wpl, x - 1)) &&
109  (x == width - 1 || pixel == GET_DATA_BYTE(pixels - wpl, x + 1))) {
110  if (pixel > next_pixel) {
111  // Single local max, so an odd width.
112  v_stats.add(pixel * 2 - 1, 1);
113  } else if (pixel == next_pixel && y + 1 < height &&
114  pixel > GET_DATA_BYTE(pixels + wpl, x)) {
115  // Double local max, so an even width.
116  v_stats.add(pixel * 2, 1);
117  }
118  }
119  prev_pixel = pixel;
120  pixel = next_pixel;
121  }
122  }
123  pixDestroy(&dist_pix);
124  // Store the horizontal and vertical width in the blob, keeping both
125  // widths if there is enough information, otherwse only the one with
126  // the most samples.
127  // If there are insufficent samples, store zero, rather than using
128  // 2*area/perimeter, as the numbers that gives do not match the numbers
129  // from the distance method.
130  if (h_stats.get_total() >= (width + height) / 4) {
131  blob->set_horz_stroke_width(h_stats.ile(0.5f));
132  if (v_stats.get_total() >= (width + height) / 4)
133  blob->set_vert_stroke_width(v_stats.ile(0.5f));
134  else
135  blob->set_vert_stroke_width(0.0f);
136  } else {
137  if (v_stats.get_total() >= (width + height) / 4 ||
138  v_stats.get_total() > h_stats.get_total()) {
139  blob->set_horz_stroke_width(0.0f);
140  blob->set_vert_stroke_width(v_stats.ile(0.5f));
141  } else {
142  blob->set_horz_stroke_width(h_stats.get_total() > 2 ? h_stats.ile(0.5f)
143  : 0.0f);
144  blob->set_vert_stroke_width(0.0f);
145  }
146  }
147 }
148 
149 
150 /**********************************************************************
151  * assign_blobs_to_blocks2
152  *
153  * Make a list of TO_BLOCKs for portrait and landscape orientation.
154  **********************************************************************/
155 
// NOTE(review): the opening line of this definition (return type, function
// name and first parameter) is not present in this listing. The body reads a
// variable named `pix` that no visible parameter declares, so it presumably
// comes from that missing line -- confirm against the original source.
//
// For every BLOCK in `blocks` a TO_BLOCK is created and appended to
// `port_blocks`. The entries of the block's blob_list() are wrapped in
// BLOBNBOXes and added to the TO_BLOCK's `blobs` list; the entries of its
// reject_blobs() are wrapped the same way and added to `noise_blobs`.
// SetBlobStrokeWidth() is called on every BLOBNBOX that is created.
157  BLOCK_LIST *blocks, // blocks to process
158  TO_BLOCK_LIST *port_blocks) { // output list
159  BLOCK *block; // current block
160  BLOBNBOX *newblob; // created blob
161  C_BLOB *blob; // current blob
162  BLOCK_IT block_it = blocks;
163  C_BLOB_IT blob_it; // iterator
164  BLOBNBOX_IT port_box_it; // iterator
165  // destination iterator
166  TO_BLOCK_IT port_block_it = port_blocks;
167  TO_BLOCK *port_block; // created block
168 
169  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
170  block = block_it.data();
171  port_block = new TO_BLOCK(block);
172 
173  // Convert the good outlines to block->blob_list
174  port_box_it.set_to_list(&port_block->blobs);
175  blob_it.set_to_list(block->blob_list());
176  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
// The blob obtained from extract() is handed to the new BLOBNBOX.
177  blob = blob_it.extract();
178  newblob = new BLOBNBOX(blob); // Convert blob to BLOBNBOX.
179  SetBlobStrokeWidth(pix, newblob);
180  port_box_it.add_after_then_move(newblob);
181  }
182 
183  // Put the rejected outlines in block->noise_blobs, which allows them to
184  // be reconsidered and sorted back into rows and recover outlines mistakenly
185  // rejected.
186  port_box_it.set_to_list(&port_block->noise_blobs);
187  blob_it.set_to_list(block->reject_blobs());
188  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
189  blob = blob_it.extract();
190  newblob = new BLOBNBOX(blob); // Convert blob to BLOBNBOX.
191  SetBlobStrokeWidth(pix, newblob);
192  port_box_it.add_after_then_move(newblob);
193  }
194 
// The finished TO_BLOCK is appended to the output list.
195  port_block_it.add_after_then_move(port_block);
196  }
197 }
198 
199 namespace tesseract {
200 /**********************************************************************
201  * find_components
202  *
203  * Find the C_OUTLINEs of the connected components in each block, put them
204  * in C_BLOBs, and filter them by size, putting the different size
205  * grades on different lists in the matching TO_BLOCK in to_blocks.
206  **********************************************************************/
207 
// Top-level driver for this stage: refuses images wider or taller than
// MAX_INT16, calls extract_edges() on every block that either has no
// poly_block or whose poly_block reports IsText(), then passes the blocks to
// assign_blobs_to_blocks2() and the resulting to_blocks to filter_blobs().
//   pix       - source image; its width and height become page_tr.
//   blocks    - input blocks.
//   to_blocks - output list filled by assign_blobs_to_blocks2().
208 void Textord::find_components(Pix* pix, BLOCK_LIST *blocks,
209  TO_BLOCK_LIST *to_blocks) {
210  int width = pixGetWidth(pix);
211  int height = pixGetHeight(pix);
212  if (width > MAX_INT16 || height > MAX_INT16) {
213  tprintf("Input image too large! (%d, %d)\n", width, height);
214  return; // Can't handle it.
215  }
216 
// NOTE(review): the embedded numbering jumps from 216 to 218, so one source
// line appears to be missing from this listing at this point.
218 
219  BLOCK_IT block_it(blocks); // iterator
220  for (block_it.mark_cycle_pt(); !block_it.cycled_list();
221  block_it.forward()) {
222  BLOCK* block = block_it.data();
// Blocks whose poly_block exists and is not text are skipped here.
223  if (block->poly_block() == NULL || block->poly_block()->IsText()) {
224  extract_edges(pix, block);
225  }
226  }
227 
228  assign_blobs_to_blocks2(pix, blocks, to_blocks);
229  ICOORD page_tr(width, height);
// The third argument is filter_blobs()'s testing_on flag.
230  filter_blobs(page_tr, to_blocks, !textord_test_landscape);
231 }
232 
233 /**********************************************************************
234  * filter_blobs
235  *
236  * Sort the blobs into sizes in all the blocks for later work.
237  **********************************************************************/
238 
// Runs filter_noise_blobs() over the blob lists of every TO_BLOCK in `blocks`
// and stores its return value in the block's line_size. When graphics are
// compiled in, the debug window is cleared first and the graded blobs may be
// plotted per block.
//   page_tr    - passed to create_to_win() when a debug window has to be made.
//   blocks     - TO_BLOCKs whose blobs, noise_blobs, small_blobs and
//                large_blobs lists are handed to filter_noise_blobs().
//   testing_on - must be set, together with textord_show_blobs or
//                textord_show_boxes, for any plotting to happen.
239 void Textord::filter_blobs(ICOORD page_tr, // top right
240  TO_BLOCK_LIST *blocks, // output list
241  BOOL8 testing_on) { // for plotting
242  TO_BLOCK_IT block_it = blocks; // destination iterator
243  TO_BLOCK *block; // created block
244 
245  #ifndef GRAPHICS_DISABLED
246  if (to_win != NULL)
247  to_win->Clear();
248  #endif // GRAPHICS_DISABLED
249 
250  for (block_it.mark_cycle_pt(); !block_it.cycled_list();
251  block_it.forward()) {
252  block = block_it.data();
253  block->line_size = filter_noise_blobs(&block->blobs,
254  &block->noise_blobs,
255  &block->small_blobs,
256  &block->large_blobs);
257  block->line_spacing = block->line_size *
// NOTE(review): the remainder of this statement (embedded lines 258-263) is
// missing from this listing, so the line_spacing factor cannot be read here.
264 
265  #ifndef GRAPHICS_DISABLED
266  if (textord_show_blobs && testing_on) {
267  if (to_win == NULL)
268  create_to_win(page_tr);
269  block->plot_graded_blobs(to_win);
270  }
271  if (textord_show_boxes && testing_on) {
272  if (to_win == NULL)
273  create_to_win(page_tr);
// NOTE(review): embedded lines 274-277 (the plotting calls of this branch)
// are missing from this listing.
278  }
279  #endif // GRAPHICS_DISABLED
280  }
281 }
282 
283 /**********************************************************************
284  * filter_noise_blobs
285  *
286  * Move small blobs to a separate list.
287  **********************************************************************/
288 
// Splits the blobs of src_list by size into noise_list, small_list and
// large_list, leaving the normal-sized blobs in src_list, and returns a size
// estimate derived from the heights of the blobs that remain.
//
// Visible steps:
//  1. Blobs lower than textord_max_noise_size go to noise_list; a second test
//     (partly missing from this listing) sends further blobs to small_list.
//  2. initial_x is the textord_initialx_ile fractile of the remaining heights;
//     min_y = floor(initial_x / 2), max_x = ceil(initial_x *
//     textord_width_limit). The max_y expression is partly missing.
//  3. Blobs in small_list taller than max_y move to large_list; those at least
//     min_y high move back to src_list.
//  4. Blobs in src_list lower than min_y move to small_list; those taller than
//     max_y or wider than max_x move to large_list; the rest feed the stats.
//  5. max_height is the textord_initialasc_ile fractile of those heights; the
//     larger of initial_x and max_height is returned.
289 float Textord::filter_noise_blobs(
290  BLOBNBOX_LIST *src_list, // original list
291  BLOBNBOX_LIST *noise_list, // noise list
292  BLOBNBOX_LIST *small_list, // small blobs
293  BLOBNBOX_LIST *large_list) { // large blobs
294  inT16 height; //height of blob
295  inT16 width; //of blob
296  BLOBNBOX *blob; //current blob
297  float initial_x; //first guess
298  BLOBNBOX_IT src_it = src_list; //iterators
299  BLOBNBOX_IT noise_it = noise_list;
300  BLOBNBOX_IT small_it = small_list;
301  BLOBNBOX_IT large_it = large_list;
302  STATS size_stats (0, MAX_NEAREST_DIST);
303  //blob heights
304  float min_y; //size limits
305  float max_y;
306  float max_x;
307  float max_height; //of good blobs
308 
309  for (src_it.mark_cycle_pt (); !src_it.cycled_list (); src_it.forward ()) {
310  blob = src_it.data ();
311  if (blob->bounding_box ().height () < textord_max_noise_size)
312  noise_it.add_after_then_move (src_it.extract ());
313  else if (blob->enclosed_area () >= blob->bounding_box ().height ()
// NOTE(review): embedded line 314, the rest of this condition, is missing
// from this listing.
315  small_it.add_after_then_move (src_it.extract ());
316  }
// First estimate of the text size from the heights still in src_list.
317  for (src_it.mark_cycle_pt (); !src_it.cycled_list (); src_it.forward ()) {
318  size_stats.add (src_it.data ()->bounding_box ().height (), 1);
319  }
320  initial_x = size_stats.ile (textord_initialx_ile);
321  max_y = ceil(initial_x *
// NOTE(review): embedded lines 322-325, the rest of the max_y expression, are
// missing from this listing.
326  min_y = floor (initial_x / 2);
327  max_x = ceil (initial_x * textord_width_limit);
328  small_it.move_to_first ();
// Re-grade the blobs that were provisionally put in small_list.
329  for (small_it.mark_cycle_pt (); !small_it.cycled_list ();
330  small_it.forward ()) {
331  height = small_it.data()->bounding_box().height();
332  if (height > max_y)
333  large_it.add_after_then_move(small_it.extract ());
334  else if (height >= min_y)
335  src_it.add_after_then_move(small_it.extract ());
336  }
337  size_stats.clear ();
// Final grading of src_list; only blobs that stay contribute to the stats.
338  for (src_it.mark_cycle_pt (); !src_it.cycled_list (); src_it.forward ()) {
339  height = src_it.data ()->bounding_box ().height ();
340  width = src_it.data ()->bounding_box ().width ();
341  if (height < min_y)
342  small_it.add_after_then_move (src_it.extract ());
343  else if (height > max_y || width > max_x)
344  large_it.add_after_then_move (src_it.extract ());
345  else
346  size_stats.add (height, 1);
347  }
348  max_height = size_stats.ile (textord_initialasc_ile);
349  // printf("max_y=%g, min_y=%g, initial_x=%g, max_height=%g,",
350  // max_y,min_y,initial_x,max_height);
// NOTE(review): embedded line 351 is missing from this listing; it may alter
// max_height before the comparison below -- confirm against the original.
352  if (max_height > initial_x)
353  initial_x = max_height;
354  // printf(" ret=%g\n",initial_x);
355  return initial_x;
356 }
357 
358 /**********************************************************************
359  * cleanup_blocks
360  *
361  * Delete empty blocks, rows from the page.
362  **********************************************************************/
363 
// Walks every row of every block: small noise outlines are removed with
// clean_small_noise_from_words(), rows that end up empty (or that
// clean_noise_from_row() reports as noise, subject to a condition that is
// partly missing from this listing) are deleted, and the surviving rows go
// through clean_noise_from_words() and, when textord_blshift_maxshift >= 0,
// tweak_row_baseline(). Blocks left without rows are deleted if they have no
// poly_block or a text poly_block. Row and block counts are printed with
// tprintf().
364 void Textord::cleanup_blocks( //remove empties
365  BLOCK_LIST *blocks //list
366  ) {
367  BLOCK_IT block_it = blocks; //iterator
368  ROW_IT row_it; //row iterator
369 
// The *_all counters count everything visited, the others what was kept.
370  int num_rows = 0;
371  int num_rows_all = 0;
372  int num_blocks = 0;
373  int num_blocks_all = 0;
374  for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
375  block_it.forward ()) {
// Row counters are per block, so they restart here.
376  num_rows = 0;
377  num_rows_all = 0;
378  row_it.set_to_list (block_it.data ()->row_list ());
379  for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
380  ++num_rows_all;
381  clean_small_noise_from_words(row_it.data());
// NOTE(review): embedded line 382, the start of this `if` condition, is
// missing from this listing.
383  && !row_it.data ()->word_list ()->empty ()
384  && clean_noise_from_row (row_it.data ()))
385  || row_it.data ()->word_list ()->empty ())
386  delete row_it.extract ();//lose empty row
387  else {
// NOTE(review): embedded line 388 is missing from this listing.
389  clean_noise_from_words (row_it.data ());
390  if (textord_blshift_maxshift >= 0)
391  tweak_row_baseline(row_it.data(),
// NOTE(review): embedded lines 392-393, the remaining arguments of this call,
// are missing from this listing.
394  ++num_rows;
395  }
396  }
397  if (block_it.data()->row_list()->empty() &&
398  (block_it.data()->poly_block() == NULL ||
399  block_it.data()->poly_block()->IsText())) {
400  delete block_it.extract(); // Lose empty text blocks but not other types.
401  } else {
402  ++num_blocks;
403  }
404  ++num_blocks_all;
// NOTE(review): embedded line 405 is missing; it may be a condition guarding
// the tprintf below.
406  tprintf("cleanup_blocks: # rows = %d / %d\n", num_rows, num_rows_all);
407  }
// NOTE(review): embedded line 408 is missing; it may be a condition guarding
// the tprintf below.
409  tprintf("cleanup_blocks: # blocks = %d / %d\n", num_blocks, num_blocks_all);
410 }
411 
412 
413 /**********************************************************************
414  * clean_noise_from_row
415  *
416  * Count the dot-like, normal and character-like outlines of a row and
416  * return TRUE if the counts say the row is garbage that should be rejected.
417  **********************************************************************/
418 
// Decides whether a row looks like noise. Nothing is moved or deleted here;
// the function only counts and returns a verdict.
//
// Three counters are accumulated over all blobs of all words in the row:
//   dot_count        - outlines whose larger dimension is below
//                      textord_noise_sizelimit * x_height, plus 2 for every
//                      blob taller than 2 * x_height that is not the first
//                      blob of the first word.
//   norm_count       - blobs whose larger dimension lies in
//                      [textord_noise_sizelimit * x_height, 2 * x_height) and
//                      whose count_transitions() result is below
//                      textord_noise_translimit.
//   super_norm_count - outlines that have children and whose height and width
//                      are within textord_noise_syfract / textord_noise_sxfract
//                      of x_height; every blob of a W_DONT_CHOP word also
//                      counts once.
//
// Returns true when super_norm_count < textord_noise_sncount and
// dot_count > norm_count * textord_noise_rowratio and dot_count > 2.
419 BOOL8 Textord::clean_noise_from_row( //remove empties
420  ROW *row //row to clean
421  ) {
422  BOOL8 testing_on;
423  TBOX blob_box; //bounding box
424  C_BLOB *blob; //current blob
425  C_OUTLINE *outline; //current outline
426  WERD *word; //current word
427  inT32 blob_size; //biggest size
428  inT32 trans_count = 0; //no of transitions
429  inT32 trans_threshold; //noise tolerance
430  inT32 dot_count; //small objects
431  inT32 norm_count; //normal objects
432  inT32 super_norm_count; //real char-like
433  //words of row
434  WERD_IT word_it = row->word_list ();
435  C_BLOB_IT blob_it; //blob iterator
436  C_OUTLINE_IT out_it; //outline iterator
437 
// NOTE(review): embedded lines 438-439, the start of this `if` condition, are
// missing from this listing. The visible part compares textord_test_y with
// the baseline at textord_test_x plus the x-height.
440  && textord_test_y < row->base_line (textord_test_x) + row->x_height ())
441  testing_on = TRUE;
442  else
443  testing_on = FALSE;
444  dot_count = 0;
445  norm_count = 0;
446  super_norm_count = 0;
447  for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
448  word = word_it.data (); //current word
449  //blobs in word
450  blob_it.set_to_list (word->cblob_list ());
451  for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
452  blob_it.forward ()) {
453  blob = blob_it.data ();
454  if (!word->flag (W_DONT_CHOP)) {
455  //get outlines
456  out_it.set_to_list (blob->out_list ());
457  for (out_it.mark_cycle_pt (); !out_it.cycled_list ();
458  out_it.forward ()) {
459  outline = out_it.data ();
460  blob_box = outline->bounding_box ();
// blob_size is the larger of the outline's width and height.
461  blob_size =
462  blob_box.width () >
463  blob_box.height ()? blob_box.width () : blob_box.
464  height();
465  if (blob_size < textord_noise_sizelimit * row->x_height ())
466  dot_count++; //count small outlines
467  if (!outline->child ()->empty ()
468  && blob_box.height () <
469  (1 + textord_noise_syfract) * row->x_height ()
470  && blob_box.height () >
471  (1 - textord_noise_syfract) * row->x_height ()
472  && blob_box.width () <
473  (1 + textord_noise_sxfract) * row->x_height ()
474  && blob_box.width () >
475  (1 - textord_noise_sxfract) * row->x_height ())
476  super_norm_count++; //count x-height-sized outlines with children
477  }
478  }
479  else
480  super_norm_count++;
// From here on blob_box and blob_size describe the whole blob.
481  blob_box = blob->bounding_box ();
482  blob_size =
483  blob_box.width () >
484  blob_box.height ()? blob_box.width () : blob_box.height ();
485  if (blob_size >= textord_noise_sizelimit * row->x_height ()
486  && blob_size < row->x_height () * 2) {
487  trans_threshold = blob_size / textord_noise_sizefraction;
488  trans_count = blob->count_transitions (trans_threshold);
489  if (trans_count < textord_noise_translimit)
490  norm_count++;
491  }
492  else if (blob_box.height () > row->x_height () * 2
493  && (!word_it.at_first () || !blob_it.at_first ()))
494  dot_count += 2;
495  #ifndef SECURE_NAMES
496  if (testing_on) {
// trans_count printed here keeps its previous value when the blob did not
// take the count_transitions() branch above.
497  tprintf
498  ("Blob at (%d,%d) -> (%d,%d), ols=%d, tc=%d, bldiff=%g\n",
499  blob_box.left (), blob_box.bottom (), blob_box.right (),
500  blob_box.top (), blob->out_list ()->length (), trans_count,
501  blob_box.bottom () - row->base_line (blob_box.left ()));
502  }
503  #endif
504  }
505  }
506  #ifndef SECURE_NAMES
507  if (textord_noise_debug) {
// NOTE(review): the REJECTED/ACCEPTED text below is computed with
// textord_noise_normratio and ignores super_norm_count, whereas the return
// statement uses textord_noise_rowratio and textord_noise_sncount, so the
// printed verdict can disagree with the returned one -- confirm intent.
508  tprintf ("Row ending at (%d,%g):",
509  blob_box.right (), row->base_line (blob_box.right ()));
510  tprintf (" R=%g, dc=%d, nc=%d, %s\n",
511  norm_count > 0 ? (float) dot_count / norm_count : 9999,
512  dot_count, norm_count,
513  dot_count > norm_count * textord_noise_normratio
514  && dot_count > 2 ? "REJECTED" : "ACCEPTED");
515  }
516  #endif
517  return super_norm_count < textord_noise_sncount
518  && dot_count > norm_count * textord_noise_rowratio && dot_count > 2;
519 }
520 
521 /**********************************************************************
522  * clean_noise_from_words
523  *
524  * Move blobs of words from rows of garbage into the reject blobs list.
525  **********************************************************************/
526 
527 void Textord::clean_noise_from_words( //remove empties
528  ROW *row //row to clean
529  ) {
530  TBOX blob_box; //bounding box
531  inT8 *word_dud; //was it chucked
532  C_BLOB *blob; //current blob
533  C_OUTLINE *outline; //current outline
534  WERD *word; //current word
535  inT32 blob_size; //biggest size
536  inT32 trans_count; //no of transitions
537  inT32 trans_threshold; //noise tolerance
538  inT32 dot_count; //small objects
539  inT32 norm_count; //normal objects
540  inT32 dud_words; //number discarded
541  inT32 ok_words; //number remaining
542  inT32 word_index; //current word
543  //words of row
544  WERD_IT word_it = row->word_list ();
545  C_BLOB_IT blob_it; //blob iterator
546  C_OUTLINE_IT out_it; //outline iterator
547 
548  ok_words = word_it.length ();
549  if (ok_words == 0 || textord_no_rejects)
550  return;
551  word_dud = (inT8 *) alloc_mem (ok_words * sizeof (inT8));
552  dud_words = 0;
553  ok_words = 0;
554  word_index = 0;
555  for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
556  word = word_it.data (); //current word
557  dot_count = 0;
558  norm_count = 0;
559  //blobs in word
560  blob_it.set_to_list (word->cblob_list ());
561  for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
562  blob_it.forward ()) {
563  blob = blob_it.data ();
564  if (!word->flag (W_DONT_CHOP)) {
565  //get outlines
566  out_it.set_to_list (blob->out_list ());
567  for (out_it.mark_cycle_pt (); !out_it.cycled_list ();
568  out_it.forward ()) {
569  outline = out_it.data ();
570  blob_box = outline->bounding_box ();
571  blob_size =
572  blob_box.width () >
573  blob_box.height ()? blob_box.width () : blob_box.
574  height();
575  if (blob_size < textord_noise_sizelimit * row->x_height ())
576  dot_count++; //count smal outlines
577  if (!outline->child ()->empty ()
578  && blob_box.height () <
579  (1 + textord_noise_syfract) * row->x_height ()
580  && blob_box.height () >
581  (1 - textord_noise_syfract) * row->x_height ()
582  && blob_box.width () <
583  (1 + textord_noise_sxfract) * row->x_height ()
584  && blob_box.width () >
585  (1 - textord_noise_sxfract) * row->x_height ())
586  norm_count++; //count smal outlines
587  }
588  }
589  else
590  norm_count++;
591  blob_box = blob->bounding_box ();
592  blob_size =
593  blob_box.width () >
594  blob_box.height ()? blob_box.width () : blob_box.height ();
595  if (blob_size >= textord_noise_sizelimit * row->x_height ()
596  && blob_size < row->x_height () * 2) {
597  trans_threshold = blob_size / textord_noise_sizefraction;
598  trans_count = blob->count_transitions (trans_threshold);
599  if (trans_count < textord_noise_translimit)
600  norm_count++;
601  }
602  else if (blob_box.height () > row->x_height () * 2
603  && (!word_it.at_first () || !blob_it.at_first ()))
604  dot_count += 2;
605  }
606  if (dot_count > 2) {
607  if (dot_count > norm_count * textord_noise_normratio * 2)
608  word_dud[word_index] = 2;
609  else if (dot_count > norm_count * textord_noise_normratio)
610  word_dud[word_index] = 1;
611  else
612  word_dud[word_index] = 0;
613  }
614  else
615  word_dud[word_index] = 0;
616  if (word_dud[word_index] == 2)
617  dud_words++;
618  else
619  ok_words++;
620  word_index++;
621  }
622 
623  word_index = 0;
624  for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
625  if (word_dud[word_index] == 2
626  || (word_dud[word_index] == 1 && dud_words > ok_words)) {
627  word = word_it.data (); //current word
628  //rejected blobs
629  blob_it.set_to_list (word->rej_cblob_list ());
630  //move from blobs
631  blob_it.add_list_after (word->cblob_list ());
632  }
633  word_index++;
634  }
635  free_mem(word_dud);
636 }
637 
638 // Remove outlines that are a tiny fraction in either width or height
639 // of the word height.
640 void Textord::clean_small_noise_from_words(ROW *row) {
641  WERD_IT word_it(row->word_list());
642  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
643  WERD* word = word_it.data();
644  int min_size = static_cast<int>(
645  textord_noise_hfract * word->bounding_box().height() + 0.5);
646  C_BLOB_IT blob_it(word->cblob_list());
647  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
648  C_BLOB* blob = blob_it.data();
649  C_OUTLINE_IT out_it(blob->out_list());
650  for (out_it.mark_cycle_pt(); !out_it.cycled_list(); out_it.forward()) {
651  C_OUTLINE* outline = out_it.data();
652  outline->RemoveSmallRecursive(min_size, &out_it);
653  }
654  if (blob->out_list()->empty()) {
655  delete blob_it.extract();
656  }
657  }
658  if (word->cblob_list()->empty()) {
659  if (!word_it.at_last()) {
660  // The next word is no longer a fuzzy non space if it was before,
661  // since the word before is about to be deleted.
662  WERD* next_word = word_it.data_relative(1);
663  if (next_word->flag(W_FUZZY_NON)) {
664  next_word->set_flag(W_FUZZY_NON, false);
665  }
666  }
667  delete word_it.extract();
668  }
669  }
670 }
671 } // tesseract
672 
673 /**********************************************************************
674  * tweak_row_baseline
675  *
676  * Shift baseline to fit the blobs more accurately where they are
677  * close enough.
678  **********************************************************************/
679 
// NOTE(review): the opening line of this definition (return type, function
// name and first parameter) is not present in this listing. The body uses a
// variable named `row` that no visible parameter declares, so it presumably
// comes from that missing line -- confirm against the original source.
//
// Rebuilds row->baseline as a new QSPLINE. For every blob whose bottom is
// closer to the current baseline than blshift_maxshift (as a fraction of the
// x-height) and whose height exceeds blshift_xfraction of the x-height, a
// constant segment at the blob's bottom is emitted over the blob's x-range.
// Elsewhere the quadratic segments of the existing baseline are copied.
// Returns without changes when the row has no blobs.
681  double blshift_maxshift,
682  double blshift_xfraction) {
683  TBOX blob_box; //bounding box
684  C_BLOB *blob; //current blob
685  WERD *word; //current word
686  inT32 blob_count; //no of blobs
687  inT32 src_index; //source segment
688  inT32 dest_index; //destination segment
689  inT32 *xstarts; //spline segments
690  double *coeffs; //spline coeffs
691  float ydiff; //baseline error
692  float x_centre; //centre of blob
693  //words of row
694  WERD_IT word_it = row->word_list ();
695  C_BLOB_IT blob_it; //blob iterator
696 
697  blob_count = 0;
698  for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
699  word = word_it.data (); //current word
700  //get total blobs
701  blob_count += word->cblob_list ()->length ();
702  }
703  if (blob_count == 0)
704  return;
// Room for one output segment per blob plus one per existing segment:
// xstarts holds segment boundaries, coeffs three coefficients per segment.
705  xstarts =
706  (inT32 *) alloc_mem ((blob_count + row->baseline.segments + 1) *
707  sizeof (inT32));
708  coeffs =
709  (double *) alloc_mem ((blob_count + row->baseline.segments) * 3 *
710  sizeof (double));
711 
712  src_index = 0;
713  dest_index = 0;
714  xstarts[0] = row->baseline.xcoords[0];
715  for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
716  word = word_it.data (); //current word
717  //blobs in word
718  blob_it.set_to_list (word->cblob_list ());
719  for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
720  blob_it.forward ()) {
721  blob = blob_it.data ();
722  blob_box = blob->bounding_box ();
723  x_centre = (blob_box.left () + blob_box.right ()) / 2.0;
// ydiff becomes the absolute distance of the blob bottom from the baseline,
// expressed as a fraction of the x-height.
724  ydiff = blob_box.bottom () - row->base_line (x_centre);
725  if (ydiff < 0)
726  ydiff = -ydiff / row->x_height ();
727  else
728  ydiff = ydiff / row->x_height ();
729  if (ydiff < blshift_maxshift
730  && blob_box.height () / row->x_height () > blshift_xfraction) {
// Close enough and tall enough: emit a flat segment (a = b = 0) at the blob's
// bottom, ending one pixel past the blob's right edge.
731  if (xstarts[dest_index] >= x_centre)
732  xstarts[dest_index] = blob_box.left ();
733  coeffs[dest_index * 3] = 0;
734  coeffs[dest_index * 3 + 1] = 0;
735  coeffs[dest_index * 3 + 2] = blob_box.bottom ();
736  //shift it
737  dest_index++;
738  xstarts[dest_index] = blob_box.right () + 1;
739  }
740  else {
741  if (xstarts[dest_index] <= x_centre) {
// Copy the source segments that end at or before the blob centre and start
// after the current output position.
742  while (row->baseline.xcoords[src_index + 1] <= x_centre
743  && src_index < row->baseline.segments - 1) {
744  if (row->baseline.xcoords[src_index + 1] >
745  xstarts[dest_index]) {
746  coeffs[dest_index * 3] =
747  row->baseline.quadratics[src_index].a;
748  coeffs[dest_index * 3 + 1] =
749  row->baseline.quadratics[src_index].b;
750  coeffs[dest_index * 3 + 2] =
751  row->baseline.quadratics[src_index].c;
752  dest_index++;
753  xstarts[dest_index] =
754  row->baseline.xcoords[src_index + 1];
755  }
756  src_index++;
757  }
// Then copy the source segment that covers the blob centre.
758  coeffs[dest_index * 3] =
759  row->baseline.quadratics[src_index].a;
760  coeffs[dest_index * 3 + 1] =
761  row->baseline.quadratics[src_index].b;
762  coeffs[dest_index * 3 + 2] =
763  row->baseline.quadratics[src_index].c;
764  dest_index++;
765  xstarts[dest_index] = row->baseline.xcoords[src_index + 1];
766  }
767  }
768  }
769  }
// Skip source segments that end at or before the current output position,
// then copy whatever source segments remain.
770  while (src_index < row->baseline.segments
771  && row->baseline.xcoords[src_index + 1] <= xstarts[dest_index])
772  src_index++;
773  while (src_index < row->baseline.segments) {
774  coeffs[dest_index * 3] = row->baseline.quadratics[src_index].a;
775  coeffs[dest_index * 3 + 1] = row->baseline.quadratics[src_index].b;
776  coeffs[dest_index * 3 + 2] = row->baseline.quadratics[src_index].c;
777  dest_index++;
778  src_index++;
779  xstarts[dest_index] = row->baseline.xcoords[src_index];
780  }
781  //turn to spline
782  row->baseline = QSPLINE (dest_index, xstarts, coeffs);
783  free_mem(xstarts);
784  free_mem(coeffs);
785 }
786 
787 /**********************************************************************
788  * blob_y_order
789  *
790  * Sort function to sort blobs in y from page top.
791  **********************************************************************/
792 
793 inT32 blob_y_order( //sort function
794  void *item1, //items to compare
795  void *item2) {
796  //converted ptr
797  BLOBNBOX *blob1 = *(BLOBNBOX **) item1;
798  //converted ptr
799  BLOBNBOX *blob2 = *(BLOBNBOX **) item2;
800 
801  if (blob1->bounding_box ().bottom () > blob2->bounding_box ().bottom ())
802  return -1;
803  else if (blob1->bounding_box ().bottom () <
804  blob2->bounding_box ().bottom ())
805  return 1;
806  else {
807  if (blob1->bounding_box ().left () < blob2->bounding_box ().left ())
808  return -1;
809  else if (blob1->bounding_box ().left () >
810  blob2->bounding_box ().left ())
811  return 1;
812  else
813  return 0;
814  }
815 }