Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
tablerecog.cpp
Go to the documentation of this file.
1 
2 // File: tablerecog.cpp
3 // Description: Helper class to help structure table areas. Given an bounding
4 // box from TableFinder, the TableRecognizer should give a
5 // StructuredTable (maybe a list in the future) of "good" tables
6 // in that area.
7 // Author: Nicholas Beato
8 // Created: Friday, Aug. 20, 2010
9 //
10 // (C) Copyright 2009, Google Inc.
11 // Licensed under the Apache License, Version 2.0 (the "License");
12 // you may not use this file except in compliance with the License.
13 // You may obtain a copy of the License at
14 // http://www.apache.org/licenses/LICENSE-2.0
15 // Unless required by applicable law or agreed to in writing, software
16 // distributed under the License is distributed on an "AS IS" BASIS,
17 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 // See the License for the specific language governing permissions and
19 // limitations under the License.
20 //
22 
23 #include "tablerecog.h"
24 
25 namespace tesseract {
26 
27 // The amount of space required between the ColPartitions in 2 columns
28 // of a non-lined table as a multiple of the median width.
29 const double kHorizontalSpacing = 0.30;
30 // The amount of space required between the ColPartitions in 2 rows
31 // of a non-lined table as multiples of the median height.
32 const double kVerticalSpacing = -0.2;
33 // The number of cells that the grid lines may intersect.
34 // See FindCellSplitLocations for explanation.
35 const int kCellSplitRowThreshold = 0;
37 // For "lined tables", the number of required lines. Currently a guess.
40 // Number of columns required, as a fraction of the most columns found.
41 // None of these are tweaked at all.
42 const double kRequiredColumns = 0.7;
43 // The tolerance for comparing margins of potential tables.
44 const double kMarginFactor = 1.1;
45 // The first and last row should be consistent cell height.
46 // This factor is the first and last row cell height max.
47 const double kMaxRowSize = 2.5;
48 // Number of filled columns required to form a strong table row.
49 // For small tables, this is an absolute number.
50 const double kGoodRowNumberOfColumnsSmall[] = { 2, 2, 2, 2, 2, 3, 3 };
52  sizeof(kGoodRowNumberOfColumnsSmall) / sizeof(double) - 1;
53 // For large tables, it is a relative number
54 const double kGoodRowNumberOfColumnsLarge = 0.7;
55 // The amount of area that must be covered in a cell by ColPartitions to
56 // be considered "filled"
57 const double kMinFilledArea = 0.35;
58 
62 
64  : text_grid_(NULL),
65  line_grid_(NULL),
66  is_lined_(false),
67  space_above_(0),
68  space_below_(0),
69  space_left_(0),
70  space_right_(0),
71  median_cell_height_(0),
72  median_cell_width_(0),
73  max_text_height_(MAX_INT32) {
74 }
75 
77 }
78 
80 }
81 
83  text_grid_ = text_grid;
84 }
86  line_grid_ = line_grid;
87 }
89  max_text_height_ = height;
90 }
92  return is_lined_;
93 }
95  return cell_y_.length() == 0 ? 0 : cell_y_.length() - 1;
96 }
98  return cell_x_.length() == 0 ? 0 : cell_x_.length() - 1;
99 }
101  return row_count() * column_count();
102 }
104  bounding_box_ = box;
105 }
107  return bounding_box_;
108 }
110  return median_cell_height_;
111 }
113  return median_cell_width_;
114 }
115 int StructuredTable::row_height(int row) const {
116  ASSERT_HOST(0 <= row && row < row_count());
117  return cell_y_[row + 1] - cell_y_[row];
118 }
119 int StructuredTable::column_width(int column) const {
120  ASSERT_HOST(0 <= column && column < column_count());
121  return cell_x_[column + 1] - cell_x_[column];
122 }
124  return space_above_;
125 }
127  return space_below_;
128 }
129 
130 // At this point, we know that the lines are contained
131 // by the box (by FindLinesBoundingBox).
132 // So try to find the cell structure and make sure it works out.
133 // The assumption is that all lines span the table. If this
134 // assumption fails, the VerifyLinedTable method will
135 // abort the lined table. The TableRecognizer will fall
136 // back on FindWhitespacedStructure.
138  ClearStructure();
139 
140  // Search for all of the lines in the current box.
141  // Update the cellular structure with the exact lines.
143  box_search.SetUniqueMode(true);
144  box_search.StartRectSearch(bounding_box_);
145  ColPartition* line = NULL;
146 
147  while ((line = box_search.NextRectSearch()) != NULL) {
148  if (line->IsHorizontalLine())
149  cell_y_.push_back(line->MidY());
150  if (line->IsVerticalLine())
151  cell_x_.push_back(line->MidX());
152  }
153 
154  // HasSignificantLines should guarantee cells.
155  // Because that code is a different class, just gracefully
156  // return false. This could be an assert.
157  if (cell_x_.length() < 3 || cell_y_.length() < 3)
158  return false;
159 
160  cell_x_.sort();
161  cell_y_.sort();
162 
163  // Remove duplicates that may have occurred due to split lines.
166 
167  // The border should be the extents of line boxes, not middle.
168  cell_x_[0] = bounding_box_.left();
172 
173  // Remove duplicates that may have occurred due to moving the borders.
176 
178  CalculateStats();
180  return is_lined_;
181 }
182 
183 // Finds the cellular structure given a particular box.
185  ClearStructure();
188 
189  if (!VerifyWhitespacedTable()) {
190  return false;
191  } else {
198  CalculateStats();
199  return true;
200  }
201 }
202 
203 // Tests if a partition fits inside the table structure.
204 // Partitions must fully span a grid line in order to intersect it.
205 // This means that a partition does not intersect a line
206 // that it "just" touches. This is mainly because the assumption
207 // throughout the code is that "0" distance is a very very small space.
209  const TBOX& box = part.bounding_box();
210  for (int i = 0; i < cell_x_.length(); ++i)
211  if (box.left() < cell_x_[i] && cell_x_[i] < box.right())
212  return false;
213  for (int i = 0; i < cell_y_.length(); ++i)
214  if (box.bottom() < cell_y_[i] && cell_y_[i] < box.top())
215  return false;
216  return true;
217 }
218 
219 // Checks if a sub-table has multiple data cells filled.
221  return CountFilledCells(0, row_count() - 1, 0, column_count() - 1);
222 }
224  return CountFilledCells(row, row, 0, column_count() - 1);
225 }
227  return CountFilledCells(0, row_count() - 1, column, column);
228 }
229 int StructuredTable::CountFilledCells(int row_start, int row_end,
230  int column_start, int column_end) {
231  ASSERT_HOST(0 <= row_start && row_start <= row_end && row_end < row_count());
232  ASSERT_HOST(0 <= column_start && column_start <= column_end &&
233  column_end < column_count());
234  int cell_count = 0;
235  TBOX cell_box;
236  for (int row = row_start; row <= row_end; ++row) {
237  cell_box.set_bottom(cell_y_[row]);
238  cell_box.set_top(cell_y_[row + 1]);
239  for (int col = column_start; col <= column_end; ++col) {
240  cell_box.set_left(cell_x_[col]);
241  cell_box.set_right(cell_x_[col + 1]);
242  if (CountPartitions(cell_box) > 0)
243  ++cell_count;
244  }
245  }
246  return cell_count;
247 }
248 
249 // Makes sure that at least one cell in a row has substantial area filled.
250 // This can filter out large whitespace caused by growing tables too far
251 // and page numbers.
253  for (int i = 0; i < column_count(); ++i) {
254  double area_filled = CalculateCellFilledPercentage(row, i);
255  if (area_filled >= kMinFilledArea)
256  return true;
257  }
258  return false;
259 }
260 
261 // Finds the filled area in a cell.
262 // Assume ColPartitions do not overlap for simplicity (even though they do).
264  ASSERT_HOST(0 <= row && row <= row_count());
265  ASSERT_HOST(0 <= column && column <= column_count());
266  const TBOX kCellBox(cell_x_[column], cell_y_[row],
267  cell_x_[column + 1], cell_y_[row + 1]);
268  ASSERT_HOST(!kCellBox.null_box());
269 
271  gsearch.SetUniqueMode(true);
272  gsearch.StartRectSearch(kCellBox);
273  double area_covered = 0;
274  ColPartition* text = NULL;
275  while ((text = gsearch.NextRectSearch()) != NULL) {
276  if (text->IsTextType())
277  area_covered += text->bounding_box().intersection(kCellBox).area();
278  }
279  return MIN(1.0, area_covered / kCellBox.area());
280 }
281 
283 #ifndef GRAPHICS_DISABLED
284  window->Brush(ScrollView::NONE);
285  window->Pen(color);
288  for (int i = 0; i < cell_x_.length(); i++) {
289  window->Line(cell_x_[i], bounding_box_.bottom(),
290  cell_x_[i], bounding_box_.top());
291  }
292  for (int i = 0; i < cell_y_.length(); i++) {
293  window->Line(bounding_box_.left(), cell_y_[i],
294  bounding_box_.right(), cell_y_[i]);
295  }
296  window->UpdateWindow();
297 #endif
298 }
299 
300 // Clear structure information.
302  cell_x_.clear();
303  cell_y_.clear();
304  is_lined_ = false;
305  space_above_ = 0;
306  space_below_ = 0;
307  space_left_ = 0;
308  space_right_ = 0;
310  median_cell_width_ = 0;
311 }
312 
313 // When a table has lines, the lines should not intersect any partitions.
314 // The following function makes sure the previous assumption is met.
316  // Function only called when lines exist.
317  ASSERT_HOST(cell_y_.length() >= 2 && cell_x_.length() >= 2);
318  for (int i = 0; i < cell_y_.length(); ++i) {
320  return false;
321  }
322  for (int i = 0; i < cell_x_.length(); ++i) {
324  return false;
325  }
326  return true;
327 }
328 
329 // TODO(nbeato): Could be much better than this.
330 // Examples:
331 // - Caclulate the percentage of filled cells.
332 // - Calculate the average number of ColPartitions per cell.
333 // - Calculate the number of cells per row with partitions.
334 // - Check if ColPartitions in adjacent cells are similar.
335 // - Check that all columns are at least a certain width.
336 // - etc.
338  // criteria for a table, must be at least 2x3 or 3x2
339  return row_count() >= 2 && column_count() >= 2 && cell_count() >= 6;
340 }
341 
342 // Finds vertical splits in the ColPartitions of text_grid_ by considering
343 // all possible "good" guesses. A good guess is just the left/right sides of
344 // the partitions, since these locations will uniquely define where the
345 // extremal values where the splits can occur. The split happens
346 // in the middle of the two nearest partitions.
348  // Set of the extents of all partitions on the page.
349  GenericVectorEqEq<int> left_sides;
350  GenericVectorEqEq<int> right_sides;
351 
352  // Look at each text partition. We want to find the partitions
353  // that have extremal left/right sides. These will give us a basis
354  // for the table columns.
356  gsearch.SetUniqueMode(true);
358  ColPartition* text = NULL;
359  while ((text = gsearch.NextRectSearch()) != NULL) {
360  if (!text->IsTextType())
361  continue;
362 
363  ASSERT_HOST(text->bounding_box().left() < text->bounding_box().right());
364  int spacing = static_cast<int>(text->median_width() *
365  kHorizontalSpacing / 2.0 + 0.5);
366  left_sides.push_back(text->bounding_box().left() - spacing);
367  right_sides.push_back(text->bounding_box().right() + spacing);
368  }
369  // It causes disaster below, so avoid it!
370  if (left_sides.length() == 0 || right_sides.length() == 0)
371  return;
372 
373  // Since data may be inserted in grid order, we sort the left/right sides.
374  left_sides.sort();
375  right_sides.sort();
376 
377  // At this point, in the "merged list", we expect to have a left side,
378  // followed by either more left sides or a right side. The last number
379  // should be a right side. We find places where the splits occur by looking
380  // for "valleys". If we want to force gap sizes or allow overlap, change
381  // the spacing above. If you want to let lines "slice" partitions as long
382  // as it is infrequent, change the following function.
383  FindCellSplitLocations(left_sides, right_sides, kCellSplitColumnThreshold,
384  &cell_x_);
385 }
386 
387 // Finds horizontal splits in the ColPartitions of text_grid_ by considering
388 // all possible "good" guesses. A good guess is just the bottom/top sides of
389 // the partitions, since these locations will uniquely define where the
390 // extremal values where the splits can occur. The split happens
391 // in the middle of the two nearest partitions.
393  // Set of the extents of all partitions on the page.
394  GenericVectorEqEq<int> bottom_sides;
395  GenericVectorEqEq<int> top_sides;
396  // We will be "shrinking" partitions, so keep the min/max around to
397  // make sure the bottom/top lines do not intersect text.
398  int min_bottom = MAX_INT32;
399  int max_top = MIN_INT32;
400 
401  // Look at each text partition. We want to find the partitions
402  // that have extremal bottom/top sides. These will give us a basis
403  // for the table rows. Because the textlines can be skewed and close due
404  // to warping, the height of the partitions is toned down a little bit.
406  gsearch.SetUniqueMode(true);
408  ColPartition* text = NULL;
409  while ((text = gsearch.NextRectSearch()) != NULL) {
410  if (!text->IsTextType())
411  continue;
412 
413  ASSERT_HOST(text->bounding_box().bottom() < text->bounding_box().top());
414  min_bottom = MIN(min_bottom, text->bounding_box().bottom());
415  max_top = MAX(max_top, text->bounding_box().top());
416 
417  // Ignore "tall" text partitions, as these are usually false positive
418  // vertical text or multiple lines pulled together.
419  if (text->bounding_box().height() > max_text_height_)
420  continue;
421 
422  int spacing = static_cast<int>(text->bounding_box().height() *
423  kVerticalSpacing / 2.0 + 0.5);
424  int bottom = text->bounding_box().bottom() - spacing;
425  int top = text->bounding_box().top() + spacing;
426  // For horizontal text, the factor can be negative. This should
427  // probably cause a warning or failure. I haven't actually checked if
428  // it happens.
429  if (bottom >= top)
430  continue;
431 
432  bottom_sides.push_back(bottom);
433  top_sides.push_back(top);
434  }
435  // It causes disaster below, so avoid it!
436  if (bottom_sides.length() == 0 || top_sides.length() == 0)
437  return;
438 
439  // Since data may be inserted in grid order, we sort the bottom/top sides.
440  bottom_sides.sort();
441  top_sides.sort();
442 
443  // At this point, in the "merged list", we expect to have a bottom side,
444  // followed by either more bottom sides or a top side. The last number
445  // should be a top side. We find places where the splits occur by looking
446  // for "valleys". If we want to force gap sizes or allow overlap, change
447  // the spacing above. If you want to let lines "slice" partitions as long
448  // as it is infrequent, change the following function.
449  FindCellSplitLocations(bottom_sides, top_sides, kCellSplitRowThreshold,
450  &cell_y_);
451 
452  // Recover the min/max correctly since it was shifted.
453  cell_y_[0] = min_bottom;
454  cell_y_[cell_y_.length() - 1] = max_top;
455 }
456 
464 }
465 // Finds the nearest partition in grid to the table
466 // boundaries and updates the margin.
468  int below = FindVerticalMargin(grid, bounding_box_.bottom(), true);
469  space_below_ = MIN(space_below_, below);
470  int above = FindVerticalMargin(grid, bounding_box_.top(), false);
471  space_above_ = MIN(space_above_, above);
472  int left = FindHorizontalMargin(grid, bounding_box_.left(), true);
473  space_left_ = MIN(space_left_, left);
474  int right = FindHorizontalMargin(grid, bounding_box_.right(), false);
475  space_right_ = MIN(space_right_, right);
476 }
478  bool decrease) const {
479  ColPartitionGridSearch gsearch(grid);
480  gsearch.SetUniqueMode(true);
482  border);
483  ColPartition* part = NULL;
484  while ((part = gsearch.NextVerticalSearch(decrease)) != NULL) {
485  if (!part->IsTextType() && !part->IsHorizontalLine())
486  continue;
487  int distance = decrease ? border - part->bounding_box().top()
488  : part->bounding_box().bottom() - border;
489  if (distance >= 0)
490  return distance;
491  }
492  return MAX_INT32;
493 }
495  bool decrease) const {
496  ColPartitionGridSearch gsearch(grid);
497  gsearch.SetUniqueMode(true);
498  gsearch.StartSideSearch(border, bounding_box_.bottom(), bounding_box_.top());
499  ColPartition* part = NULL;
500  while ((part = gsearch.NextSideSearch(decrease)) != NULL) {
501  if (!part->IsTextType() && !part->IsVerticalLine())
502  continue;
503  int distance = decrease ? border - part->bounding_box().right()
504  : part->bounding_box().left() - border;
505  if (distance >= 0)
506  return distance;
507  }
508  return MAX_INT32;
509 }
510 
512  const int kMaxCellHeight = 1000;
513  const int kMaxCellWidth = 1000;
514  STATS height_stats(0, kMaxCellHeight + 1);
515  STATS width_stats(0, kMaxCellWidth + 1);
516 
517  for (int i = 0; i < row_count(); ++i)
518  height_stats.add(row_height(i), column_count());
519  for (int i = 0; i < column_count(); ++i)
520  width_stats.add(column_width(i), row_count());
521 
522  median_cell_height_ = static_cast<int>(height_stats.median() + 0.5);
523  median_cell_width_ = static_cast<int>(width_stats.median() + 0.5);
524 }
525 
526 // Looks for grid lines near the current bounding box and
527 // grows the bounding box to include them if no intersections
528 // will occur as a result. This is necessary because the margins
529 // are calculated relative to the closest line/text. If the
530 // line isn't absorbed, the margin will be the distance to the line.
533  gsearch.SetUniqueMode(true);
534 
535  // Is the closest line above good? Loop multiple times for tables with
536  // multi-line (sometimes 2) borders. Limit the number of lines by
537  // making sure they stay within a table cell or so.
538  ColPartition* line = NULL;
540  bounding_box_.top());
541  while ((line = gsearch.NextVerticalSearch(false)) != NULL) {
542  if (!line->IsHorizontalLine())
543  break;
544  TBOX text_search(bounding_box_.left(), bounding_box_.top() + 1,
545  bounding_box_.right(), line->MidY());
546  if (text_search.height() > median_cell_height_ * 2)
547  break;
548  if (CountPartitions(text_search) > 0)
549  break;
550  bounding_box_.set_top(line->MidY());
551  }
552  // As above, is the closest line below good?
553  line = NULL;
556  while ((line = gsearch.NextVerticalSearch(true)) != NULL) {
557  if (!line->IsHorizontalLine())
558  break;
559  TBOX text_search(bounding_box_.left(), line->MidY(),
561  if (text_search.height() > median_cell_height_ * 2)
562  break;
563  if (CountPartitions(text_search) > 0)
564  break;
565  bounding_box_.set_bottom(line->MidY());
566  }
567  // TODO(nbeato): vertical lines
568 }
569 
570 
571 // This function will find all "0 valleys" (of any length) given two
572 // arrays. The arrays are the mins and maxes of partitions (either
573 // left and right or bottom and top). Since the min/max lists are generated
574 // with pairs of increasing integers, we can make some assumptions in
575 // the function about ordering of the overall list, which are shown in the
576 // asserts.
577 // The algorithm works as follows:
578 // While there are numbers to process, take the smallest number.
579 // If it is from the min_list, increment the "hill" counter.
580 // Otherwise, decrement the "hill" counter.
581 // In the process of doing this, keep track of "crossing" the
582 // desired height.
583 // The first/last items are extremal values of the list and known.
584 // NOTE: This function assumes the lists are sorted!
586  const GenericVector<int>& max_list,
587  int max_merged,
588  GenericVector<int>* locations) {
589  locations->clear();
590  ASSERT_HOST(min_list.length() == max_list.length());
591  if (min_list.length() == 0)
592  return;
593  ASSERT_HOST(min_list.get(0) < max_list.get(0));
594  ASSERT_HOST(min_list.get(min_list.length() - 1) <
595  max_list.get(max_list.length() - 1));
596 
597  locations->push_back(min_list.get(0));
598  int min_index = 0;
599  int max_index = 0;
600  int stacked_partitions = 0;
601  int last_cross_position = MAX_INT32;
602  // max_index will expire after min_index.
603  // However, we can't "increase" the hill size if min_index expired.
604  // So finish processing when min_index expires.
605  while (min_index < min_list.length()) {
606  // Increase the hill count.
607  if (min_list[min_index] < max_list[max_index]) {
608  ++stacked_partitions;
609  if (last_cross_position != MAX_INT32 &&
610  stacked_partitions > max_merged) {
611  int mid = (last_cross_position + min_list[min_index]) / 2;
612  locations->push_back(mid);
613  last_cross_position = MAX_INT32;
614  }
615  ++min_index;
616  } else {
617  // Decrease the hill count.
618  --stacked_partitions;
619  if (last_cross_position == MAX_INT32 &&
620  stacked_partitions <= max_merged) {
621  last_cross_position = max_list[max_index];
622  }
623  ++max_index;
624  }
625  }
626  locations->push_back(max_list.get(max_list.length() - 1));
627 }
628 
629 // Counts the number of partitions in the table
630 // box that intersection the given x value.
632  int count = 0;
633  // Make a small box to keep the search time down.
634  const int kGridSize = text_grid_->gridsize();
635  TBOX vertical_box = bounding_box_;
636  vertical_box.set_left(x - kGridSize);
637  vertical_box.set_right(x + kGridSize);
638 
640  gsearch.SetUniqueMode(true);
641  gsearch.StartRectSearch(vertical_box);
642  ColPartition* text = NULL;
643  while ((text = gsearch.NextRectSearch()) != NULL) {
644  if (!text->IsTextType())
645  continue;
646  const TBOX& box = text->bounding_box();
647  if (box.left() < x && x < box.right())
648  ++count;
649  }
650  return count;
651 }
652 
653 // Counts the number of partitions in the table
654 // box that intersection the given y value.
656  int count = 0;
657  // Make a small box to keep the search time down.
658  const int kGridSize = text_grid_->gridsize();
659  TBOX horizontal_box = bounding_box_;
660  horizontal_box.set_bottom(y - kGridSize);
661  horizontal_box.set_top(y + kGridSize);
662 
664  gsearch.SetUniqueMode(true);
665  gsearch.StartRectSearch(horizontal_box);
666  ColPartition* text = NULL;
667  while ((text = gsearch.NextRectSearch()) != NULL) {
668  if (!text->IsTextType())
669  continue;
670 
671  const TBOX& box = text->bounding_box();
672  if (box.bottom() < y && y < box.top())
673  ++count;
674  }
675  return count;
676 }
677 
678 // Counts how many text partitions are in this box.
679 // This is used to count partitons in cells, as that can indicate
680 // how "strong" a potential table row/colum (or even full table) actually is.
683  gsearch.SetUniqueMode(true);
684  gsearch.StartRectSearch(box);
685  int count = 0;
686  ColPartition* text = NULL;
687  while ((text = gsearch.NextRectSearch()) != NULL) {
688  if (text->IsTextType())
689  ++count;
690  }
691  return count;
692 }
693 
697 
699  : text_grid_(NULL),
700  line_grid_(NULL),
701  min_height_(0),
702  min_width_(0),
703  max_text_height_(MAX_INT32) {
704 }
705 
707 }
708 
710 }
711 
713  text_grid_ = text_grid;
714 }
716  line_grid_ = line_grid;
717 }
719  min_height_ = height;
720 }
722  min_width_ = width;
723 }
725  max_text_height_ = height;
726 }
727 
729  StructuredTable* table = new StructuredTable();
730  table->Init();
731  table->set_text_grid(text_grid_);
732  table->set_line_grid(line_grid_);
734 
735  // Try to solve ths simple case, a table with *both*
736  // vertical and horizontal lines.
737  if (RecognizeLinedTable(guess, table))
738  return table;
739 
740  // Fallback to whitespace if that failed.
741  // TODO(nbeato): Break this apart to take advantage of horizontal
742  // lines or vertical lines when present.
743  if (RecognizeWhitespacedTable(guess, table))
744  return table;
745 
746  // No table found...
747  delete table;
748  return NULL;
749 }
750 
752  StructuredTable* table) {
753  if (!HasSignificantLines(guess_box))
754  return false;
755  TBOX line_bound = guess_box;
756  if (!FindLinesBoundingBox(&line_bound))
757  return false;
758  table->set_bounding_box(line_bound);
759  return table->FindLinedStructure();
760 }
761 
762 // Quick implementation. Just count the number of lines in the box.
763 // A better implementation would counter intersections and look for connected
764 // components. It could even go as far as finding similar length lines.
765 // To account for these possible issues, the VerifyLinedTableCells function
766 // will reject lined tables that cause intersections with text on the page.
767 // TODO(nbeato): look for "better" lines
770  box_search.SetUniqueMode(true);
771  box_search.StartRectSearch(guess);
772  ColPartition* line = NULL;
773  int vertical_count = 0;
774  int horizontal_count = 0;
775 
776  while ((line = box_search.NextRectSearch()) != NULL) {
777  if (line->IsHorizontalLine())
778  ++horizontal_count;
779  if (line->IsVerticalLine())
780  ++vertical_count;
781  }
782 
783  return vertical_count >= kLinedTableMinVerticalLines &&
784  horizontal_count >= kLinedTableMinHorizontalLines;
785 }
786 
787 // Given a bounding box with a bunch of horizontal / vertical lines,
788 // we just find the extents of all of these lines iteratively.
789 // The box will be at least as large as guess. This
790 // could possibly be a bad assumption.
791 // It is guaranteed to halt in at least O(n * gridarea) where n
792 // is the number of lines.
793 // The assumption is that growing the box iteratively will add lines
794 // several times, but eventually we'll find the extents.
795 //
796 // For tables, the approach is a bit aggressive, a single line (which could be
797 // noise or a column ruling) can destroy the table inside.
798 //
799 // TODO(nbeato): This is a quick first implementation.
800 // A better implementation would actually look for consistency
801 // in extents of the lines and find the extents using lines
802 // that clearly describe the table. This would allow the
803 // lines to "vote" for height/width. An approach like
804 // this would solve issues with page layout rulings.
805 // I haven't looked for these issues yet, so I can't even
806 // say they happen confidently.
808  // The first iteration will tell us if there are lines
809  // present and shrink the box to a minimal iterative size.
810  if (!FindLinesBoundingBoxIteration(bounding_box))
811  return false;
812 
813  // Keep growing until the area of the table stabilizes.
814  // The box can only get bigger, increasing area.
815  bool changed = true;
816  while (changed) {
817  changed = false;
818  int old_area = bounding_box->area();
819  bool check = FindLinesBoundingBoxIteration(bounding_box);
820  // At this point, the function will return true.
821  ASSERT_HOST(check);
822  ASSERT_HOST(bounding_box->area() >= old_area);
823  changed = (bounding_box->area() > old_area);
824  }
825 
826  return true;
827 }
828 
830  // Search for all of the lines in the current box, keeping track of extents.
832  box_search.SetUniqueMode(true);
833  box_search.StartRectSearch(*bounding_box);
834  ColPartition* line = NULL;
835  bool first_line = true;
836 
837  while ((line = box_search.NextRectSearch()) != NULL) {
838  if (line->IsLineType()) {
839  if (first_line) {
840  // The first iteration can shrink the box.
841  *bounding_box = line->bounding_box();
842  first_line = false;
843  } else {
844  *bounding_box += line->bounding_box();
845  }
846  }
847  }
848  return !first_line;
849 }
850 
851 // The goal of this function is to move the table boundaries around and find
852 // a table that maximizes the whitespace around the table while maximizing
853 // the cellular structure. As a result, it gets confused by headers, footers,
854 // and merged columns (text that crosses columns). There is a tolerance
855 // that allows a few partitions to count towards potential cell merges.
856 // It's the max_merged parameter to FindPartitionLocations.
857 // It can work, but it needs some false positive remove on boundaries.
858 // For now, the grid structure must not intersect any partitions.
859 // Also, small tolerance is added to the horizontal lines for tightly packed
860 // tables. The tolerance is added by adjusting the bounding boxes of the
861 // partitions (in FindHorizontalPartitions). The current implementation
862 // only adjusts the vertical extents of the table.
863 //
864 // Also note. This was hacked at a lot. It could probably use some
865 // more hacking at to find a good set of border conditions and then a
866 // nice clean up.
868  StructuredTable* table) {
869  TBOX best_box = guess_box; // Best borders known.
870  int best_below = 0; // Margin size above best table.
871  int best_above = 0; // Margin size below best table.
872  TBOX adjusted = guess_box; // The search box.
873 
874  // We assume that the guess box is somewhat accurate, so we don't allow
875  // the adjusted border to pass half of the guessed area. This prevents
876  // "negative" tables from forming.
877  const int kMidGuessY = (guess_box.bottom() + guess_box.top()) / 2;
878  // Keeps track of the most columns in an accepted table. The resulting table
879  // may be less than the max, but we don't want to stray too far.
880  int best_cols = 0;
881  // Make sure we find a good border.
882  bool found_good_border = false;
883 
884  // Find the bottom of the table by trying a few different locations. For
885  // each location, the top, left, and right are fixed. We start the search
886  // in a smaller table to favor best_cols getting a good estimate sooner.
887  int last_bottom = MAX_INT32;
888  int bottom = NextHorizontalSplit(guess_box.left(), guess_box.right(),
889  kMidGuessY - min_height_ / 2, true);
890  int top = NextHorizontalSplit(guess_box.left(), guess_box.right(),
891  kMidGuessY + min_height_ / 2, false);
892  adjusted.set_top(top);
893 
894  // Headers/footers can be spaced far from everything.
895  // Make sure that the space below is greater than the space above
896  // the lowest row.
897  int previous_below = 0;
898  const int kMaxChances = 10;
899  int chances = kMaxChances;
900  while (bottom != last_bottom) {
901  adjusted.set_bottom(bottom);
902 
903  if (adjusted.height() >= min_height_) {
904  // Try to fit the grid on the current box. We give it a chance
905  // if the number of columns didn't significantly drop.
906  table->set_bounding_box(adjusted);
907  if (table->FindWhitespacedStructure() &&
908  table->column_count() >= best_cols * kRequiredColumns) {
909  if (false && IsWeakTableRow(table, 0)) {
910  // Currently buggy, but was looking promising so disabled.
911  --chances;
912  } else {
913  // We favor 2 things,
914  // 1- Adding rows that have partitioned data.
915  // 2- Better margins (to find header/footer).
916  // For better tables, we just look for multiple cells in the
917  // bottom row with data in them.
918  // For margins, the space below the last row should
919  // be better than a table with the last row removed.
920  chances = kMaxChances;
921  double max_row_height = kMaxRowSize * table->median_cell_height();
922  if ((table->space_below() * kMarginFactor >= best_below &&
923  table->space_below() >= previous_below) ||
924  (table->CountFilledCellsInRow(0) > 1 &&
925  table->row_height(0) < max_row_height)) {
926  best_box.set_bottom(bottom);
927  best_below = table->space_below();
928  best_cols = MAX(table->column_count(), best_cols);
929  found_good_border = true;
930  }
931  }
932  previous_below = table->space_below();
933  } else {
934  --chances;
935  }
936  }
937  if (chances <= 0)
938  break;
939 
940  last_bottom = bottom;
941  bottom = NextHorizontalSplit(guess_box.left(), guess_box.right(),
942  last_bottom, true);
943  }
944  if (!found_good_border)
945  return false;
946 
947  // TODO(nbeato) comments: follow modified code above... put it in a function!
948  found_good_border = false;
949  int last_top = MIN_INT32;
950  top = NextHorizontalSplit(guess_box.left(), guess_box.right(),
951  kMidGuessY + min_height_ / 2, false);
952  int previous_above = 0;
953  chances = kMaxChances;
954 
955  adjusted.set_bottom(best_box.bottom());
956  while (last_top != top) {
957  adjusted.set_top(top);
958  if (adjusted.height() >= min_height_) {
959  table->set_bounding_box(adjusted);
960  if (table->FindWhitespacedStructure() &&
961  table->column_count() >= best_cols * kRequiredColumns) {
962  int last_row = table->row_count() - 1;
963  if (false && IsWeakTableRow(table, last_row)) {
964  // Currently buggy, but was looking promising so disabled.
965  --chances;
966  } else {
967  chances = kMaxChances;
968  double max_row_height = kMaxRowSize * table->median_cell_height();
969  if ((table->space_above() * kMarginFactor >= best_above &&
970  table->space_above() >= previous_above) ||
971  (table->CountFilledCellsInRow(last_row) > 1 &&
972  table->row_height(last_row) < max_row_height)) {
973  best_box.set_top(top);
974  best_above = table->space_above();
975  best_cols = MAX(table->column_count(), best_cols);
976  found_good_border = true;
977  }
978  }
979  previous_above = table->space_above();
980  } else {
981  --chances;
982  }
983  }
984  if (chances <= 0)
985  break;
986 
987  last_top = top;
988  top = NextHorizontalSplit(guess_box.left(), guess_box.right(),
989  last_top, false);
990  }
991 
992  if (!found_good_border)
993  return false;
994 
995  // If we get here, this shouldn't happen. It can be an assert, but
996  // I haven't tested it enough to make it crash things.
997  if (best_box.null_box())
998  return false;
999 
1000  // Given the best locations, fit the box to those locations.
1001  table->set_bounding_box(best_box);
1002  return table->FindWhitespacedStructure();
1003 }
1004 
1005 // Finds the closest value to y that can safely cause a horizontal
1006 // split in the partitions.
1007 // This function has been buggy and not as reliable as I would've
1008 // liked. I suggest finding all of the splits using the
1009 // FindPartitionLocations once and then just keeping the results
1010 // of that function cached somewhere.
1011 int TableRecognizer::NextHorizontalSplit(int left, int right, int y,
1012  bool top_to_bottom) {
1014  gsearch.SetUniqueMode(true);
1015  gsearch.StartVerticalSearch(left, right, y);
1016  ColPartition* text = NULL;
1017  int last_y = y;
1018  while ((text = gsearch.NextVerticalSearch(top_to_bottom)) != NULL) {
1019  if (!text->IsTextType() || !text->IsHorizontalType())
1020  continue;
1021  if (text->bounding_box().height() > max_text_height_)
1022  continue;
1023 
1024  const TBOX& text_box = text->bounding_box();
1025  if (top_to_bottom && (last_y >= y || last_y <= text_box.top())) {
1026  last_y = MIN(last_y, text_box.bottom());
1027  continue;
1028  }
1029  if (!top_to_bottom && (last_y <= y || last_y >= text_box.bottom())) {
1030  last_y = MAX(last_y, text_box.top());
1031  continue;
1032  }
1033 
1034  return last_y;
1035  }
1036  // If none is found, we at least want to preserve the min/max,
1037  // which defines the overlap of y with the last partition in the grid.
1038  return last_y;
1039 }
1040 
1041 // Code is buggy right now. It is disabled in the calling function.
1042 // It seems like sometimes the row that is passed in is not correct
1043 // sometimes (like a phantom row is introduced). There's something going
1044 // on in the cell_y_ data member before this is called... not certain.
1046  if (!table->VerifyRowFilled(row))
1047  return false;
1048 
1049  double threshold = 0.0;
1051  threshold = table->column_count() * kGoodRowNumberOfColumnsLarge;
1052  else
1053  threshold = kGoodRowNumberOfColumnsSmall[table->column_count()];
1054 
1055  return table->CountFilledCellsInRow(row) < threshold;
1056 }
1057 
1058 } // namespace tesseract