Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
pieces.cpp
Go to the documentation of this file.
1 /* -*-C-*-
2  ********************************************************************************
3  *
4  * File: pieces.c (Formerly pieces.c)
5  * Description:
6  * Author: Mark Seaman, OCR Technology
7  * Created: Fri Oct 16 14:37:00 1987
8  * Modified: Mon May 20 12:12:35 1991 (Mark Seaman) marks@hpgrlt
9  * Language: C
10  * Package: N/A
11  * Status: Reusable Software Component
12  *
13  * (c) Copyright 1987, Hewlett-Packard Company.
14  ** Licensed under the Apache License, Version 2.0 (the "License");
15  ** you may not use this file except in compliance with the License.
16  ** You may obtain a copy of the License at
17  ** http://www.apache.org/licenses/LICENSE-2.0
18  ** Unless required by applicable law or agreed to in writing, software
19  ** distributed under the License is distributed on an "AS IS" BASIS,
20  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
21  ** See the License for the specific language governing permissions and
22  ** limitations under the License.
23  *
24  *********************************************************************************/
25 /*----------------------------------------------------------------------
26  I n c l u d e s
27 ----------------------------------------------------------------------*/
28 
29 #include "blobs.h"
30 #include "freelist.h"
31 #include "helpers.h"
32 #include "matchtab.h"
33 #include "matrix.h"
34 #include "ndminx.h"
35 #include "plotseg.h"
36 #include "ratngs.h"
37 #include "seam.h"
38 #include "states.h"
39 #include "wordclass.h"
40 #include "wordrec.h"
41 
42 // Include automatically generated configuration file if running autoconf.
43 #ifdef HAVE_CONFIG_H
44 #include "config_auto.h"
45 #endif
46 
47 /*----------------------------------------------------------------------
48  F u n c t i o n s
49 ----------------------------------------------------------------------*/
50 
51 
52 /**********************************************************************
53  * bounds_of_piece
54  *
55  * Find the bounds of the piece that will be created by joining the
56  * requested collection of pieces together.
57  **********************************************************************/
58 TBOX bounds_of_piece(TBOX *bounds, inT16 start, inT16 end) {
59  TBOX all_together = bounds[start];
60  for (int x = start + 1; x <= end; x++) {
61  all_together += bounds[x];
62  }
63  return all_together;
64 }
65 
66 
67 /**********************************************************************
68  * classify_piece
69  *
70  * Create a larger piece from a collection of smaller ones. Classify
71  * it and return the results. Take the large piece apart to leave
72  * the collection of small pieces un modified.
73  **********************************************************************/
74 namespace tesseract {
75 BLOB_CHOICE_LIST *Wordrec::classify_piece(TBLOB *pieces,
76  const DENORM& denorm,
77  SEAMS seams,
78  inT16 start,
79  inT16 end,
80  BlamerBundle *blamer_bundle) {
81  BLOB_CHOICE_LIST *choices;
82  TBLOB *blob;
83  inT16 x;
84 
85  join_pieces(pieces, seams, start, end);
86  for (blob = pieces, x = 0; x < start; x++) {
87  blob = blob->next;
88  }
89  choices = classify_blob(blob, denorm, "pieces:", White, blamer_bundle);
90 
91  break_pieces(blob, seams, start, end);
92 #ifndef GRAPHICS_DISABLED
94  STATE current_state;
95  SEARCH_STATE chunk_groups;
96  set_n_ones (&current_state, array_count(seams));
97  chunk_groups = bin_to_chunks(&current_state, array_count(seams));
98  display_segmentation(pieces, chunk_groups);
100  memfree(chunk_groups);
101  }
102 #endif
103 
104  return (choices);
105 }
106 
107 template<class BLOB_CHOICE>
108 int SortByUnicharID(const void *void1, const void *void2) {
109  const BLOB_CHOICE *p1 = *reinterpret_cast<const BLOB_CHOICE * const *>(void1);
110  const BLOB_CHOICE *p2 = *reinterpret_cast<const BLOB_CHOICE * const *>(void2);
111 
112  return p1->unichar_id() - p2->unichar_id();
113 }
114 
115 template<class BLOB_CHOICE>
116 int SortByRating(const void *void1, const void *void2) {
117  const BLOB_CHOICE *p1 = *reinterpret_cast<const BLOB_CHOICE * const *>(void1);
118  const BLOB_CHOICE *p2 = *reinterpret_cast<const BLOB_CHOICE * const *>(void2);
119 
120  if (p1->rating() < p2->rating())
121  return 1;
122  return -1;
123 }
124 
125 
126 /**********************************************************************
127  * fill_filtered_fragment_list
128  *
129  * Filter the fragment list so that the filtered_choices only contain
130  * fragments that are in the correct position. choices is the list
131  * that we are going to filter. fragment_pos is the position in the
132  * fragment that we are looking for and num_frag_parts is the the
133  * total number of pieces. The result will be appended to
134  * filtered_choices.
135  **********************************************************************/
136 void Wordrec::fill_filtered_fragment_list(BLOB_CHOICE_LIST *choices,
137  int fragment_pos,
138  int num_frag_parts,
139  BLOB_CHOICE_LIST *filtered_choices) {
140  BLOB_CHOICE_IT filtered_choices_it(filtered_choices);
141  BLOB_CHOICE_IT choices_it(choices);
142 
143  for (choices_it.mark_cycle_pt(); !choices_it.cycled_list();
144  choices_it.forward()) {
145  UNICHAR_ID choice_unichar_id = choices_it.data()->unichar_id();
146  const CHAR_FRAGMENT *frag = unicharset.get_fragment(choice_unichar_id);
147 
148  if (frag != NULL && frag->get_pos() == fragment_pos &&
149  frag->get_total() == num_frag_parts) {
150  // Recover the unichar_id of the unichar that this fragment is
151  // a part of
152  BLOB_CHOICE *b = new BLOB_CHOICE(*choices_it.data());
153  int original_unichar = unicharset.unichar_to_id(frag->get_unichar());
154  b->set_unichar_id(original_unichar);
155  filtered_choices_it.add_to_end(b);
156  }
157  }
158 
159  filtered_choices->sort(SortByUnicharID<BLOB_CHOICE>);
160 }
161 
162 
163 /**********************************************************************
164  * merge_and_put_fragment_lists
165  *
166  * Merge the fragment lists in choice_lists and append it to the
167  * ratings matrix.
168  **********************************************************************/
170  inT16 num_frag_parts,
171  BLOB_CHOICE_LIST *choice_lists,
172  MATRIX *ratings) {
173  BLOB_CHOICE_IT *choice_lists_it = new BLOB_CHOICE_IT[num_frag_parts];
174 
175  for (int i = 0; i < num_frag_parts; i++) {
176  choice_lists_it[i].set_to_list(&choice_lists[i]);
177  choice_lists_it[i].mark_cycle_pt();
178  }
179 
180  BLOB_CHOICE_LIST *merged_choice = ratings->get(row, column);
181  if (merged_choice == NULL)
182  merged_choice = new BLOB_CHOICE_LIST;
183 
184  bool end_of_list = false;
185  BLOB_CHOICE_IT merged_choice_it(merged_choice);
186  while (!end_of_list) {
187  // Find the maximum unichar_id of the current entry the iterators
188  // are pointing at
189  UNICHAR_ID max_unichar_id = choice_lists_it[0].data()->unichar_id();
190  int max_list = 0;
191  for (int i = 0; i < num_frag_parts; i++) {
192  UNICHAR_ID unichar_id = choice_lists_it[i].data()->unichar_id();
193  if (max_unichar_id < unichar_id) {
194  max_unichar_id = unichar_id;
195  max_list = i;
196  }
197  }
198 
199  // Move the each iterators until it gets to an entry that has a
200  // value greater than or equal to max_unichar_id
201  for (int i = 0; i < num_frag_parts; i++) {
202  UNICHAR_ID unichar_id = choice_lists_it[i].data()->unichar_id();
203  while (!choice_lists_it[i].cycled_list() &&
204  unichar_id < max_unichar_id) {
205  choice_lists_it[i].forward();
206  unichar_id = choice_lists_it[i].data()->unichar_id();
207  }
208  if (choice_lists_it[i].cycled_list()) {
209  end_of_list = true;
210  break;
211  }
212  }
213 
214  if (end_of_list)
215  break;
216 
217  // Checks if the fragments are parts of the same character
218  UNICHAR_ID first_unichar_id = choice_lists_it[0].data()->unichar_id();
219  bool same_unichar = true;
220  for (int i = 1; i < num_frag_parts; i++) {
221  UNICHAR_ID unichar_id = choice_lists_it[i].data()->unichar_id();
222  if (unichar_id != first_unichar_id) {
223  same_unichar = false;
224  break;
225  }
226  }
227 
228  if (same_unichar) {
229  // Add the merged character to the result
230  UNICHAR_ID merged_unichar_id = first_unichar_id;
231  inT16 merged_fontinfo_id = choice_lists_it[0].data()->fontinfo_id();
232  inT16 merged_fontinfo_id2 = choice_lists_it[0].data()->fontinfo_id2();
233  inT16 merged_min_xheight = choice_lists_it[0].data()->min_xheight();
234  inT16 merged_max_xheight = choice_lists_it[0].data()->max_xheight();
235  int merged_script_id = choice_lists_it[0].data()->script_id();
236  bool merged_adapted = choice_lists_it[0].data()->adapted();
237 
238  float merged_rating = 0, merged_certainty = 0;
239  for (int i = 0; i < num_frag_parts; i++) {
240  float rating = choice_lists_it[i].data()->rating();
241  float certainty = choice_lists_it[i].data()->certainty();
242 
243  if (i == 0 || certainty < merged_certainty)
244  merged_certainty = certainty;
245  merged_rating += rating;
246 
247  choice_lists_it[i].forward();
248  if (choice_lists_it[i].cycled_list())
249  end_of_list = true;
250  IntersectRange(choice_lists_it[i].data()->min_xheight(),
251  choice_lists_it[i].data()->max_xheight(),
252  &merged_min_xheight, &merged_max_xheight);
253  }
254 
255  merged_choice_it.add_to_end(new BLOB_CHOICE(merged_unichar_id,
256  merged_rating,
257  merged_certainty,
258  merged_fontinfo_id,
259  merged_fontinfo_id2,
260  merged_script_id,
261  merged_min_xheight,
262  merged_max_xheight,
263  merged_adapted));
264  }
265  }
266 
268  print_ratings_list("Merged Fragments", merged_choice,
269  unicharset);
270 
271  if (merged_choice->empty())
272  delete merged_choice;
273  else
274  ratings->put(row, column, merged_choice);
275 
276  delete [] choice_lists_it;
277 }
278 
279 
280 /**********************************************************************
281  * get_fragment_lists
282  *
283  * Recursively go through the ratings matrix to find lists of fragments
284  * to be merged in the function merge_and_put_fragment_lists.
285  * current_frag is the postion of the piece we are looking for.
286  * current_row is the row in the rating matrix we are currently at.
287  * start is the row we started initially, so that we can know where
288  * to append the results to the matrix. num_frag_parts is the total
289  * number of pieces we are looking for and num_blobs is the size of the
290  * ratings matrix.
291  **********************************************************************/
292 void Wordrec::get_fragment_lists(inT16 current_frag, inT16 current_row,
293  inT16 start, inT16 num_frag_parts,
294  inT16 num_blobs, MATRIX *ratings,
295  BLOB_CHOICE_LIST *choice_lists) {
296  if (current_frag == num_frag_parts) {
297  merge_and_put_fragment_lists(start, current_row - 1, num_frag_parts,
298  choice_lists, ratings);
299  return;
300  }
301 
302  for (inT16 x = current_row; x < num_blobs; x++) {
303  BLOB_CHOICE_LIST *choices = ratings->get(current_row, x);
304  if (choices == NULL)
305  continue;
306 
307  fill_filtered_fragment_list(choices, current_frag, num_frag_parts,
308  &choice_lists[current_frag]);
309  if (!choice_lists[current_frag].empty()) {
310  get_fragment_lists(current_frag + 1, x + 1, start, num_frag_parts,
311  num_blobs, ratings, choice_lists);
312  choice_lists[current_frag].clear();
313  }
314  }
315 }
316 
317 
318 /**********************************************************************
319  * merge_fragments
320  *
321  * Try to merge fragments in the ratings matrix and put the result in
322  * the corresponding row and column
323  **********************************************************************/
324 void Wordrec::merge_fragments(MATRIX *ratings, inT16 num_blobs) {
325  BLOB_CHOICE_LIST choice_lists[CHAR_FRAGMENT::kMaxChunks];
326  for (inT16 start = 0; start < num_blobs; start++) {
327  for (int frag_parts = 2; frag_parts <= CHAR_FRAGMENT::kMaxChunks;
328  frag_parts++) {
329  get_fragment_lists(0, start, start, frag_parts, num_blobs,
330  ratings, choice_lists);
331  }
332  }
333 
334  // Delete fragments from the rating matrix
335  for (inT16 x = 0; x < num_blobs; x++) {
336  for (inT16 y = x; y < num_blobs; y++) {
337  BLOB_CHOICE_LIST *choices = ratings->get(x, y);
338  if (choices != NULL) {
339  BLOB_CHOICE_IT choices_it(choices);
340  for (choices_it.mark_cycle_pt(); !choices_it.cycled_list();
341  choices_it.forward()) {
342  UNICHAR_ID choice_unichar_id = choices_it.data()->unichar_id();
343  const CHAR_FRAGMENT *frag =
344  unicharset.get_fragment(choice_unichar_id);
345  if (frag != NULL)
346  delete choices_it.extract();
347  }
348  }
349  }
350  }
351 }
352 
353 
354 /**********************************************************************
355  * get_piece_rating
356  *
357  * Check to see if this piece has already been classified. If it has
358  * return that rating. Otherwise build the piece from the smaller
359  * pieces, classify it, store the rating for later, and take the piece
360  * apart again.
361  **********************************************************************/
362 BLOB_CHOICE_LIST *Wordrec::get_piece_rating(MATRIX *ratings,
363  TBLOB *blobs,
364  const DENORM& denorm,
365  SEAMS seams,
366  inT16 start,
367  inT16 end,
368  BlamerBundle *blamer_bundle) {
369  BLOB_CHOICE_LIST *choices = ratings->get(start, end);
370  if (choices == NOT_CLASSIFIED) {
371  choices = classify_piece(blobs,
372  denorm,
373  seams,
374  start,
375  end,
376  blamer_bundle);
377  ratings->put(start, end, choices);
378  if (wordrec_debug_level > 1) {
379  tprintf("get_piece_rating(): updated ratings matrix\n");
380  ratings->print(getDict().getUnicharset());
381  }
382  }
383  return (choices);
384 }
385 
386 
387 /**********************************************************************
388  * record_blob_bounds
389  *
390  * Set up and initialize an array that holds the bounds of a set of
391  * blobs. Caller should delete[] the array.
392  **********************************************************************/
394  int nblobs = count_blobs(blobs);
395  TBOX *bboxes = new TBOX[nblobs];
396 
397  inT16 x = 0;
398  for (TBLOB* blob = blobs; blob != NULL; blob = blob->next) {
399  bboxes[x] = blob->bounding_box();
400  x++;
401  }
402  return bboxes;
403 }
404 
405 
406 /**********************************************************************
407  * record_piece_ratings
408  *
409  * Save the choices for all the pieces that have been classified into
410  * a matrix that can be used to look them up later. A two dimensional
411  * matrix is created. The indices correspond to the starting and
412  * ending initial piece number.
413  **********************************************************************/
415  inT16 num_blobs = count_blobs(blobs);
416  TBOX *bounds = record_blob_bounds(blobs);
417  MATRIX *ratings = new MATRIX(num_blobs);
418 
419  for (int x = 0; x < num_blobs; x++) {
420  for (int y = x; y < num_blobs; y++) {
421  TBOX piecebox = bounds_of_piece(bounds, x, y);
422  BLOB_CHOICE_LIST *choices = blob_match_table.get_match_by_box(piecebox);
423  if (choices != NULL) {
424  ratings->put(x, y, choices);
425  }
426  }
427  }
428 
430  merge_fragments(ratings, num_blobs);
431 
432  delete []bounds;
433  return ratings;
434 }
435 
436 } // namespace tesseract