tesseract  3.04.00
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
commontraining.cpp
Go to the documentation of this file.
1 // Copyright 2008 Google Inc. All Rights Reserved.
2 // Author: scharron@google.com (Samuel Charron)
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 // http://www.apache.org/licenses/LICENSE-2.0
8 // Unless required by applicable law or agreed to in writing, software
9 // distributed under the License is distributed on an "AS IS" BASIS,
10 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 // See the License for the specific language governing permissions and
12 // limitations under the License.
13 
14 #include "commontraining.h"
15 
16 #include "allheaders.h"
17 #include "ccutil.h"
18 #include "classify.h"
19 #include "cluster.h"
20 #include "clusttool.h"
21 #include "efio.h"
22 #include "emalloc.h"
23 #include "featdefs.h"
24 #include "fontinfo.h"
25 #include "freelist.h"
26 #include "globals.h"
27 #include "intfeaturespace.h"
28 #include "mastertrainer.h"
29 #include "mf.h"
30 #include "ndminx.h"
31 #include "oldlist.h"
32 #include "params.h"
33 #include "shapetable.h"
34 #include "tessdatamanager.h"
35 #include "tessopt.h"
36 #include "tprintf.h"
37 #include "unicity_table.h"
38 
39 #include <math.h>
40 
41 using tesseract::CCUtil;
46 
47 // Global Variables.
48 
49 // global variable to hold configuration parameters to control clustering
50 // -M 0.625 -B 0.05 -I 1.0 -C 1e-6.
51 CLUSTERCONFIG Config = { elliptical, 0.625, 0.05, 1.0, 1e-6, 0 };
54 
55 INT_PARAM_FLAG(debug_level, 0, "Level of Trainer debugging");
56 INT_PARAM_FLAG(load_images, 0, "Load images with tr files");
57 STRING_PARAM_FLAG(configfile, "", "File to load more configs from");
58 STRING_PARAM_FLAG(D, "", "Directory to write output files to");
59 STRING_PARAM_FLAG(F, "font_properties", "File listing font properties");
60 STRING_PARAM_FLAG(X, "", "File listing font xheights");
61 STRING_PARAM_FLAG(U, "unicharset", "File to load unicharset from");
62 STRING_PARAM_FLAG(O, "", "File to write unicharset to");
63 STRING_PARAM_FLAG(T, "", "File to load trainer from");
64 STRING_PARAM_FLAG(output_trainer, "", "File to write trainer to");
65 STRING_PARAM_FLAG(test_ch, "", "UTF8 test character string");
66 DOUBLE_PARAM_FLAG(clusterconfig_min_samples_fraction, Config.MinSamples,
67  "Min number of samples per proto as % of total");
68 DOUBLE_PARAM_FLAG(clusterconfig_max_illegal, Config.MaxIllegal,
69  "Max percentage of samples in a cluster which have more"
70  " than 1 feature in that cluster");
71 DOUBLE_PARAM_FLAG(clusterconfig_independence, Config.Independence,
72  "Desired independence between dimensions");
73 DOUBLE_PARAM_FLAG(clusterconfig_confidence, Config.Confidence,
74  "Desired confidence in prototypes created");
75 
76 /*
77  ** Parameters:
78  ** argc number of command line arguments to parse
79  ** argv command line arguments
80  ** Globals:
81  ** Config current clustering parameters
82  ** Operation:
83  ** This routine parses the command line arguments that were
84 ** passed to the program and uses them to set relevant
85  ** training-related global parameters
86  ** Return: none
87  ** Exceptions: Illegal options terminate the program.
88  */
89 void ParseArguments(int* argc, char ***argv) {
90  STRING usage;
91  if (*argc) {
92  usage += (*argv)[0];
93  }
94  usage += " [.tr files ...]";
95  tesseract::ParseCommandLineFlags(usage.c_str(), argc, argv, true);
96  // Record the index of the first non-flag argument to 1, since we set
97  // remove_flags to true when parsing the flags.
98  tessoptind = 1;
99  // Set some global values based on the flags.
100  Config.MinSamples =
101  MAX(0.0, MIN(1.0, double(FLAGS_clusterconfig_min_samples_fraction)));
102  Config.MaxIllegal =
103  MAX(0.0, MIN(1.0, double(FLAGS_clusterconfig_max_illegal)));
104  Config.Independence =
105  MAX(0.0, MIN(1.0, double(FLAGS_clusterconfig_independence)));
106  Config.Confidence =
107  MAX(0.0, MIN(1.0, double(FLAGS_clusterconfig_confidence)));
108  // Set additional parameters from config file if specified.
109  if (!FLAGS_configfile.empty()) {
111  FLAGS_configfile.c_str(),
113  ccutil.params());
114  }
115 }
116 
117 namespace tesseract {
118 // Helper loads shape table from the given file.
119 ShapeTable* LoadShapeTable(const STRING& file_prefix) {
120  ShapeTable* shape_table = NULL;
121  STRING shape_table_file = file_prefix;
122  shape_table_file += kShapeTableFileSuffix;
123  FILE* shape_fp = fopen(shape_table_file.string(), "rb");
124  if (shape_fp != NULL) {
125  shape_table = new ShapeTable;
126  if (!shape_table->DeSerialize(false, shape_fp)) {
127  delete shape_table;
128  shape_table = NULL;
129  tprintf("Error: Failed to read shape table %s\n",
130  shape_table_file.string());
131  } else {
132  int num_shapes = shape_table->NumShapes();
133  tprintf("Read shape table %s of %d shapes\n",
134  shape_table_file.string(), num_shapes);
135  }
136  fclose(shape_fp);
137  } else {
138  tprintf("Warning: No shape table file present: %s\n",
139  shape_table_file.string());
140  }
141  return shape_table;
142 }
143 
144 // Helper to write the shape_table.
145 void WriteShapeTable(const STRING& file_prefix, const ShapeTable& shape_table) {
146  STRING shape_table_file = file_prefix;
147  shape_table_file += kShapeTableFileSuffix;
148  FILE* fp = fopen(shape_table_file.string(), "wb");
149  if (fp != NULL) {
150  if (!shape_table.Serialize(fp)) {
151  fprintf(stderr, "Error writing shape table: %s\n",
152  shape_table_file.string());
153  }
154  fclose(fp);
155  } else {
156  fprintf(stderr, "Error creating shape table: %s\n",
157  shape_table_file.string());
158  }
159 }
160 
161 // Creates a MasterTrainer and loads the training data into it:
162 // Initializes feature_defs and IntegerFX.
163 // Loads the shape_table if shape_table != NULL.
164 // Loads initial unicharset from -U command-line option.
165 // If FLAGS_T is set, loads the majority of data from there, else:
166 // Loads font info from -F option.
167 // Loads xheights from -X option.
168 // Loads samples from .tr files in remaining command-line args.
169 // Deletes outliers and computes canonical samples.
170 // If FLAGS_output_trainer is set, saves the trainer for future use.
171 // Computes canonical and cloud features.
172 // If shape_table is not NULL, but failed to load, make a fake flat one,
173 // as shape clustering was not run.
174 MasterTrainer* LoadTrainingData(int argc, const char* const * argv,
175  bool replication,
176  ShapeTable** shape_table,
177  STRING* file_prefix) {
178  InitFeatureDefs(&feature_defs);
179  InitIntegerFX();
180  *file_prefix = "";
181  if (!FLAGS_D.empty()) {
182  *file_prefix += FLAGS_D.c_str();
183  *file_prefix += "/";
184  }
185  // If we are shape clustering (NULL shape_table) or we successfully load
186  // a shape_table written by a previous shape clustering, then
187  // shape_analysis will be true, meaning that the MasterTrainer will replace
188  // some members of the unicharset with their fragments.
189  bool shape_analysis = false;
190  if (shape_table != NULL) {
191  *shape_table = LoadShapeTable(*file_prefix);
192  if (*shape_table != NULL)
193  shape_analysis = true;
194  } else {
195  shape_analysis = true;
196  }
198  shape_analysis,
199  replication,
200  FLAGS_debug_level);
201  IntFeatureSpace fs;
203  if (FLAGS_T.empty()) {
204  trainer->LoadUnicharset(FLAGS_U.c_str());
205  // Get basic font information from font_properties.
206  if (!FLAGS_F.empty()) {
207  if (!trainer->LoadFontInfo(FLAGS_F.c_str())) {
208  delete trainer;
209  return NULL;
210  }
211  }
212  if (!FLAGS_X.empty()) {
213  if (!trainer->LoadXHeights(FLAGS_X.c_str())) {
214  delete trainer;
215  return NULL;
216  }
217  }
218  trainer->SetFeatureSpace(fs);
219  const char* page_name;
220  // Load training data from .tr files on the command line.
221  while ((page_name = GetNextFilename(argc, argv)) != NULL) {
222  tprintf("Reading %s ...\n", page_name);
223  trainer->ReadTrainingSamples(page_name, feature_defs, false);
224 
225  // If there is a file with [lang].[fontname].exp[num].fontinfo present,
226  // read font spacing information in to fontinfo_table.
227  int pagename_len = strlen(page_name);
228  char *fontinfo_file_name = new char[pagename_len + 7];
229  strncpy(fontinfo_file_name, page_name, pagename_len - 2); // remove "tr"
230  strcpy(fontinfo_file_name + pagename_len - 2, "fontinfo"); // +"fontinfo"
231  trainer->AddSpacingInfo(fontinfo_file_name);
232  delete[] fontinfo_file_name;
233 
234  // Load the images into memory if required by the classifier.
235  if (FLAGS_load_images) {
236  STRING image_name = page_name;
237  // Chop off the tr and replace with tif. Extension must be tif!
238  image_name.truncate_at(image_name.length() - 2);
239  image_name += "tif";
240  trainer->LoadPageImages(image_name.string());
241  }
242  }
243  trainer->PostLoadCleanup();
244  // Write the master trainer if required.
245  if (!FLAGS_output_trainer.empty()) {
246  FILE* fp = fopen(FLAGS_output_trainer.c_str(), "wb");
247  if (fp == NULL) {
248  tprintf("Can't create saved trainer data!\n");
249  } else {
250  trainer->Serialize(fp);
251  fclose(fp);
252  }
253  }
254  } else {
255  bool success = false;
256  tprintf("Loading master trainer from file:%s\n",
257  FLAGS_T.c_str());
258  FILE* fp = fopen(FLAGS_T.c_str(), "rb");
259  if (fp == NULL) {
260  tprintf("Can't read file %s to initialize master trainer\n",
261  FLAGS_T.c_str());
262  } else {
263  success = trainer->DeSerialize(false, fp);
264  fclose(fp);
265  }
266  if (!success) {
267  tprintf("Deserialize of master trainer failed!\n");
268  delete trainer;
269  return NULL;
270  }
271  trainer->SetFeatureSpace(fs);
272  }
273  trainer->PreTrainingSetup();
274  if (!FLAGS_O.empty() &&
275  !trainer->unicharset().save_to_file(FLAGS_O.c_str())) {
276  fprintf(stderr, "Failed to save unicharset to file %s\n", FLAGS_O.c_str());
277  delete trainer;
278  return NULL;
279  }
280  if (shape_table != NULL) {
281  // If we previously failed to load a shapetable, then shape clustering
282  // wasn't run so make a flat one now.
283  if (*shape_table == NULL) {
284  *shape_table = new ShapeTable;
285  trainer->SetupFlatShapeTable(*shape_table);
286  tprintf("Flat shape table summary: %s\n",
287  (*shape_table)->SummaryStr().string());
288  }
289  (*shape_table)->set_unicharset(trainer->unicharset());
290  }
291  return trainer;
292 }
293 
294 } // namespace tesseract.
295 
296 /*---------------------------------------------------------------------------*/
297 const char *GetNextFilename(int argc, const char* const * argv) {
298  /*
299  ** Parameters: none
300  ** Globals:
301  ** tessoptind defined by tessopt sys call
302  ** Operation:
303  ** This routine returns the next command line argument. If
304  ** there are no remaining command line arguments, it returns
305  ** NULL. This routine should only be called after all option
306  ** arguments have been parsed and removed with ParseArguments.
307  ** Return: Next command line argument or NULL.
308  ** Exceptions: none
309  ** History: Fri Aug 18 09:34:12 1989, DSJ, Created.
310  */
311  if (tessoptind < argc)
312  return argv[tessoptind++];
313  else
314  return NULL;
315 } /* GetNextFilename */
316 
317 
318 
319 /*---------------------------------------------------------------------------*/
321  LIST List,
322  char *Label)
323 
324 /*
325  ** Parameters:
326  ** List list to search
327  ** Label label to search for
328  ** Globals: none
329  ** Operation:
330  ** This routine searches thru a list of labeled lists to find
331  ** a list with the specified label. If a matching labeled list
332  ** cannot be found, NULL is returned.
333  ** Return: Labeled list with the specified Label or NULL.
334  ** Exceptions: none
335  ** History: Fri Aug 18 15:57:41 1989, DSJ, Created.
336  */
337 
338 {
339  LABELEDLIST LabeledList;
340 
341  iterate (List)
342  {
343  LabeledList = (LABELEDLIST) first_node (List);
344  if (strcmp (LabeledList->Label, Label) == 0)
345  return (LabeledList);
346  }
347  return (NULL);
348 
349 } /* FindList */
350 
351 /*---------------------------------------------------------------------------*/
353  const char *Label)
354 
355 /*
356  ** Parameters:
357  ** Label label for new list
358  ** Globals: none
359  ** Operation:
360  ** This routine allocates a new, empty labeled list and gives
361  ** it the specified label.
362  ** Return: New, empty labeled list.
363  ** Exceptions: none
364  ** History: Fri Aug 18 16:08:46 1989, DSJ, Created.
365  */
366 
367 {
368  LABELEDLIST LabeledList;
369 
370  LabeledList = (LABELEDLIST) Emalloc (sizeof (LABELEDLISTNODE));
371  LabeledList->Label = (char*)Emalloc (strlen (Label)+1);
372  strcpy (LabeledList->Label, Label);
373  LabeledList->List = NIL_LIST;
374  LabeledList->SampleCount = 0;
375  LabeledList->font_sample_count = 0;
376  return (LabeledList);
377 
378 } /* NewLabeledList */
379 
380 /*---------------------------------------------------------------------------*/
381 // TODO(rays) This is now used only by cntraining. Convert cntraining to use
382 // the new method or get rid of it entirely.
383 void ReadTrainingSamples(const FEATURE_DEFS_STRUCT& feature_defs,
384  const char *feature_name, int max_samples,
385  UNICHARSET* unicharset,
386  FILE* file, LIST* training_samples) {
387 /*
388 ** Parameters:
389 ** file open text file to read samples from
390 ** Globals: none
391 ** Operation:
392 ** This routine reads training samples from a file and
393 ** places them into a data structure which organizes the
394 ** samples by FontName and CharName. It then returns this
395 ** data structure.
396 ** Return: none
397 ** Exceptions: none
398 ** History: Fri Aug 18 13:11:39 1989, DSJ, Created.
399 ** Tue May 17 1998 simplifications to structure, illiminated
400 ** font, and feature specification levels of structure.
401 */
402  char buffer[2048];
403  char unichar[UNICHAR_LEN + 1];
404  LABELEDLIST char_sample;
405  FEATURE_SET feature_samples;
406  CHAR_DESC char_desc;
407  int i;
408  int feature_type = ShortNameToFeatureType(feature_defs, feature_name);
409  // Zero out the font_sample_count for all the classes.
410  LIST it = *training_samples;
411  iterate(it) {
412  char_sample = reinterpret_cast<LABELEDLIST>(first_node(it));
413  char_sample->font_sample_count = 0;
414  }
415 
416  while (fgets(buffer, 2048, file) != NULL) {
417  if (buffer[0] == '\n')
418  continue;
419 
420  sscanf(buffer, "%*s %s", unichar);
421  if (unicharset != NULL && !unicharset->contains_unichar(unichar)) {
422  unicharset->unichar_insert(unichar);
423  if (unicharset->size() > MAX_NUM_CLASSES) {
424  tprintf("Error: Size of unicharset in training is "
425  "greater than MAX_NUM_CLASSES\n");
426  exit(1);
427  }
428  }
429  char_sample = FindList(*training_samples, unichar);
430  if (char_sample == NULL) {
431  char_sample = NewLabeledList(unichar);
432  *training_samples = push(*training_samples, char_sample);
433  }
434  char_desc = ReadCharDescription(feature_defs, file);
435  feature_samples = char_desc->FeatureSets[feature_type];
436  if (char_sample->font_sample_count < max_samples || max_samples <= 0) {
437  char_sample->List = push(char_sample->List, feature_samples);
438  char_sample->SampleCount++;
439  char_sample->font_sample_count++;
440  } else {
441  FreeFeatureSet(feature_samples);
442  }
443  for (i = 0; i < char_desc->NumFeatureSets; i++) {
444  if (feature_type != i)
445  FreeFeatureSet(char_desc->FeatureSets[i]);
446  }
447  free(char_desc);
448  }
449 } // ReadTrainingSamples
450 
451 
452 /*---------------------------------------------------------------------------*/
453 void FreeTrainingSamples(LIST CharList) {
454 /*
455  ** Parameters:
456  ** FontList list of all fonts in document
457  ** Globals: none
458  ** Operation:
459  ** This routine deallocates all of the space allocated to
460  ** the specified list of training samples.
461  ** Return: none
462  ** Exceptions: none
463  ** History: Fri Aug 18 17:44:27 1989, DSJ, Created.
464  */
465  LABELEDLIST char_sample;
466  FEATURE_SET FeatureSet;
467  LIST FeatureList;
468 
469 
470  iterate(CharList) { /* iterate thru all of the fonts */
471  char_sample = (LABELEDLIST) first_node(CharList);
472  FeatureList = char_sample->List;
473  iterate(FeatureList) { /* iterate thru all of the classes */
474  FeatureSet = (FEATURE_SET) first_node(FeatureList);
475  FreeFeatureSet(FeatureSet);
476  }
477  FreeLabeledList(char_sample);
478  }
479  destroy(CharList);
480 } /* FreeTrainingSamples */
481 
482 /*---------------------------------------------------------------------------*/
483 void FreeLabeledList(LABELEDLIST LabeledList) {
484 /*
485  ** Parameters:
486  ** LabeledList labeled list to be freed
487  ** Globals: none
488  ** Operation:
489  ** This routine deallocates all of the memory consumed by
490  ** a labeled list. It does not free any memory which may be
491  ** consumed by the items in the list.
492  ** Return: none
493  ** Exceptions: none
494  ** History: Fri Aug 18 17:52:45 1989, DSJ, Created.
495  */
496  destroy(LabeledList->List);
497  free(LabeledList->Label);
498  free(LabeledList);
499 } /* FreeLabeledList */
500 
501 /*---------------------------------------------------------------------------*/
503  LABELEDLIST char_sample,
504  const char* program_feature_type) {
505 /*
506  ** Parameters:
507  ** char_sample: LABELEDLIST that holds all the feature information for a
508  ** given character.
509  ** Globals:
510  ** None
511  ** Operation:
512  ** This routine reads samples from a LABELEDLIST and enters
513  ** those samples into a clusterer data structure. This
514  ** data structure is then returned to the caller.
515  ** Return:
516  ** Pointer to new clusterer data structure.
517  ** Exceptions:
518  ** None
519  ** History:
520  ** 8/16/89, DSJ, Created.
521  */
522  uinT16 N;
523  int i, j;
524  FLOAT32 *Sample = NULL;
525  CLUSTERER *Clusterer;
526  inT32 CharID;
527  LIST FeatureList = NULL;
528  FEATURE_SET FeatureSet = NULL;
529 
530  int desc_index = ShortNameToFeatureType(FeatureDefs, program_feature_type);
531  N = FeatureDefs.FeatureDesc[desc_index]->NumParams;
532  Clusterer = MakeClusterer(N, FeatureDefs.FeatureDesc[desc_index]->ParamDesc);
533 
534  FeatureList = char_sample->List;
535  CharID = 0;
536  iterate(FeatureList) {
537  FeatureSet = (FEATURE_SET) first_node(FeatureList);
538  for (i = 0; i < FeatureSet->MaxNumFeatures; i++) {
539  if (Sample == NULL)
540  Sample = (FLOAT32 *)Emalloc(N * sizeof(FLOAT32));
541  for (j = 0; j < N; j++)
542  Sample[j] = FeatureSet->Features[i]->Params[j];
543  MakeSample (Clusterer, Sample, CharID);
544  }
545  CharID++;
546  }
547  if ( Sample != NULL ) free( Sample );
548  return( Clusterer );
549 
550 } /* SetUpForClustering */
551 
552 /*------------------------------------------------------------------------*/
553 void MergeInsignificantProtos(LIST ProtoList, const char* label,
554  CLUSTERER *Clusterer, CLUSTERCONFIG *Config) {
555  PROTOTYPE *Prototype;
556  bool debug = strcmp(FLAGS_test_ch.c_str(), label) == 0;
557 
558  LIST pProtoList = ProtoList;
559  iterate(pProtoList) {
560  Prototype = (PROTOTYPE *) first_node (pProtoList);
561  if (Prototype->Significant || Prototype->Merged)
562  continue;
563  FLOAT32 best_dist = 0.125;
564  PROTOTYPE* best_match = NULL;
565  // Find the nearest alive prototype.
566  LIST list_it = ProtoList;
567  iterate(list_it) {
568  PROTOTYPE* test_p = (PROTOTYPE *) first_node (list_it);
569  if (test_p != Prototype && !test_p->Merged) {
570  FLOAT32 dist = ComputeDistance(Clusterer->SampleSize,
571  Clusterer->ParamDesc,
572  Prototype->Mean, test_p->Mean);
573  if (dist < best_dist) {
574  best_match = test_p;
575  best_dist = dist;
576  }
577  }
578  }
579  if (best_match != NULL && !best_match->Significant) {
580  if (debug)
581  tprintf("Merging red clusters (%d+%d) at %g,%g and %g,%g\n",
582  best_match->NumSamples, Prototype->NumSamples,
583  best_match->Mean[0], best_match->Mean[1],
584  Prototype->Mean[0], Prototype->Mean[1]);
585  best_match->NumSamples = MergeClusters(Clusterer->SampleSize,
586  Clusterer->ParamDesc,
587  best_match->NumSamples,
588  Prototype->NumSamples,
589  best_match->Mean,
590  best_match->Mean, Prototype->Mean);
591  Prototype->NumSamples = 0;
592  Prototype->Merged = 1;
593  } else if (best_match != NULL) {
594  if (debug)
595  tprintf("Red proto at %g,%g matched a green one at %g,%g\n",
596  Prototype->Mean[0], Prototype->Mean[1],
597  best_match->Mean[0], best_match->Mean[1]);
598  Prototype->Merged = 1;
599  }
600  }
601  // Mark significant those that now have enough samples.
602  int min_samples = (inT32) (Config->MinSamples * Clusterer->NumChar);
603  pProtoList = ProtoList;
604  iterate(pProtoList) {
605  Prototype = (PROTOTYPE *) first_node (pProtoList);
606  // Process insignificant protos that do not match a green one
607  if (!Prototype->Significant && Prototype->NumSamples >= min_samples &&
608  !Prototype->Merged) {
609  if (debug)
610  tprintf("Red proto at %g,%g becoming green\n",
611  Prototype->Mean[0], Prototype->Mean[1]);
612  Prototype->Significant = true;
613  }
614  }
615 } /* MergeInsignificantProtos */
616 
617 /*-----------------------------------------------------------------------------*/
619  LIST ProtoList)
620 {
621  PROTOTYPE* Prototype;
622 
623  iterate(ProtoList)
624  {
625  Prototype = (PROTOTYPE *) first_node (ProtoList);
626  if(Prototype->Variance.Elliptical != NULL)
627  {
628  memfree(Prototype->Variance.Elliptical);
629  Prototype->Variance.Elliptical = NULL;
630  }
631  if(Prototype->Magnitude.Elliptical != NULL)
632  {
633  memfree(Prototype->Magnitude.Elliptical);
634  Prototype->Magnitude.Elliptical = NULL;
635  }
636  if(Prototype->Weight.Elliptical != NULL)
637  {
638  memfree(Prototype->Weight.Elliptical);
639  Prototype->Weight.Elliptical = NULL;
640  }
641  }
642 }
643 
644 /*------------------------------------------------------------------------*/
646  LIST ProtoList,
647  BOOL8 KeepSigProtos,
648  BOOL8 KeepInsigProtos,
649  int N)
650 
651 {
652  LIST NewProtoList = NIL_LIST;
653  LIST pProtoList;
654  PROTOTYPE* Proto;
655  PROTOTYPE* NewProto;
656  int i;
657 
658  pProtoList = ProtoList;
659  iterate(pProtoList)
660  {
661  Proto = (PROTOTYPE *) first_node (pProtoList);
662  if ((Proto->Significant && KeepSigProtos) ||
663  (!Proto->Significant && KeepInsigProtos))
664  {
665  NewProto = (PROTOTYPE *)Emalloc(sizeof(PROTOTYPE));
666 
667  NewProto->Mean = (FLOAT32 *)Emalloc(N * sizeof(FLOAT32));
668  NewProto->Significant = Proto->Significant;
669  NewProto->Style = Proto->Style;
670  NewProto->NumSamples = Proto->NumSamples;
671  NewProto->Cluster = NULL;
672  NewProto->Distrib = NULL;
673 
674  for (i=0; i < N; i++)
675  NewProto->Mean[i] = Proto->Mean[i];
676  if (Proto->Variance.Elliptical != NULL)
677  {
678  NewProto->Variance.Elliptical = (FLOAT32 *)Emalloc(N * sizeof(FLOAT32));
679  for (i=0; i < N; i++)
680  NewProto->Variance.Elliptical[i] = Proto->Variance.Elliptical[i];
681  }
682  else
683  NewProto->Variance.Elliptical = NULL;
684  //---------------------------------------------
685  if (Proto->Magnitude.Elliptical != NULL)
686  {
687  NewProto->Magnitude.Elliptical = (FLOAT32 *)Emalloc(N * sizeof(FLOAT32));
688  for (i=0; i < N; i++)
689  NewProto->Magnitude.Elliptical[i] = Proto->Magnitude.Elliptical[i];
690  }
691  else
692  NewProto->Magnitude.Elliptical = NULL;
693  //------------------------------------------------
694  if (Proto->Weight.Elliptical != NULL)
695  {
696  NewProto->Weight.Elliptical = (FLOAT32 *)Emalloc(N * sizeof(FLOAT32));
697  for (i=0; i < N; i++)
698  NewProto->Weight.Elliptical[i] = Proto->Weight.Elliptical[i];
699  }
700  else
701  NewProto->Weight.Elliptical = NULL;
702 
703  NewProto->TotalMagnitude = Proto->TotalMagnitude;
704  NewProto->LogMagnitude = Proto->LogMagnitude;
705  NewProtoList = push_last(NewProtoList, NewProto);
706  }
707  }
708  FreeProtoList(&ProtoList);
709  return (NewProtoList);
710 } /* RemoveInsignificantProtos */
711 
712 /*----------------------------------------------------------------------------*/
714  LIST List,
715  const char *Label)
716 {
717  MERGE_CLASS MergeClass;
718 
719  iterate (List)
720  {
721  MergeClass = (MERGE_CLASS) first_node (List);
722  if (strcmp (MergeClass->Label, Label) == 0)
723  return (MergeClass);
724  }
725  return (NULL);
726 
727 } /* FindClass */
728 
729 /*---------------------------------------------------------------------------*/
731  const char *Label)
732 {
733  MERGE_CLASS MergeClass;
734 
735  MergeClass = new MERGE_CLASS_NODE;
736  MergeClass->Label = (char*)Emalloc (strlen (Label)+1);
737  strcpy (MergeClass->Label, Label);
738  MergeClass->Class = NewClass (MAX_NUM_PROTOS, MAX_NUM_CONFIGS);
739  return (MergeClass);
740 
741 } /* NewLabeledClass */
742 
743 /*-----------------------------------------------------------------------------*/
745  LIST ClassList)
746 
747 /*
748  ** Parameters:
749  ** FontList list of all fonts in document
750  ** Globals: none
751  ** Operation:
752  ** This routine deallocates all of the space allocated to
753  ** the specified list of training samples.
754  ** Return: none
755  ** Exceptions: none
756  ** History: Fri Aug 18 17:44:27 1989, DSJ, Created.
757  */
758 
759 {
760  MERGE_CLASS MergeClass;
761 
762  iterate (ClassList) /* iterate thru all of the fonts */
763  {
764  MergeClass = (MERGE_CLASS) first_node (ClassList);
765  free (MergeClass->Label);
766  FreeClass(MergeClass->Class);
767  delete MergeClass;
768  }
769  destroy (ClassList);
770 
771 } /* FreeLabeledClassList */
772 
775  LIST LabeledClassList) {
776  MERGE_CLASS MergeClass;
777  CLASS_TYPE Class;
778  int NumProtos;
779  int NumConfigs;
780  int NumWords;
781  int i, j;
782  float Values[3];
783  PROTO NewProto;
784  PROTO OldProto;
785  BIT_VECTOR NewConfig;
786  BIT_VECTOR OldConfig;
787 
788  // printf("Float2Int ...\n");
789 
790  CLASS_STRUCT* float_classes = new CLASS_STRUCT[unicharset.size()];
791  iterate(LabeledClassList)
792  {
793  UnicityTableEqEq<int> font_set;
794  MergeClass = (MERGE_CLASS) first_node (LabeledClassList);
795  Class = &float_classes[unicharset.unichar_to_id(MergeClass->Label)];
796  NumProtos = MergeClass->Class->NumProtos;
797  NumConfigs = MergeClass->Class->NumConfigs;
798  font_set.move(&MergeClass->Class->font_set);
799  Class->NumProtos = NumProtos;
800  Class->MaxNumProtos = NumProtos;
801  Class->Prototypes = (PROTO) Emalloc (sizeof(PROTO_STRUCT) * NumProtos);
802  for(i=0; i < NumProtos; i++)
803  {
804  NewProto = ProtoIn(Class, i);
805  OldProto = ProtoIn(MergeClass->Class, i);
806  Values[0] = OldProto->X;
807  Values[1] = OldProto->Y;
808  Values[2] = OldProto->Angle;
809  Normalize(Values);
810  NewProto->X = OldProto->X;
811  NewProto->Y = OldProto->Y;
812  NewProto->Length = OldProto->Length;
813  NewProto->Angle = OldProto->Angle;
814  NewProto->A = Values[0];
815  NewProto->B = Values[1];
816  NewProto->C = Values[2];
817  }
818 
819  Class->NumConfigs = NumConfigs;
820  Class->MaxNumConfigs = NumConfigs;
821  Class->font_set.move(&font_set);
822  Class->Configurations = (BIT_VECTOR*) Emalloc (sizeof(BIT_VECTOR) * NumConfigs);
823  NumWords = WordsInVectorOfSize(NumProtos);
824  for(i=0; i < NumConfigs; i++)
825  {
826  NewConfig = NewBitVector(NumProtos);
827  OldConfig = MergeClass->Class->Configurations[i];
828  for(j=0; j < NumWords; j++)
829  NewConfig[j] = OldConfig[j];
830  Class->Configurations[i] = NewConfig;
831  }
832  }
833  return float_classes;
834 } // SetUpForFloat2Int
835 
836 /*--------------------------------------------------------------------------*/
// Converts Values in place from {X, Y, Angle} to the three coefficients
// {Slope * k, -k, Intercept * k}, where
//   Slope     = tan(Angle * 2 * PI)   (so Angle appears to be measured in
//                                      full turns rather than radians),
//   Intercept = Y - Slope * X,
//   k         = 1 / sqrt(Slope^2 + 1).
// Values must point to at least 3 floats.
void Normalize(float *Values) {
  // The 'register' storage class the locals used to carry was removed in
  // C++17 and never affected the result, so plain locals are used.
  const float Slope = tan(Values[2] * 2 * PI);
  const float Intercept = Values[1] - Slope * Values[0];
  const float Normalizer = 1 / sqrt(Slope * Slope + 1.0);

  Values[0] = Slope * Normalizer;
  Values[1] = -Normalizer;
  Values[2] = Intercept * Normalizer;
}  // Normalize
852 
853 /*-------------------------------------------------------------------------*/
855  LIST CharList)
856 
857 {
858  LABELEDLIST char_sample;
859 
860  iterate (CharList) /* iterate thru all of the fonts */
861  {
862  char_sample = (LABELEDLIST) first_node (CharList);
863  FreeLabeledList (char_sample);
864  }
865  destroy (CharList);
866 
867 } // FreeNormProtoList
868 
869 /*---------------------------------------------------------------------------*/
871  LIST* NormProtoList,
872  LIST ProtoList,
873  char* CharName)
874 {
875  PROTOTYPE* Proto;
876  LABELEDLIST LabeledProtoList;
877 
878  LabeledProtoList = NewLabeledList(CharName);
879  iterate(ProtoList)
880  {
881  Proto = (PROTOTYPE *) first_node (ProtoList);
882  LabeledProtoList->List = push(LabeledProtoList->List, Proto);
883  }
884  *NormProtoList = push(*NormProtoList, LabeledProtoList);
885 }
886 
887 /*---------------------------------------------------------------------------*/
889  LIST ProtoList,
890  BOOL8 CountSigProtos,
891  BOOL8 CountInsigProtos)
892 {
893  int N = 0;
894  PROTOTYPE *Proto;
895 
896  iterate(ProtoList)
897  {
898  Proto = (PROTOTYPE *) first_node ( ProtoList );
899  if (( Proto->Significant && CountSigProtos ) ||
900  ( ! Proto->Significant && CountInsigProtos ) )
901  N++;
902  }
903  return(N);
904 }
#define PI
Definition: const.h:19
CHAR_DESC ReadCharDescription(const FEATURE_DEFS_STRUCT &FeatureDefs, FILE *File)
Definition: featdefs.cpp:263
bool LoadFontInfo(const char *filename)
PROTO_STRUCT * PROTO
Definition: protos.h:52
void LoadPageImages(const char *filename)
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194
#define first_node(l)
Definition: oldlist.h:139
bool save_to_file(const char *const filename) const
Definition: unicharset.h:306
float FLOAT32
Definition: host.h:111
#define MAX(x, y)
Definition: ndminx.h:24
void Init(uinT8 xbuckets, uinT8 ybuckets, uinT8 thetabuckets)
LIST destroy(LIST list)
Definition: oldlist.cpp:187
CLUSTERER * SetUpForClustering(const FEATURE_DEFS_STRUCT &FeatureDefs, LABELEDLIST char_sample, const char *program_feature_type)
void MergeInsignificantProtos(LIST ProtoList, const char *label, CLUSTERER *Clusterer, CLUSTERCONFIG *Config)
#define ProtoIn(Class, Pid)
Definition: protos.h:123
DISTRIBUTION * Distrib
Definition: cluster.h:77
#define MAX_NUM_CONFIGS
Definition: intproto.h:46
void InitFeatureDefs(FEATURE_DEFS_STRUCT *featuredefs)
Definition: featdefs.cpp:121
struct LABELEDLISTNODE * LABELEDLIST
#define MAX_NUM_CLASSES
Definition: matchdefs.h:31
INT_PARAM_FLAG(debug_level, 0,"Level of Trainer debugging")
#define tprintf(...)
Definition: tprintf.h:31
#define MIN(x, y)
Definition: ndminx.h:28
void ParseArguments(int *argc, char ***argv)
inT16 NumConfigs
Definition: protos.h:62
const int kBoostXYBuckets
const int kBoostDirBuckets
LIST push_last(LIST list, void *item)
Definition: oldlist.cpp:338
const FEATURE_DESC_STRUCT * FeatureDesc[NUM_FEATURE_TYPES]
Definition: featdefs.h:50
FEATURE_SET FeatureSets[NUM_FEATURE_TYPES]
Definition: featdefs.h:44
void SetupFlatShapeTable(ShapeTable *shape_table)
bool Serialize(FILE *fp) const
Definition: shapetable.cpp:250
CLUSTERER * MakeClusterer(inT16 SampleSize, const PARAM_DESC ParamDesc[])
Definition: cluster.cpp:399
unsigned char BOOL8
Definition: host.h:113
SAMPLE * MakeSample(CLUSTERER *Clusterer, const FLOAT32 *Feature, inT32 CharID)
Definition: cluster.cpp:454
inT32 length() const
Definition: strngs.cpp:188
void Normalize(float *Values)
uinT32 NumFeatureSets
Definition: featdefs.h:43
bool DeSerialize(bool swap, FILE *fp)
Definition: shapetable.cpp:256
void ParseCommandLineFlags(const char *usage, int *argc, char ***argv, const bool remove_flags)
#define iterate(l)
Definition: oldlist.h:159
CLUSTERCONFIG Config
FLOAT32 LogMagnitude
Definition: cluster.h:80
FLOATUNION Variance
Definition: cluster.h:81
FLOAT32 * Mean
Definition: cluster.h:78
UnicityTableEqEq< int > font_set
Definition: protos.h:65
FEATURE Features[1]
Definition: ocrfeatures.h:72
inT16 NumProtos
Definition: protos.h:59
DOUBLE_PARAM_FLAG(clusterconfig_min_samples_fraction, Config.MinSamples,"Min number of samples per proto as % of total")
void ReadTrainingSamples(const FEATURE_DEFS_STRUCT &feature_defs, const char *feature_name, int max_samples, UNICHARSET *unicharset, FILE *file, LIST *training_samples)
bool Serialize(FILE *fp) const
ShapeTable * LoadShapeTable(const STRING &file_prefix)
void FreeNormProtoList(LIST CharList)
CCUtil ccutil
void FreeTrainingSamples(LIST CharList)
BIT_VECTOR NewBitVector(int NumBits)
Definition: bitvec.cpp:90
void FreeLabeledList(LABELEDLIST LabeledList)
unsigned Significant
Definition: cluster.h:68
FLOATUNION Weight
Definition: cluster.h:83
FLOAT32 Independence
Definition: cluster.h:53
FLOAT32 MaxIllegal
Definition: cluster.h:51
CLASS_TYPE NewClass(int NumProtos, int NumConfigs)
Definition: protos.cpp:248
FLOAT32 X
Definition: protos.h:47
FLOAT32 TotalMagnitude
Definition: cluster.h:79
FLOAT32 Angle
Definition: protos.h:49
void truncate_at(inT32 index)
Definition: strngs.cpp:264
unsigned NumSamples
Definition: cluster.h:75
void FreeProtoList(LIST *ProtoList)
Definition: cluster.cpp:564
MasterTrainer * LoadTrainingData(int argc, const char *const *argv, bool replication, ShapeTable **shape_table, STRING *file_prefix)
bool LoadXHeights(const char *filename)
LABELEDLIST NewLabeledList(const char *Label)
FLOAT64 Confidence
Definition: cluster.h:54
void AddToNormProtosList(LIST *NormProtoList, LIST ProtoList, char *CharName)
CLASS_STRUCT * SetUpForFloat2Int(const UNICHARSET &unicharset, LIST LabeledClassList)
FLOATUNION Magnitude
Definition: cluster.h:82
FLOAT32 * Elliptical
Definition: cluster.h:64
CLUSTER * Cluster
Definition: cluster.h:76
MERGE_CLASS NewLabeledClass(const char *Label)
inT16 MaxNumConfigs
Definition: protos.h:63
const UNICHARSET & unicharset() const
void unichar_insert(const char *const unichar_repr)
Definition: unicharset.cpp:612
FLOAT32 MinSamples
Definition: cluster.h:50
LABELEDLIST FindList(LIST List, char *Label)
inT32 NumChar
Definition: cluster.h:93
FLOAT32 B
Definition: protos.h:45
#define WordsInVectorOfSize(NumBits)
Definition: bitvec.h:63
void LoadUnicharset(const char *filename)
FLOAT32 ComputeDistance(int k, PARAM_DESC *dim, FLOAT32 p1[], FLOAT32 p2[])
Definition: kdtree.cpp:486
void FreeClass(CLASS_TYPE Class)
Definition: protos.cpp:215
ParamsVectors * params()
Definition: ccutil.h:65
MERGE_CLASS_NODE * MERGE_CLASS
FLOAT32 Params[1]
Definition: ocrfeatures.h:65
void * Emalloc(int Size)
Definition: emalloc.cpp:35
FEATURE_SET_STRUCT * FEATURE_SET
Definition: ocrfeatures.h:74
FEATURE_DEFS_STRUCT feature_defs
#define NIL_LIST
Definition: oldlist.h:126
FLOAT32 Length
Definition: protos.h:50
STRING_PARAM_FLAG(configfile,"","File to load more configs from")
bool DeSerialize(bool swap, FILE *fp)
PARAM_DESC * ParamDesc
Definition: cluster.h:88
inT32 MergeClusters(inT16 N, register PARAM_DESC ParamDesc[], register inT32 n1, register inT32 n2, register FLOAT32 m[], register FLOAT32 m1[], register FLOAT32 m2[])
void InitIntegerFX()
Definition: intfx.cpp:55
uinT32 * BIT_VECTOR
Definition: bitvec.h:28
MERGE_CLASS FindClass(LIST List, const char *Label)
void move(UnicityTable< T > *from)
unsigned Style
Definition: cluster.h:74
int ShortNameToFeatureType(const FEATURE_DEFS_STRUCT &FeatureDefs, const char *ShortName)
Definition: featdefs.cpp:302
LIST push(LIST list, void *element)
Definition: oldlist.cpp:323
FLOAT32 C
Definition: protos.h:46
bool contains_unichar(const char *const unichar_repr) const
Definition: unicharset.cpp:644
static bool ReadParamsFile(const char *file, SetParamConstraint constraint, ParamsVectors *member_params)
Definition: params.cpp:41
Definition: strngs.h:44
void memfree(void *element)
Definition: freelist.cpp:30
void ReadTrainingSamples(const char *page_name, const FEATURE_DEFS_STRUCT &feature_defs, bool verification)
#define NULL
Definition: host.h:144
#define MAX_NUM_PROTOS
Definition: intproto.h:47
#define UNICHAR_LEN
Definition: unichar.h:30
const PARAM_DESC * ParamDesc
Definition: ocrfeatures.h:59
int size() const
Definition: unicharset.h:297
void SetFeatureSpace(const IntFeatureSpace &fs)
Definition: mastertrainer.h:85
const char * string() const
Definition: strngs.cpp:193
FLOAT32 A
Definition: protos.h:44
unsigned Merged
Definition: cluster.h:69
inT16 MaxNumProtos
Definition: protos.h:60
void FreeFeatureSet(FEATURE_SET FeatureSet)
Definition: ocrfeatures.cpp:79
inT16 SampleSize
Definition: cluster.h:87
int NumberOfProtos(LIST ProtoList, BOOL8 CountSigProtos, BOOL8 CountInsigProtos)
int tessoptind
Definition: tessopt.cpp:24
PROTO Prototypes
Definition: protos.h:61
CLASS_TYPE Class
CONFIGS Configurations
Definition: protos.h:64
int NumShapes() const
Definition: shapetable.h:278
void FreeLabeledClassList(LIST ClassList)
void WriteShapeTable(const STRING &file_prefix, const ShapeTable &shape_table)
unsigned short uinT16
Definition: host.h:101
int inT32
Definition: host.h:102
LIST RemoveInsignificantProtos(LIST ProtoList, BOOL8 KeepSigProtos, BOOL8 KeepInsigProtos, int N)
FLOAT32 Y
Definition: protos.h:48
bool AddSpacingInfo(const char *filename)
const char * c_str() const
Definition: strngs.cpp:204
void CleanUpUnusedData(LIST ProtoList)
const char * GetNextFilename(int argc, const char *const *argv)