tesseract  3.04.00
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
normmatch.cpp
Go to the documentation of this file.
1 /******************************************************************************
2  ** Filename: normmatch.cpp
3  ** Purpose: Simple matcher based on character normalization features.
4  ** Author: Dan Johnson
5  ** History: Wed Dec 19 16:18:06 1990, DSJ, Created.
6  **
7  ** (c) Copyright Hewlett-Packard Company, 1988.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  ******************************************************************************/
21 #include "normmatch.h"
22 
23 #include <stdio.h>
24 #include <math.h>
25 
26 #include "classify.h"
27 #include "clusttool.h"
28 #include "const.h"
29 #include "efio.h"
30 #include "emalloc.h"
31 #include "globals.h"
32 #include "helpers.h"
33 #include "normfeat.h"
34 #include "scanutils.h"
35 #include "unicharset.h"
36 #include "params.h"
37 
39 {
40  int NumParams;
43  int NumProtos;
44 };
45 
49 double NormEvidenceOf(register double NormAdj);
50 
51 void PrintNormMatch(FILE *File,
52  int NumParams,
53  PROTOTYPE *Proto,
54  FEATURE Feature);
55 
56 NORM_PROTOS *ReadNormProtos(FILE *File);
57 
62 /* control knobs used to control the normalization adjustment process */
63 double_VAR(classify_norm_adj_midpoint, 32.0, "Norm adjust midpoint ...");
64 double_VAR(classify_norm_adj_curl, 2.0, "Norm adjust curl ...");
65 // Weight of width variance against height and vertical position.
66 const double kWidthErrorWeighting = 0.125;
67 
71 /*---------------------------------------------------------------------------*/
72 namespace tesseract {
74  const FEATURE_STRUCT& feature,
75  BOOL8 DebugMatch) {
76 /*
77  ** Parameters:
78  ** ClassId id of class to match against
79  ** Feature character normalization feature
80  ** DebugMatch controls dump of debug info
81  ** Globals:
82  ** NormProtos character normalization prototypes
83  ** Operation: This routine compares Features against each character
84  ** normalization proto for ClassId and returns the match
85  ** rating of the best match.
86  ** Return: Best match rating for Feature against protos of ClassId.
87  ** Exceptions: none
88  ** History: Wed Dec 19 16:56:12 1990, DSJ, Created.
89  */
90  LIST Protos;
91  FLOAT32 BestMatch;
92  FLOAT32 Match;
93  FLOAT32 Delta;
94  PROTOTYPE *Proto;
95  int ProtoId;
96 
97  if (ClassId >= NormProtos->NumProtos) {
98  ClassId = NO_CLASS;
99  }
100 
101  /* handle requests for classification as noise */
102  if (ClassId == NO_CLASS) {
103  /* kludge - clean up constants and make into control knobs later */
104  Match = (feature.Params[CharNormLength] *
105  feature.Params[CharNormLength] * 500.0 +
106  feature.Params[CharNormRx] *
107  feature.Params[CharNormRx] * 8000.0 +
108  feature.Params[CharNormRy] *
109  feature.Params[CharNormRy] * 8000.0);
110  return (1.0 - NormEvidenceOf (Match));
111  }
112 
113  BestMatch = MAX_FLOAT32;
114  Protos = NormProtos->Protos[ClassId];
115 
116  if (DebugMatch) {
117  tprintf("\nChar norm for class %s\n", unicharset.id_to_unichar(ClassId));
118  }
119 
120  ProtoId = 0;
121  iterate(Protos) {
122  Proto = (PROTOTYPE *) first_node (Protos);
123  Delta = feature.Params[CharNormY] - Proto->Mean[CharNormY];
124  Match = Delta * Delta * Proto->Weight.Elliptical[CharNormY];
125  if (DebugMatch) {
126  tprintf("YMiddle: Proto=%g, Delta=%g, Var=%g, Dist=%g\n",
127  Proto->Mean[CharNormY], Delta,
128  Proto->Weight.Elliptical[CharNormY], Match);
129  }
130  Delta = feature.Params[CharNormRx] - Proto->Mean[CharNormRx];
131  Match += Delta * Delta * Proto->Weight.Elliptical[CharNormRx];
132  if (DebugMatch) {
133  tprintf("Height: Proto=%g, Delta=%g, Var=%g, Dist=%g\n",
134  Proto->Mean[CharNormRx], Delta,
135  Proto->Weight.Elliptical[CharNormRx], Match);
136  }
137  // Ry is width! See intfx.cpp.
138  Delta = feature.Params[CharNormRy] - Proto->Mean[CharNormRy];
139  if (DebugMatch) {
140  tprintf("Width: Proto=%g, Delta=%g, Var=%g\n",
141  Proto->Mean[CharNormRy], Delta,
142  Proto->Weight.Elliptical[CharNormRy]);
143  }
144  Delta = Delta * Delta * Proto->Weight.Elliptical[CharNormRy];
145  Delta *= kWidthErrorWeighting;
146  Match += Delta;
147  if (DebugMatch) {
148  tprintf("Total Dist=%g, scaled=%g, sigmoid=%g, penalty=%g\n",
149  Match, Match / classify_norm_adj_midpoint,
150  NormEvidenceOf(Match), 256 * (1 - NormEvidenceOf(Match)));
151  }
152 
153  if (Match < BestMatch)
154  BestMatch = Match;
155 
156  ProtoId++;
157  }
158  return 1.0 - NormEvidenceOf(BestMatch);
159 } /* ComputeNormMatch */
160 
162  if (NormProtos != NULL) {
163  for (int i = 0; i < NormProtos->NumProtos; i++)
167  Efree(NormProtos);
168  NormProtos = NULL;
169  }
170 }
171 } // namespace tesseract
172 
176 /**********************************************************************
177  * NormEvidenceOf
178  *
179  * Return the new type of evidence number corresponding to this
180  * normalization adjustment. The equation that represents the transform is:
181  * 1 / (1 + (NormAdj / midpoint) ^ curl)
182  **********************************************************************/
183 double NormEvidenceOf(register double NormAdj) {
184  NormAdj /= classify_norm_adj_midpoint;
185 
186  if (classify_norm_adj_curl == 3)
187  NormAdj = NormAdj * NormAdj * NormAdj;
188  else if (classify_norm_adj_curl == 2)
189  NormAdj = NormAdj * NormAdj;
190  else
191  NormAdj = pow (NormAdj, classify_norm_adj_curl);
192  return (1.0 / (1.0 + NormAdj));
193 }
194 
195 
196 /*---------------------------------------------------------------------------*/
197 void PrintNormMatch(FILE *File,
198  int NumParams,
199  PROTOTYPE *Proto,
200  FEATURE Feature) {
201 /*
202  ** Parameters:
203  ** File open text file to dump match debug info to
204  ** NumParams # of parameters in proto and feature
205  ** Proto[] array of prototype parameters
206  ** Feature[] array of feature parameters
207  ** Globals: none
208  ** Operation: This routine dumps out detailed normalization match info.
209  ** Return: none
210  ** Exceptions: none
211  ** History: Wed Jan 2 09:49:35 1991, DSJ, Created.
212  */
213  int i;
214  FLOAT32 ParamMatch;
215  FLOAT32 TotalMatch;
216 
217  for (i = 0, TotalMatch = 0.0; i < NumParams; i++) {
218  ParamMatch = (Feature->Params[i] - Mean(Proto, i)) /
219  StandardDeviation(Proto, i);
220 
221  fprintf (File, " %6.1f", ParamMatch);
222 
223  if (i == CharNormY || i == CharNormRx)
224  TotalMatch += ParamMatch * ParamMatch;
225  }
226  fprintf (File, " --> %6.1f (%4.2f)\n",
227  TotalMatch, NormEvidenceOf (TotalMatch));
228 
229 } /* PrintNormMatch */
230 
231 
232 /*---------------------------------------------------------------------------*/
233 namespace tesseract {
235 /*
236  ** Parameters:
237  ** File open text file to read normalization protos from
238  ** Globals: none
239  ** Operation: This routine allocates a new data structure to hold
240  ** a set of character normalization protos. It then fills in
241  ** the data structure by reading from the specified File.
242  ** Return: Character normalization protos.
243  ** Exceptions: none
244  ** History: Wed Dec 19 16:38:49 1990, DSJ, Created.
245  */
247  int i;
248  char unichar[2 * UNICHAR_LEN + 1];
249  UNICHAR_ID unichar_id;
250  LIST Protos;
251  int NumProtos;
252 
253  /* allocate and initialize data structure */
254  NormProtos = (NORM_PROTOS *) Emalloc (sizeof (NORM_PROTOS));
255  NormProtos->NumProtos = unicharset.size();
256  NormProtos->Protos = (LIST *) Emalloc (NormProtos->NumProtos * sizeof(LIST));
257  for (i = 0; i < NormProtos->NumProtos; i++)
258  NormProtos->Protos[i] = NIL_LIST;
259 
260  /* read file header and save in data structure */
261  NormProtos->NumParams = ReadSampleSize (File);
262  NormProtos->ParamDesc = ReadParamDesc (File, NormProtos->NumParams);
263 
264  /* read protos for each class into a separate list */
265  while ((end_offset < 0 || ftell(File) < end_offset) &&
266  tfscanf(File, "%s %d", unichar, &NumProtos) == 2) {
267  if (unicharset.contains_unichar(unichar)) {
268  unichar_id = unicharset.unichar_to_id(unichar);
269  Protos = NormProtos->Protos[unichar_id];
270  for (i = 0; i < NumProtos; i++)
271  Protos =
272  push_last (Protos, ReadPrototype (File, NormProtos->NumParams));
273  NormProtos->Protos[unichar_id] = Protos;
274  } else {
275  cprintf("Error: unichar %s in normproto file is not in unichar set.\n",
276  unichar);
277  for (i = 0; i < NumProtos; i++)
278  FreePrototype(ReadPrototype (File, NormProtos->NumParams));
279  }
280  SkipNewline(File);
281  }
282  return (NormProtos);
283 } /* ReadNormProtos */
284 } // namespace tesseract
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194
#define first_node(l)
Definition: oldlist.h:139
PARAM_DESC * ParamDesc
Definition: normmatch.cpp:41
float FLOAT32
Definition: host.h:111
#define double_VAR(name, val, comment)
Definition: params.h:286
#define tprintf(...)
Definition: tprintf.h:31
UNICHARSET unicharset
Definition: ccutil.h:72
double classify_norm_adj_midpoint
Definition: normmatch.cpp:63
int tfscanf(FILE *stream, const char *format,...)
Definition: scanutils.cpp:229
LIST push_last(LIST list, void *item)
Definition: oldlist.cpp:338
unsigned char BOOL8
Definition: host.h:113
UNICHAR_ID CLASS_ID
Definition: matchdefs.h:35
uinT16 ReadSampleSize(FILE *File)
Definition: clusttool.cpp:46
#define iterate(l)
Definition: oldlist.h:159
FLOAT32 * Mean
Definition: cluster.h:78
LIST * Protos
Definition: normmatch.cpp:42
FLOATUNION Weight
Definition: cluster.h:83
void PrintNormMatch(FILE *File, int NumParams, PROTOTYPE *Proto, FEATURE Feature)
Definition: normmatch.cpp:197
const char *const id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:266
void FreeProtoList(LIST *ProtoList)
Definition: cluster.cpp:564
void SkipNewline(FILE *file)
Definition: helpers.h:84
FLOAT32 * Elliptical
Definition: cluster.h:64
int UNICHAR_ID
Definition: unichar.h:33
NORM_PROTOS * ReadNormProtos(FILE *File)
FLOAT32 Params[1]
Definition: ocrfeatures.h:65
void * Emalloc(int Size)
Definition: emalloc.cpp:35
FLOAT32 ComputeNormMatch(CLASS_ID ClassId, const FEATURE_STRUCT &feature, BOOL8 DebugMatch)
Definition: normmatch.cpp:73
#define NIL_LIST
Definition: oldlist.h:126
#define NO_CLASS
Definition: matchdefs.h:36
FLOAT32 Mean(PROTOTYPE *Proto, uinT16 Dimension)
Definition: cluster.cpp:643
double NormEvidenceOf(register double NormAdj)
Definition: normmatch.cpp:183
void Efree(void *ptr)
Definition: emalloc.cpp:85
#define MAX_FLOAT32
Definition: host.h:124
bool contains_unichar(const char *const unichar_repr) const
Definition: unicharset.cpp:644
void FreePrototype(void *arg)
Definition: cluster.cpp:579
const double kWidthErrorWeighting
Definition: normmatch.cpp:66
#define NULL
Definition: host.h:144
#define UNICHAR_LEN
Definition: unichar.h:30
void cprintf(const char *format,...)
Definition: callcpp.cpp:40
int size() const
Definition: unicharset.h:297
PARAM_DESC * ReadParamDesc(FILE *File, uinT16 N)
Definition: clusttool.cpp:68
FLOAT32 StandardDeviation(PROTOTYPE *Proto, uinT16 Dimension)
Definition: cluster.cpp:657
double classify_norm_adj_curl
Definition: normmatch.cpp:64
NORM_PROTOS * ReadNormProtos(FILE *File, inT64 end_offset)
Definition: normmatch.cpp:234
PROTOTYPE * ReadPrototype(FILE *File, uinT16 N)
Definition: clusttool.cpp:114
NORM_PROTOS * NormProtos
Definition: classify.h:486
long long int inT64
Definition: host.h:108