tesseract  3.04.00
cluster.cpp
Go to the documentation of this file.
1 /******************************************************************************
2  ** Filename: cluster.c
3  ** Purpose: Routines for clustering points in N-D space
4  ** Author: Dan Johnson
5  ** History: 5/29/89, DSJ, Created.
6  **
7  ** (c) Copyright Hewlett-Packard Company, 1988.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  ******************************************************************************/
18 #include "const.h"
19 #include "cluster.h"
20 #include "emalloc.h"
21 #include "genericheap.h"
22 #include "helpers.h"
23 #include "kdpair.h"
24 #include "matrix.h"
25 #include "tprintf.h"
26 #include "danerror.h"
27 #include "freelist.h"
28 #include <math.h>
29 
30 #define HOTELLING 1 // If true use Hotelling's test to decide where to split.
31 #define FTABLE_X 10 // Size of FTable.
32 #define FTABLE_Y 100 // Size of FTable.
33 
34 // Table of values approximating the cumulative F-distribution for a confidence of 1%.
35 const double FTable[FTABLE_Y][FTABLE_X] = {
36  {4052.19, 4999.52, 5403.34, 5624.62, 5763.65, 5858.97, 5928.33, 5981.10, 6022.50, 6055.85,},
37  {98.502, 99.000, 99.166, 99.249, 99.300, 99.333, 99.356, 99.374, 99.388, 99.399,},
38  {34.116, 30.816, 29.457, 28.710, 28.237, 27.911, 27.672, 27.489, 27.345, 27.229,},
39  {21.198, 18.000, 16.694, 15.977, 15.522, 15.207, 14.976, 14.799, 14.659, 14.546,},
40  {16.258, 13.274, 12.060, 11.392, 10.967, 10.672, 10.456, 10.289, 10.158, 10.051,},
41  {13.745, 10.925, 9.780, 9.148, 8.746, 8.466, 8.260, 8.102, 7.976, 7.874,},
42  {12.246, 9.547, 8.451, 7.847, 7.460, 7.191, 6.993, 6.840, 6.719, 6.620,},
43  {11.259, 8.649, 7.591, 7.006, 6.632, 6.371, 6.178, 6.029, 5.911, 5.814,},
44  {10.561, 8.022, 6.992, 6.422, 6.057, 5.802, 5.613, 5.467, 5.351, 5.257,},
45  {10.044, 7.559, 6.552, 5.994, 5.636, 5.386, 5.200, 5.057, 4.942, 4.849,},
46  { 9.646, 7.206, 6.217, 5.668, 5.316, 5.069, 4.886, 4.744, 4.632, 4.539,},
47  { 9.330, 6.927, 5.953, 5.412, 5.064, 4.821, 4.640, 4.499, 4.388, 4.296,},
48  { 9.074, 6.701, 5.739, 5.205, 4.862, 4.620, 4.441, 4.302, 4.191, 4.100,},
49  { 8.862, 6.515, 5.564, 5.035, 4.695, 4.456, 4.278, 4.140, 4.030, 3.939,},
50  { 8.683, 6.359, 5.417, 4.893, 4.556, 4.318, 4.142, 4.004, 3.895, 3.805,},
51  { 8.531, 6.226, 5.292, 4.773, 4.437, 4.202, 4.026, 3.890, 3.780, 3.691,},
52  { 8.400, 6.112, 5.185, 4.669, 4.336, 4.102, 3.927, 3.791, 3.682, 3.593,},
53  { 8.285, 6.013, 5.092, 4.579, 4.248, 4.015, 3.841, 3.705, 3.597, 3.508,},
54  { 8.185, 5.926, 5.010, 4.500, 4.171, 3.939, 3.765, 3.631, 3.523, 3.434,},
55  { 8.096, 5.849, 4.938, 4.431, 4.103, 3.871, 3.699, 3.564, 3.457, 3.368,},
56  { 8.017, 5.780, 4.874, 4.369, 4.042, 3.812, 3.640, 3.506, 3.398, 3.310,},
57  { 7.945, 5.719, 4.817, 4.313, 3.988, 3.758, 3.587, 3.453, 3.346, 3.258,},
58  { 7.881, 5.664, 4.765, 4.264, 3.939, 3.710, 3.539, 3.406, 3.299, 3.211,},
59  { 7.823, 5.614, 4.718, 4.218, 3.895, 3.667, 3.496, 3.363, 3.256, 3.168,},
60  { 7.770, 5.568, 4.675, 4.177, 3.855, 3.627, 3.457, 3.324, 3.217, 3.129,},
61  { 7.721, 5.526, 4.637, 4.140, 3.818, 3.591, 3.421, 3.288, 3.182, 3.094,},
62  { 7.677, 5.488, 4.601, 4.106, 3.785, 3.558, 3.388, 3.256, 3.149, 3.062,},
63  { 7.636, 5.453, 4.568, 4.074, 3.754, 3.528, 3.358, 3.226, 3.120, 3.032,},
64  { 7.598, 5.420, 4.538, 4.045, 3.725, 3.499, 3.330, 3.198, 3.092, 3.005,},
65  { 7.562, 5.390, 4.510, 4.018, 3.699, 3.473, 3.305, 3.173, 3.067, 2.979,},
66  { 7.530, 5.362, 4.484, 3.993, 3.675, 3.449, 3.281, 3.149, 3.043, 2.955,},
67  { 7.499, 5.336, 4.459, 3.969, 3.652, 3.427, 3.258, 3.127, 3.021, 2.934,},
68  { 7.471, 5.312, 4.437, 3.948, 3.630, 3.406, 3.238, 3.106, 3.000, 2.913,},
69  { 7.444, 5.289, 4.416, 3.927, 3.611, 3.386, 3.218, 3.087, 2.981, 2.894,},
70  { 7.419, 5.268, 4.396, 3.908, 3.592, 3.368, 3.200, 3.069, 2.963, 2.876,},
71  { 7.396, 5.248, 4.377, 3.890, 3.574, 3.351, 3.183, 3.052, 2.946, 2.859,},
72  { 7.373, 5.229, 4.360, 3.873, 3.558, 3.334, 3.167, 3.036, 2.930, 2.843,},
73  { 7.353, 5.211, 4.343, 3.858, 3.542, 3.319, 3.152, 3.021, 2.915, 2.828,},
74  { 7.333, 5.194, 4.327, 3.843, 3.528, 3.305, 3.137, 3.006, 2.901, 2.814,},
75  { 7.314, 5.179, 4.313, 3.828, 3.514, 3.291, 3.124, 2.993, 2.888, 2.801,},
76  { 7.296, 5.163, 4.299, 3.815, 3.501, 3.278, 3.111, 2.980, 2.875, 2.788,},
77  { 7.280, 5.149, 4.285, 3.802, 3.488, 3.266, 3.099, 2.968, 2.863, 2.776,},
78  { 7.264, 5.136, 4.273, 3.790, 3.476, 3.254, 3.087, 2.957, 2.851, 2.764,},
79  { 7.248, 5.123, 4.261, 3.778, 3.465, 3.243, 3.076, 2.946, 2.840, 2.754,},
80  { 7.234, 5.110, 4.249, 3.767, 3.454, 3.232, 3.066, 2.935, 2.830, 2.743,},
81  { 7.220, 5.099, 4.238, 3.757, 3.444, 3.222, 3.056, 2.925, 2.820, 2.733,},
82  { 7.207, 5.087, 4.228, 3.747, 3.434, 3.213, 3.046, 2.916, 2.811, 2.724,},
83  { 7.194, 5.077, 4.218, 3.737, 3.425, 3.204, 3.037, 2.907, 2.802, 2.715,},
84  { 7.182, 5.066, 4.208, 3.728, 3.416, 3.195, 3.028, 2.898, 2.793, 2.706,},
85  { 7.171, 5.057, 4.199, 3.720, 3.408, 3.186, 3.020, 2.890, 2.785, 2.698,},
86  { 7.159, 5.047, 4.191, 3.711, 3.400, 3.178, 3.012, 2.882, 2.777, 2.690,},
87  { 7.149, 5.038, 4.182, 3.703, 3.392, 3.171, 3.005, 2.874, 2.769, 2.683,},
88  { 7.139, 5.030, 4.174, 3.695, 3.384, 3.163, 2.997, 2.867, 2.762, 2.675,},
89  { 7.129, 5.021, 4.167, 3.688, 3.377, 3.156, 2.990, 2.860, 2.755, 2.668,},
90  { 7.119, 5.013, 4.159, 3.681, 3.370, 3.149, 2.983, 2.853, 2.748, 2.662,},
91  { 7.110, 5.006, 4.152, 3.674, 3.363, 3.143, 2.977, 2.847, 2.742, 2.655,},
92  { 7.102, 4.998, 4.145, 3.667, 3.357, 3.136, 2.971, 2.841, 2.736, 2.649,},
93  { 7.093, 4.991, 4.138, 3.661, 3.351, 3.130, 2.965, 2.835, 2.730, 2.643,},
94  { 7.085, 4.984, 4.132, 3.655, 3.345, 3.124, 2.959, 2.829, 2.724, 2.637,},
95  { 7.077, 4.977, 4.126, 3.649, 3.339, 3.119, 2.953, 2.823, 2.718, 2.632,},
96  { 7.070, 4.971, 4.120, 3.643, 3.333, 3.113, 2.948, 2.818, 2.713, 2.626,},
97  { 7.062, 4.965, 4.114, 3.638, 3.328, 3.108, 2.942, 2.813, 2.708, 2.621,},
98  { 7.055, 4.959, 4.109, 3.632, 3.323, 3.103, 2.937, 2.808, 2.703, 2.616,},
99  { 7.048, 4.953, 4.103, 3.627, 3.318, 3.098, 2.932, 2.803, 2.698, 2.611,},
100  { 7.042, 4.947, 4.098, 3.622, 3.313, 3.093, 2.928, 2.798, 2.693, 2.607,},
101  { 7.035, 4.942, 4.093, 3.618, 3.308, 3.088, 2.923, 2.793, 2.689, 2.602,},
102  { 7.029, 4.937, 4.088, 3.613, 3.304, 3.084, 2.919, 2.789, 2.684, 2.598,},
103  { 7.023, 4.932, 4.083, 3.608, 3.299, 3.080, 2.914, 2.785, 2.680, 2.593,},
104  { 7.017, 4.927, 4.079, 3.604, 3.295, 3.075, 2.910, 2.781, 2.676, 2.589,},
105  { 7.011, 4.922, 4.074, 3.600, 3.291, 3.071, 2.906, 2.777, 2.672, 2.585,},
106  { 7.006, 4.917, 4.070, 3.596, 3.287, 3.067, 2.902, 2.773, 2.668, 2.581,},
107  { 7.001, 4.913, 4.066, 3.591, 3.283, 3.063, 2.898, 2.769, 2.664, 2.578,},
108  { 6.995, 4.908, 4.062, 3.588, 3.279, 3.060, 2.895, 2.765, 2.660, 2.574,},
109  { 6.990, 4.904, 4.058, 3.584, 3.275, 3.056, 2.891, 2.762, 2.657, 2.570,},
110  { 6.985, 4.900, 4.054, 3.580, 3.272, 3.052, 2.887, 2.758, 2.653, 2.567,},
111  { 6.981, 4.896, 4.050, 3.577, 3.268, 3.049, 2.884, 2.755, 2.650, 2.563,},
112  { 6.976, 4.892, 4.047, 3.573, 3.265, 3.046, 2.881, 2.751, 2.647, 2.560,},
113  { 6.971, 4.888, 4.043, 3.570, 3.261, 3.042, 2.877, 2.748, 2.644, 2.557,},
114  { 6.967, 4.884, 4.040, 3.566, 3.258, 3.039, 2.874, 2.745, 2.640, 2.554,},
115  { 6.963, 4.881, 4.036, 3.563, 3.255, 3.036, 2.871, 2.742, 2.637, 2.551,},
116  { 6.958, 4.877, 4.033, 3.560, 3.252, 3.033, 2.868, 2.739, 2.634, 2.548,},
117  { 6.954, 4.874, 4.030, 3.557, 3.249, 3.030, 2.865, 2.736, 2.632, 2.545,},
118  { 6.950, 4.870, 4.027, 3.554, 3.246, 3.027, 2.863, 2.733, 2.629, 2.542,},
119  { 6.947, 4.867, 4.024, 3.551, 3.243, 3.025, 2.860, 2.731, 2.626, 2.539,},
120  { 6.943, 4.864, 4.021, 3.548, 3.240, 3.022, 2.857, 2.728, 2.623, 2.537,},
121  { 6.939, 4.861, 4.018, 3.545, 3.238, 3.019, 2.854, 2.725, 2.621, 2.534,},
122  { 6.935, 4.858, 4.015, 3.543, 3.235, 3.017, 2.852, 2.723, 2.618, 2.532,},
123  { 6.932, 4.855, 4.012, 3.540, 3.233, 3.014, 2.849, 2.720, 2.616, 2.529,},
124  { 6.928, 4.852, 4.010, 3.538, 3.230, 3.012, 2.847, 2.718, 2.613, 2.527,},
125  { 6.925, 4.849, 4.007, 3.535, 3.228, 3.009, 2.845, 2.715, 2.611, 2.524,},
126  { 6.922, 4.846, 4.004, 3.533, 3.225, 3.007, 2.842, 2.713, 2.609, 2.522,},
127  { 6.919, 4.844, 4.002, 3.530, 3.223, 3.004, 2.840, 2.711, 2.606, 2.520,},
128  { 6.915, 4.841, 3.999, 3.528, 3.221, 3.002, 2.838, 2.709, 2.604, 2.518,},
129  { 6.912, 4.838, 3.997, 3.525, 3.218, 3.000, 2.835, 2.706, 2.602, 2.515,},
130  { 6.909, 4.836, 3.995, 3.523, 3.216, 2.998, 2.833, 2.704, 2.600, 2.513,},
131  { 6.906, 4.833, 3.992, 3.521, 3.214, 2.996, 2.831, 2.702, 2.598, 2.511,},
132  { 6.904, 4.831, 3.990, 3.519, 3.212, 2.994, 2.829, 2.700, 2.596, 2.509,},
133  { 6.901, 4.829, 3.988, 3.517, 3.210, 2.992, 2.827, 2.698, 2.594, 2.507,},
134  { 6.898, 4.826, 3.986, 3.515, 3.208, 2.990, 2.825, 2.696, 2.592, 2.505,},
135  { 6.895, 4.824, 3.984, 3.513, 3.206, 2.988, 2.823, 2.694, 2.590, 2.503}
136 };
137 
138 /* define the variance which will be used as a minimum variance for any
139  dimension of any feature. Since most features are calculated from numbers
140  with a precision no better than 1 in 128, the variance should never be
141  less than the square of this number for parameters whose range is 1. */
142 #define MINVARIANCE 0.0004
143 
144 /* define the absolute minimum number of samples which must be present in
145  order to accurately test hypotheses about underlying probability
146  distributions. Define separately the minimum samples that are needed
147  before a statistical analysis is attempted; this number should be
148  equal to MINSAMPLES but can be set to a lower number for early testing
149  when very few samples are available. */
150 #define MINSAMPLESPERBUCKET 5
151 #define MINSAMPLES (MINBUCKETS * MINSAMPLESPERBUCKET)
152 #define MINSAMPLESNEEDED 1
153 
154 /* define the size of the table which maps normalized samples to
155  histogram buckets. Also define the number of standard deviations
156  in a normal distribution which are considered to be significant.
157  The mapping table will be defined in such a way that it covers
158  the specified number of standard deviations on either side of
159  the mean. BUCKETTABLESIZE should always be even. */
160 #define BUCKETTABLESIZE 1024
161 #define NORMALEXTENT 3.0
162 
163 struct TEMPCLUSTER {
166 };
167 
170 
171 struct STATISTICS {
174  FLOAT32 *Min; // largest negative distance from the mean
175  FLOAT32 *Max; // largest positive distance from the mean
176 };
177 
178 struct BUCKETS {
179  DISTRIBUTION Distribution; // distribution being tested for
180  uinT32 SampleCount; // # of samples in histogram
181  FLOAT64 Confidence; // confidence level of test
182  FLOAT64 ChiSquared; // test threshold
183  uinT16 NumberOfBuckets; // number of cells in histogram
184  uinT16 Bucket[BUCKETTABLESIZE];// mapping to histogram buckets
185  uinT32 *Count; // frequency of occurence histogram
186  FLOAT32 *ExpectedCount; // expected histogram
187 };
188 
189 struct CHISTRUCT{
193 };
194 
195 // For use with KDWalk / MakePotentialClusters
197  ClusterHeap *heap; // heap used to hold temp clusters, "best" on top
198  TEMPCLUSTER *candidates; // array of potential clusters
199  KDTREE *tree; // kd-tree to be searched for neighbors
200  inT32 next; // next candidate to be used
201 };
202 
203 typedef FLOAT64 (*DENSITYFUNC) (inT32);
204 typedef FLOAT64 (*SOLVEFUNC) (CHISTRUCT *, double);
205 
206 #define Odd(N) ((N)%2)
207 #define Mirror(N,R) ((R) - (N) - 1)
208 #define Abs(N) ( ( (N) < 0 ) ? ( -(N) ) : (N) )
209 
210 //--------------Global Data Definitions and Declarations----------------------
211 /* the following variables describe a discrete normal distribution
212  which is used by NormalDensity() and NormalBucket(). The
213  constant NORMALEXTENT determines how many standard
214  deviations of the distribution are mapped onto the fixed
215  discrete range of x. x=0 is mapped to -NORMALEXTENT standard
216  deviations and x=BUCKETTABLESIZE is mapped to
217  +NORMALEXTENT standard deviations. */
218 #define SqrtOf2Pi 2.506628275
219 static const FLOAT64 kNormalStdDev = BUCKETTABLESIZE / (2.0 * NORMALEXTENT);
220 static const FLOAT64 kNormalVariance =
222 static const FLOAT64 kNormalMagnitude =
223  (2.0 * NORMALEXTENT) / (SqrtOf2Pi * BUCKETTABLESIZE);
224 static const FLOAT64 kNormalMean = BUCKETTABLESIZE / 2;
225 
226 /* define lookup tables used to compute the number of histogram buckets
227  that should be used for a given number of samples. */
228 #define LOOKUPTABLESIZE 8
229 #define MAXDEGREESOFFREEDOM MAXBUCKETS
230 
231 static const uinT32 kCountTable[LOOKUPTABLESIZE] = {
232  MINSAMPLES, 200, 400, 600, 800, 1000, 1500, 2000
233 }; // number of samples
234 
235 static const uinT16 kBucketsTable[LOOKUPTABLESIZE] = {
236  MINBUCKETS, 16, 20, 24, 27, 30, 35, MAXBUCKETS
237 }; // number of buckets
238 
239 /*-------------------------------------------------------------------------
240  Private Function Prototypes
241 --------------------------------------------------------------------------*/
242 void CreateClusterTree(CLUSTERER *Clusterer);
243 
245  inT32 Level);
246 
248  CLUSTER *Cluster,
249  FLOAT32 *Distance);
250 
251 CLUSTER *MakeNewCluster(CLUSTERER *Clusterer, TEMPCLUSTER *TempCluster);
252 
254 register PARAM_DESC ParamDesc[],
255 register inT32 n1,
256 register inT32 n2,
257 register FLOAT32 m[],
258 register FLOAT32 m1[], register FLOAT32 m2[]);
259 
261 
262 PROTOTYPE *MakePrototype(CLUSTERER *Clusterer,
263  CLUSTERCONFIG *Config,
264  CLUSTER *Cluster);
265 
267  CLUSTER *Cluster,
268  STATISTICS *Statistics,
269  PROTOSTYLE Style,
270  inT32 MinSamples);
271 
273  CLUSTERCONFIG *Config,
274  CLUSTER *Cluster,
275  STATISTICS *Statistics);
276 
278  CLUSTER *Cluster,
279  STATISTICS *Statistics,
280  BUCKETS *Buckets);
281 
283  CLUSTER *Cluster,
284  STATISTICS *Statistics,
285  BUCKETS *Buckets);
286 
288  CLUSTER *Cluster,
289  STATISTICS *Statistics,
290  BUCKETS *NormalBuckets,
291  FLOAT64 Confidence);
292 
293 void MakeDimRandom(uinT16 i, PROTOTYPE *Proto, PARAM_DESC *ParamDesc);
294 
295 void MakeDimUniform(uinT16 i, PROTOTYPE *Proto, STATISTICS *Statistics);
296 
298 PARAM_DESC ParamDesc[], CLUSTER * Cluster);
299 
301  CLUSTER *Cluster,
302  STATISTICS *Statistics);
303 
305  CLUSTER *Cluster,
306  STATISTICS *Statistics);
307 
308 PROTOTYPE *NewMixedProto(inT16 N, CLUSTER *Cluster, STATISTICS *Statistics);
309 
310 PROTOTYPE *NewSimpleProto(inT16 N, CLUSTER *Cluster);
311 
312 BOOL8 Independent (PARAM_DESC ParamDesc[],
313 inT16 N, FLOAT32 * CoVariance, FLOAT32 Independence);
314 
315 BUCKETS *GetBuckets(CLUSTERER* clusterer,
316  DISTRIBUTION Distribution,
317  uinT32 SampleCount,
318  FLOAT64 Confidence);
319 
320 BUCKETS *MakeBuckets(DISTRIBUTION Distribution,
321  uinT32 SampleCount,
322  FLOAT64 Confidence);
323 
325 
327 
329 
331 
333 
334 void FillBuckets(BUCKETS *Buckets,
335  CLUSTER *Cluster,
336  uinT16 Dim,
337  PARAM_DESC *ParamDesc,
338  FLOAT32 Mean,
339  FLOAT32 StdDev);
340 
341 uinT16 NormalBucket(PARAM_DESC *ParamDesc,
342  FLOAT32 x,
343  FLOAT32 Mean,
344  FLOAT32 StdDev);
345 
346 uinT16 UniformBucket(PARAM_DESC *ParamDesc,
347  FLOAT32 x,
348  FLOAT32 Mean,
349  FLOAT32 StdDev);
350 
351 BOOL8 DistributionOK(BUCKETS *Buckets);
352 
353 void FreeStatistics(STATISTICS *Statistics);
354 
355 void FreeBuckets(BUCKETS *Buckets);
356 
357 void FreeCluster(CLUSTER *Cluster);
358 
359 uinT16 DegreesOfFreedom(DISTRIBUTION Distribution, uinT16 HistogramBuckets);
360 
361 int NumBucketsMatch(void *arg1, // BUCKETS *Histogram,
362  void *arg2); // uinT16 *DesiredNumberOfBuckets);
363 
364 int ListEntryMatch(void *arg1, void *arg2);
365 
366 void AdjustBuckets(BUCKETS *Buckets, uinT32 NewSampleCount);
367 
368 void InitBuckets(BUCKETS *Buckets);
369 
370 int AlphaMatch(void *arg1, // CHISTRUCT *ChiStruct,
371  void *arg2); // CHISTRUCT *SearchKey);
372 
373 CHISTRUCT *NewChiStruct(uinT16 DegreesOfFreedom, FLOAT64 Alpha);
374 
375 FLOAT64 Solve(SOLVEFUNC Function,
376  void *FunctionParams,
377  FLOAT64 InitialGuess,
378  FLOAT64 Accuracy);
379 
380 FLOAT64 ChiArea(CHISTRUCT *ChiParams, FLOAT64 x);
381 
383  CLUSTER *Cluster,
384  FLOAT32 MaxIllegal);
385 
386 double InvertMatrix(const float* input, int size, float* inv);
387 
388 //--------------------------Public Code--------------------------------------
398 CLUSTERER *
399 MakeClusterer (inT16 SampleSize, const PARAM_DESC ParamDesc[]) {
400  CLUSTERER *Clusterer;
401  int i;
402 
403  // allocate main clusterer data structure and init simple fields
404  Clusterer = (CLUSTERER *) Emalloc (sizeof (CLUSTERER));
405  Clusterer->SampleSize = SampleSize;
406  Clusterer->NumberOfSamples = 0;
407  Clusterer->NumChar = 0;
408 
409  // init fields which will not be used initially
410  Clusterer->Root = NULL;
411  Clusterer->ProtoList = NIL_LIST;
412 
413  // maintain a copy of param descriptors in the clusterer data structure
414  Clusterer->ParamDesc =
415  (PARAM_DESC *) Emalloc (SampleSize * sizeof (PARAM_DESC));
416  for (i = 0; i < SampleSize; i++) {
417  Clusterer->ParamDesc[i].Circular = ParamDesc[i].Circular;
418  Clusterer->ParamDesc[i].NonEssential = ParamDesc[i].NonEssential;
419  Clusterer->ParamDesc[i].Min = ParamDesc[i].Min;
420  Clusterer->ParamDesc[i].Max = ParamDesc[i].Max;
421  Clusterer->ParamDesc[i].Range = ParamDesc[i].Max - ParamDesc[i].Min;
422  Clusterer->ParamDesc[i].HalfRange = Clusterer->ParamDesc[i].Range / 2;
423  Clusterer->ParamDesc[i].MidRange =
424  (ParamDesc[i].Max + ParamDesc[i].Min) / 2;
425  }
426 
427  // allocate a kd tree to hold the samples
428  Clusterer->KDTree = MakeKDTree (SampleSize, ParamDesc);
429 
430  // Initialize cache of histogram buckets to minimize recomputing them.
431  for (int d = 0; d < DISTRIBUTION_COUNT; ++d) {
432  for (int c = 0; c < MAXBUCKETS + 1 - MINBUCKETS; ++c)
433  Clusterer->bucket_cache[d][c] = NULL;
434  }
435 
436  return Clusterer;
437 } // MakeClusterer
438 
439 
454 SAMPLE* MakeSample(CLUSTERER * Clusterer, const FLOAT32* Feature,
455  inT32 CharID) {
456  SAMPLE *Sample;
457  int i;
458 
459  // see if the samples have already been clustered - if so trap an error
460  if (Clusterer->Root != NULL)
462  "Can't add samples after they have been clustered");
463 
464  // allocate the new sample and initialize it
465  Sample = (SAMPLE *) Emalloc (sizeof (SAMPLE) +
466  (Clusterer->SampleSize -
467  1) * sizeof (FLOAT32));
468  Sample->Clustered = FALSE;
469  Sample->Prototype = FALSE;
470  Sample->SampleCount = 1;
471  Sample->Left = NULL;
472  Sample->Right = NULL;
473  Sample->CharID = CharID;
474 
475  for (i = 0; i < Clusterer->SampleSize; i++)
476  Sample->Mean[i] = Feature[i];
477 
478  // add the sample to the KD tree - keep track of the total # of samples
479  Clusterer->NumberOfSamples++;
480  KDStore (Clusterer->KDTree, Sample->Mean, (char *) Sample);
481  if (CharID >= Clusterer->NumChar)
482  Clusterer->NumChar = CharID + 1;
483 
484  // execute hook for monitoring clustering operation
485  // (*SampleCreationHook)( Sample );
486 
487  return (Sample);
488 } // MakeSample
489 
490 
509  //only create cluster tree if samples have never been clustered before
510  if (Clusterer->Root == NULL)
511  CreateClusterTree(Clusterer);
512 
513  //deallocate the old prototype list if one exists
514  FreeProtoList (&Clusterer->ProtoList);
515  Clusterer->ProtoList = NIL_LIST;
516 
517  //compute prototypes starting at the root node in the tree
518  ComputePrototypes(Clusterer, Config);
519  return (Clusterer->ProtoList);
520 } // ClusterSamples
521 
522 
536 void FreeClusterer(CLUSTERER *Clusterer) {
537  if (Clusterer != NULL) {
538  memfree (Clusterer->ParamDesc);
539  if (Clusterer->KDTree != NULL)
540  FreeKDTree (Clusterer->KDTree);
541  if (Clusterer->Root != NULL)
542  FreeCluster (Clusterer->Root);
543  // Free up all used buckets structures.
544  for (int d = 0; d < DISTRIBUTION_COUNT; ++d) {
545  for (int c = 0; c < MAXBUCKETS + 1 - MINBUCKETS; ++c)
546  if (Clusterer->bucket_cache[d][c] != NULL)
547  FreeBuckets(Clusterer->bucket_cache[d][c]);
548  }
549 
550  memfree(Clusterer);
551  }
552 } // FreeClusterer
553 
554 
564 void FreeProtoList(LIST *ProtoList) {
565  destroy_nodes(*ProtoList, FreePrototype);
566 } // FreeProtoList
567 
568 
579 void FreePrototype(void *arg) { //PROTOTYPE *Prototype)
580  PROTOTYPE *Prototype = (PROTOTYPE *) arg;
581 
582  // unmark the corresponding cluster (if there is one
583  if (Prototype->Cluster != NULL)
584  Prototype->Cluster->Prototype = FALSE;
585 
586  // deallocate the prototype statistics and then the prototype itself
587  if (Prototype->Distrib != NULL)
588  memfree (Prototype->Distrib);
589  if (Prototype->Mean != NULL)
590  memfree (Prototype->Mean);
591  if (Prototype->Style != spherical) {
592  if (Prototype->Variance.Elliptical != NULL)
593  memfree (Prototype->Variance.Elliptical);
594  if (Prototype->Magnitude.Elliptical != NULL)
595  memfree (Prototype->Magnitude.Elliptical);
596  if (Prototype->Weight.Elliptical != NULL)
597  memfree (Prototype->Weight.Elliptical);
598  }
599  memfree(Prototype);
600 } // FreePrototype
601 
602 
618 CLUSTER *NextSample(LIST *SearchState) {
619  CLUSTER *Cluster;
620 
621  if (*SearchState == NIL_LIST)
622  return (NULL);
623  Cluster = (CLUSTER *) first_node (*SearchState);
624  *SearchState = pop (*SearchState);
625  while (TRUE) {
626  if (Cluster->Left == NULL)
627  return (Cluster);
628  *SearchState = push (*SearchState, Cluster->Right);
629  Cluster = Cluster->Left;
630  }
631 } // NextSample
632 
633 
643 FLOAT32 Mean(PROTOTYPE *Proto, uinT16 Dimension) {
644  return (Proto->Mean[Dimension]);
645 } // Mean
646 
647 
658  switch (Proto->Style) {
659  case spherical:
660  return ((FLOAT32) sqrt ((double) Proto->Variance.Spherical));
661  case elliptical:
662  return ((FLOAT32)
663  sqrt ((double) Proto->Variance.Elliptical[Dimension]));
664  case mixed:
665  switch (Proto->Distrib[Dimension]) {
666  case normal:
667  return ((FLOAT32)
668  sqrt ((double) Proto->Variance.Elliptical[Dimension]));
669  case uniform:
670  case D_random:
671  return (Proto->Variance.Elliptical[Dimension]);
672  case DISTRIBUTION_COUNT:
673  ASSERT_HOST(!"Distribution count not allowed!");
674  }
675  }
676  return 0.0f;
677 } // StandardDeviation
678 
679 
680 /*---------------------------------------------------------------------------
681  Private Code
682 ----------------------------------------------------------------------------*/
698 void CreateClusterTree(CLUSTERER *Clusterer) {
699  ClusteringContext context;
700  ClusterPair HeapEntry;
701  TEMPCLUSTER *PotentialCluster;
702 
703  // each sample and its nearest neighbor form a "potential" cluster
704  // save these in a heap with the "best" potential clusters on top
705  context.tree = Clusterer->KDTree;
706  context.candidates = (TEMPCLUSTER *)
707  Emalloc(Clusterer->NumberOfSamples * sizeof(TEMPCLUSTER));
708  context.next = 0;
709  context.heap = new ClusterHeap(Clusterer->NumberOfSamples);
710  KDWalk(context.tree, (void_proc)MakePotentialClusters, &context);
711 
712  // form potential clusters into actual clusters - always do "best" first
713  while (context.heap->Pop(&HeapEntry)) {
714  PotentialCluster = HeapEntry.data;
715 
716  // if main cluster of potential cluster is already in another cluster
717  // then we don't need to worry about it
718  if (PotentialCluster->Cluster->Clustered) {
719  continue;
720  }
721 
722  // if main cluster is not yet clustered, but its nearest neighbor is
723  // then we must find a new nearest neighbor
724  else if (PotentialCluster->Neighbor->Clustered) {
725  PotentialCluster->Neighbor =
726  FindNearestNeighbor(context.tree, PotentialCluster->Cluster,
727  &HeapEntry.key);
728  if (PotentialCluster->Neighbor != NULL) {
729  context.heap->Push(&HeapEntry);
730  }
731  }
732 
733  // if neither cluster is already clustered, form permanent cluster
734  else {
735  PotentialCluster->Cluster =
736  MakeNewCluster(Clusterer, PotentialCluster);
737  PotentialCluster->Neighbor =
738  FindNearestNeighbor(context.tree, PotentialCluster->Cluster,
739  &HeapEntry.key);
740  if (PotentialCluster->Neighbor != NULL) {
741  context.heap->Push(&HeapEntry);
742  }
743  }
744  }
745 
746  // the root node in the cluster tree is now the only node in the kd-tree
747  Clusterer->Root = (CLUSTER *) RootOf(Clusterer->KDTree);
748 
749  // free up the memory used by the K-D tree, heap, and temp clusters
750  FreeKDTree(context.tree);
751  Clusterer->KDTree = NULL;
752  delete context.heap;
753  memfree(context.candidates);
754 } // CreateClusterTree
755 
756 
769  CLUSTER *Cluster, inT32 Level) {
770  ClusterPair HeapEntry;
771  int next = context->next;
772  context->candidates[next].Cluster = Cluster;
773  HeapEntry.data = &(context->candidates[next]);
774  context->candidates[next].Neighbor =
775  FindNearestNeighbor(context->tree,
776  context->candidates[next].Cluster,
777  &HeapEntry.key);
778  if (context->candidates[next].Neighbor != NULL) {
779  context->heap->Push(&HeapEntry);
780  context->next++;
781  }
782 } // MakePotentialClusters
783 
784 
801 CLUSTER *
802 FindNearestNeighbor(KDTREE * Tree, CLUSTER * Cluster, FLOAT32 * Distance)
803 #define MAXNEIGHBORS 2
804 #define MAXDISTANCE MAX_FLOAT32
805 {
807  FLOAT32 Dist[MAXNEIGHBORS];
808  int NumberOfNeighbors;
809  inT32 i;
810  CLUSTER *BestNeighbor;
811 
812  // find the 2 nearest neighbors of the cluster
814  &NumberOfNeighbors, (void **)Neighbor, Dist);
815 
816  // search for the nearest neighbor that is not the cluster itself
817  *Distance = MAXDISTANCE;
818  BestNeighbor = NULL;
819  for (i = 0; i < NumberOfNeighbors; i++) {
820  if ((Dist[i] < *Distance) && (Neighbor[i] != Cluster)) {
821  *Distance = Dist[i];
822  BestNeighbor = Neighbor[i];
823  }
824  }
825  return BestNeighbor;
826 } // FindNearestNeighbor
827 
828 
841 CLUSTER *MakeNewCluster(CLUSTERER *Clusterer, TEMPCLUSTER *TempCluster) {
842  CLUSTER *Cluster;
843 
844  // allocate the new cluster and initialize it
845  Cluster = (CLUSTER *) Emalloc(
846  sizeof(CLUSTER) + (Clusterer->SampleSize - 1) * sizeof(FLOAT32));
847  Cluster->Clustered = FALSE;
848  Cluster->Prototype = FALSE;
849  Cluster->Left = TempCluster->Cluster;
850  Cluster->Right = TempCluster->Neighbor;
851  Cluster->CharID = -1;
852 
853  // mark the old clusters as "clustered" and delete them from the kd-tree
854  Cluster->Left->Clustered = TRUE;
855  Cluster->Right->Clustered = TRUE;
856  KDDelete(Clusterer->KDTree, Cluster->Left->Mean, Cluster->Left);
857  KDDelete(Clusterer->KDTree, Cluster->Right->Mean, Cluster->Right);
858 
859  // compute the mean and sample count for the new cluster
860  Cluster->SampleCount =
861  MergeClusters(Clusterer->SampleSize, Clusterer->ParamDesc,
862  Cluster->Left->SampleCount, Cluster->Right->SampleCount,
863  Cluster->Mean, Cluster->Left->Mean, Cluster->Right->Mean);
864 
865  // add the new cluster to the KD tree
866  KDStore(Clusterer->KDTree, Cluster->Mean, Cluster);
867  return Cluster;
868 } // MakeNewCluster
869 
870 
887  PARAM_DESC ParamDesc[],
888  inT32 n1,
889  inT32 n2,
890  FLOAT32 m[],
891  FLOAT32 m1[], FLOAT32 m2[]) {
892  inT32 i, n;
893 
894  n = n1 + n2;
895  for (i = N; i > 0; i--, ParamDesc++, m++, m1++, m2++) {
896  if (ParamDesc->Circular) {
897  // if distance between means is greater than allowed
898  // reduce upper point by one "rotation" to compute mean
899  // then normalize the mean back into the accepted range
900  if ((*m2 - *m1) > ParamDesc->HalfRange) {
901  *m = (n1 * *m1 + n2 * (*m2 - ParamDesc->Range)) / n;
902  if (*m < ParamDesc->Min)
903  *m += ParamDesc->Range;
904  }
905  else if ((*m1 - *m2) > ParamDesc->HalfRange) {
906  *m = (n1 * (*m1 - ParamDesc->Range) + n2 * *m2) / n;
907  if (*m < ParamDesc->Min)
908  *m += ParamDesc->Range;
909  }
910  else
911  *m = (n1 * *m1 + n2 * *m2) / n;
912  }
913  else
914  *m = (n1 * *m1 + n2 * *m2) / n;
915  }
916  return n;
917 } // MergeClusters
918 
919 
931 void ComputePrototypes(CLUSTERER *Clusterer, CLUSTERCONFIG *Config) {
932  LIST ClusterStack = NIL_LIST;
933  CLUSTER *Cluster;
934  PROTOTYPE *Prototype;
935 
936  // use a stack to keep track of clusters waiting to be processed
937  // initially the only cluster on the stack is the root cluster
938  if (Clusterer->Root != NULL)
939  ClusterStack = push (NIL_LIST, Clusterer->Root);
940 
941  // loop until we have analyzed all clusters which are potential prototypes
942  while (ClusterStack != NIL_LIST) {
943  // remove the next cluster to be analyzed from the stack
944  // try to make a prototype from the cluster
945  // if successful, put it on the proto list, else split the cluster
946  Cluster = (CLUSTER *) first_node (ClusterStack);
947  ClusterStack = pop (ClusterStack);
948  Prototype = MakePrototype(Clusterer, Config, Cluster);
949  if (Prototype != NULL) {
950  Clusterer->ProtoList = push (Clusterer->ProtoList, Prototype);
951  }
952  else {
953  ClusterStack = push (ClusterStack, Cluster->Right);
954  ClusterStack = push (ClusterStack, Cluster->Left);
955  }
956  }
957 } // ComputePrototypes
958 
959 
979  CLUSTERCONFIG *Config,
980  CLUSTER *Cluster) {
981  STATISTICS *Statistics;
982  PROTOTYPE *Proto;
983  BUCKETS *Buckets;
984 
985  // filter out clusters which contain samples from the same character
986  if (MultipleCharSamples (Clusterer, Cluster, Config->MaxIllegal))
987  return NULL;
988 
989  // compute the covariance matrix and ranges for the cluster
990  Statistics =
991  ComputeStatistics(Clusterer->SampleSize, Clusterer->ParamDesc, Cluster);
992 
993  // check for degenerate clusters which need not be analyzed further
994  // note that the MinSamples test assumes that all clusters with multiple
995  // character samples have been removed (as above)
996  Proto = MakeDegenerateProto(
997  Clusterer->SampleSize, Cluster, Statistics, Config->ProtoStyle,
998  (inT32) (Config->MinSamples * Clusterer->NumChar));
999  if (Proto != NULL) {
1000  FreeStatistics(Statistics);
1001  return Proto;
1002  }
1003  // check to ensure that all dimensions are independent
1004  if (!Independent(Clusterer->ParamDesc, Clusterer->SampleSize,
1005  Statistics->CoVariance, Config->Independence)) {
1006  FreeStatistics(Statistics);
1007  return NULL;
1008  }
1009 
1010  if (HOTELLING && Config->ProtoStyle == elliptical) {
1011  Proto = TestEllipticalProto(Clusterer, Config, Cluster, Statistics);
1012  if (Proto != NULL) {
1013  FreeStatistics(Statistics);
1014  return Proto;
1015  }
1016  }
1017 
1018  // create a histogram data structure used to evaluate distributions
1019  Buckets = GetBuckets(Clusterer, normal, Cluster->SampleCount,
1020  Config->Confidence);
1021 
1022  // create a prototype based on the statistics and test it
1023  switch (Config->ProtoStyle) {
1024  case spherical:
1025  Proto = MakeSphericalProto(Clusterer, Cluster, Statistics, Buckets);
1026  break;
1027  case elliptical:
1028  Proto = MakeEllipticalProto(Clusterer, Cluster, Statistics, Buckets);
1029  break;
1030  case mixed:
1031  Proto = MakeMixedProto(Clusterer, Cluster, Statistics, Buckets,
1032  Config->Confidence);
1033  break;
1034  case automatic:
1035  Proto = MakeSphericalProto(Clusterer, Cluster, Statistics, Buckets);
1036  if (Proto != NULL)
1037  break;
1038  Proto = MakeEllipticalProto(Clusterer, Cluster, Statistics, Buckets);
1039  if (Proto != NULL)
1040  break;
1041  Proto = MakeMixedProto(Clusterer, Cluster, Statistics, Buckets,
1042  Config->Confidence);
1043  break;
1044  }
1045  FreeStatistics(Statistics);
1046  return Proto;
1047 } // MakePrototype
1048 
1049 
1071 PROTOTYPE *MakeDegenerateProto( //this was MinSample
1072  uinT16 N,
1073  CLUSTER *Cluster,
1074  STATISTICS *Statistics,
1075  PROTOSTYLE Style,
1076  inT32 MinSamples) {
1077  PROTOTYPE *Proto = NULL;
1078 
1079  if (MinSamples < MINSAMPLESNEEDED)
1080  MinSamples = MINSAMPLESNEEDED;
1081 
1082  if (Cluster->SampleCount < MinSamples) {
1083  switch (Style) {
1084  case spherical:
1085  Proto = NewSphericalProto (N, Cluster, Statistics);
1086  break;
1087  case elliptical:
1088  case automatic:
1089  Proto = NewEllipticalProto (N, Cluster, Statistics);
1090  break;
1091  case mixed:
1092  Proto = NewMixedProto (N, Cluster, Statistics);
1093  break;
1094  }
1095  Proto->Significant = FALSE;
1096  }
1097  return (Proto);
1098 } // MakeDegenerateProto
1099 
                               CLUSTERCONFIG *Config,
                               CLUSTER *Cluster,
                               STATISTICS *Statistics) {
  // Fraction of the number of samples used as a range around 1 within
  // which a cluster has the magic size that allows a boost to the
  // FTable by kFTableBoostMargin, thus allowing clusters near the
  // magic size (equal to the number of sample characters) to be more
  // likely to stay together.
  const double kMagicSampleMargin = 0.0625;
  const double kFTableBoostMargin = 2.0;

  int N = Clusterer->SampleSize;
  CLUSTER* Left = Cluster->Left;
  CLUSTER* Right = Cluster->Right;
  // the split test needs both child clusters to exist
  if (Left == NULL || Right == NULL)
    return NULL;
  int TotalDims = Left->SampleCount + Right->SampleCount;
  // need more samples than dimensions for the test to be meaningful
  if (TotalDims < N + 1 || TotalDims < 2)
    return NULL;
  const int kMatrixSize = N * N * sizeof(FLOAT32);
  FLOAT32* Covariance = reinterpret_cast<FLOAT32 *>(Emalloc(kMatrixSize));
  FLOAT32* Inverse = reinterpret_cast<FLOAT32 *>(Emalloc(kMatrixSize));
  FLOAT32* Delta = reinterpret_cast<FLOAT32*>(Emalloc(N * sizeof(FLOAT32)));
  // Compute a new covariance matrix that only uses essential features.
  // Non-essential rows/columns are replaced by identity entries so the
  // matrix remains invertible.
  for (int i = 0; i < N; ++i) {
    int row_offset = i * N;
    if (!Clusterer->ParamDesc[i].NonEssential) {
      for (int j = 0; j < N; ++j) {
        if (!Clusterer->ParamDesc[j].NonEssential)
          Covariance[j + row_offset] = Statistics->CoVariance[j + row_offset];
        else
          Covariance[j + row_offset] = 0.0f;
      }
    } else {
      for (int j = 0; j < N; ++j) {
        if (i == j)
          Covariance[j + row_offset] = 1.0f;
        else
          Covariance[j + row_offset] = 0.0f;
      }
    }
  }
  double err = InvertMatrix(Covariance, N, Inverse);
  if (err > 1) {
    tprintf("Clustering error: Matrix inverse failed with error %g\n", err);
  }
  // Delta is the difference of the child-cluster means, restricted to
  // essential dimensions; EssentialN counts those dimensions.
  int EssentialN = 0;
  for (int dim = 0; dim < N; ++dim) {
    if (!Clusterer->ParamDesc[dim].NonEssential) {
      Delta[dim] = Left->Mean[dim] - Right->Mean[dim];
      ++EssentialN;
    } else {
      Delta[dim] = 0.0f;
    }
  }
  // Compute Hotelling's T-squared: Tsq = Delta' * Inverse * Delta.
  double Tsq = 0.0;
  for (int x = 0; x < N; ++x) {
    double temp = 0.0;
    for (int y = 0; y < N; ++y) {
      temp += Inverse[y + N*x] * Delta[y];
    }
    Tsq += Delta[x] * temp;
  }
  memfree(Covariance);
  memfree(Inverse);
  memfree(Delta);
  // Changed this function to match the formula in
  // Statistical Methods in Medical Research p 473
  // By Peter Armitage, Geoffrey Berry, J. N. S. Matthews.
  // Tsq *= Left->SampleCount * Right->SampleCount / TotalDims;
  double F = Tsq * (TotalDims - EssentialN - 1) / ((TotalDims - 2)*EssentialN);
  // look up the 1% critical value, clipping degrees of freedom to the
  // bounds of the precomputed FTable
  int Fx = EssentialN;
  if (Fx > FTABLE_X)
    Fx = FTABLE_X;
  --Fx;
  int Fy = TotalDims - EssentialN - 1;
  if (Fy > FTABLE_Y)
    Fy = FTABLE_Y;
  --Fy;
  double FTarget = FTable[Fy][Fx];
  if (Config->MagicSamples > 0 &&
      TotalDims >= Config->MagicSamples * (1.0 - kMagicSampleMargin) &&
      TotalDims <= Config->MagicSamples * (1.0 + kMagicSampleMargin)) {
    // Give magic-sized clusters a magic FTable boost.
    FTarget += kFTableBoostMargin;
  }
  // F below the critical value: the split is not significant, so keep
  // the parent as a single elliptical prototype
  if (F < FTarget) {
    return NewEllipticalProto (Clusterer->SampleSize, Cluster, Statistics);
  }
  return NULL;
}
1206 
1207 /* MakeSphericalProto *******************************************************
1208 Parameters: Clusterer data struct containing samples being clustered
1209  Cluster cluster to be made into a spherical prototype
1210  Statistics statistical info about cluster
1211  Buckets histogram struct used to analyze distribution
1212 Operation: This routine tests the specified cluster to see if it can
1213  be approximated by a spherical normal distribution. If it
1214  can be, then a new prototype is formed and returned to the
1215  caller. If it can't be, then NULL is returned to the caller.
1216 Return: Pointer to new spherical prototype or NULL.
1217 Exceptions: None
1218 History: 6/1/89, DSJ, Created.
1219 ******************************************************************************/
                                CLUSTER *Cluster,
                                STATISTICS *Statistics,
                                BUCKETS *Buckets) {
  PROTOTYPE *Proto = NULL;
  int i;

  // check that each dimension is a normal distribution
  for (i = 0; i < Clusterer->SampleSize; i++) {
    if (Clusterer->ParamDesc[i].NonEssential)
      continue;

    // spherical model: every dimension shares the same (average) variance
    FillBuckets (Buckets, Cluster, i, &(Clusterer->ParamDesc[i]),
                 Cluster->Mean[i],
                 sqrt ((FLOAT64) (Statistics->AvgVariance)));
    if (!DistributionOK (Buckets))
      break;
  }
  // if all dimensions matched a normal distribution, make a proto
  if (i >= Clusterer->SampleSize)
    Proto = NewSphericalProto (Clusterer->SampleSize, Cluster, Statistics);
  return (Proto);
}                                // MakeSphericalProto
1243 
1244 
                                 CLUSTER *Cluster,
                                 STATISTICS *Statistics,
                                 BUCKETS *Buckets) {
  PROTOTYPE *Proto = NULL;
  int i;

  // check that each dimension is a normal distribution
  for (i = 0; i < Clusterer->SampleSize; i++) {
    if (Clusterer->ParamDesc[i].NonEssential)
      continue;

    // elliptical model: each dimension uses its own variance, read from
    // the diagonal of the covariance matrix (stride SampleSize + 1)
    FillBuckets (Buckets, Cluster, i, &(Clusterer->ParamDesc[i]),
                 Cluster->Mean[i],
                 sqrt ((FLOAT64) Statistics->
                   CoVariance[i * (Clusterer->SampleSize + 1)]));
    if (!DistributionOK (Buckets))
      break;
  }
  // if all dimensions matched a normal distribution, make a proto
  if (i >= Clusterer->SampleSize)
    Proto = NewEllipticalProto (Clusterer->SampleSize, Cluster, Statistics);
  return (Proto);
}                                // MakeEllipticalProto
1282 
1283 
                            CLUSTER *Cluster,
                            STATISTICS *Statistics,
                            BUCKETS *NormalBuckets,
                            FLOAT64 Confidence) {
  PROTOTYPE *Proto;
  int i;
  BUCKETS *UniformBuckets = NULL;
  BUCKETS *RandomBuckets = NULL;

  // create a mixed proto to work on - initially assume all dimensions normal*/
  Proto = NewMixedProto (Clusterer->SampleSize, Cluster, Statistics);

  // find the proper distribution for each dimension
  for (i = 0; i < Clusterer->SampleSize; i++) {
    if (Clusterer->ParamDesc[i].NonEssential)
      continue;

    // first try to fit a normal distribution to this dimension
    FillBuckets (NormalBuckets, Cluster, i, &(Clusterer->ParamDesc[i]),
                 Proto->Mean[i],
                 sqrt ((FLOAT64) Proto->Variance.Elliptical[i]));
    if (DistributionOK (NormalBuckets))
      continue;

    // normal failed - fall back to a random distribution
    // (the bucket structures are created lazily, only when first needed)
    if (RandomBuckets == NULL)
      RandomBuckets =
        GetBuckets(Clusterer, D_random, Cluster->SampleCount, Confidence);
    MakeDimRandom (i, Proto, &(Clusterer->ParamDesc[i]));
    FillBuckets (RandomBuckets, Cluster, i, &(Clusterer->ParamDesc[i]),
                 Proto->Mean[i], Proto->Variance.Elliptical[i]);
    if (DistributionOK (RandomBuckets))
      continue;

    // random failed - try a uniform distribution as a last resort
    if (UniformBuckets == NULL)
      UniformBuckets =
        GetBuckets(Clusterer, uniform, Cluster->SampleCount, Confidence);
    MakeDimUniform(i, Proto, Statistics);
    FillBuckets (UniformBuckets, Cluster, i, &(Clusterer->ParamDesc[i]),
                 Proto->Mean[i], Proto->Variance.Elliptical[i]);
    if (DistributionOK (UniformBuckets))
      continue;
    break;
  }
  // if any dimension failed to match a distribution, discard the proto
  if (i < Clusterer->SampleSize) {
    FreePrototype(Proto);
    Proto = NULL;
  }
  return (Proto);
}                                // MakeMixedProto
1352 
1353 
1354 /* MakeDimRandom *************************************************************
1355 Parameters: i index of dimension to be changed
1356  Proto prototype whose dimension is to be altered
1357  ParamDesc description of specified dimension
1358 Operation: This routine alters the ith dimension of the specified
1359  mixed prototype to be D_random.
1360 Return: None
1361 Exceptions: None
1362 History: 6/20/89, DSJ, Created.
1363 ******************************************************************************/
1364 void MakeDimRandom(uinT16 i, PROTOTYPE *Proto, PARAM_DESC *ParamDesc) {
1365  Proto->Distrib[i] = D_random;
1366  Proto->Mean[i] = ParamDesc->MidRange;
1367  Proto->Variance.Elliptical[i] = ParamDesc->HalfRange;
1368 
1369  // subtract out the previous magnitude of this dimension from the total
1370  Proto->TotalMagnitude /= Proto->Magnitude.Elliptical[i];
1371  Proto->Magnitude.Elliptical[i] = 1.0 / ParamDesc->Range;
1372  Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
1373  Proto->LogMagnitude = log ((double) Proto->TotalMagnitude);
1374 
1375  // note that the proto Weight is irrelevant for D_random protos
1376 } // MakeDimRandom
1377 
1378 
1389 void MakeDimUniform(uinT16 i, PROTOTYPE *Proto, STATISTICS *Statistics) {
1390  Proto->Distrib[i] = uniform;
1391  Proto->Mean[i] = Proto->Cluster->Mean[i] +
1392  (Statistics->Min[i] + Statistics->Max[i]) / 2;
1393  Proto->Variance.Elliptical[i] =
1394  (Statistics->Max[i] - Statistics->Min[i]) / 2;
1395  if (Proto->Variance.Elliptical[i] < MINVARIANCE)
1396  Proto->Variance.Elliptical[i] = MINVARIANCE;
1397 
1398  // subtract out the previous magnitude of this dimension from the total
1399  Proto->TotalMagnitude /= Proto->Magnitude.Elliptical[i];
1400  Proto->Magnitude.Elliptical[i] =
1401  1.0 / (2.0 * Proto->Variance.Elliptical[i]);
1402  Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
1403  Proto->LogMagnitude = log ((double) Proto->TotalMagnitude);
1404 
1405  // note that the proto Weight is irrelevant for uniform protos
1406 } // MakeDimUniform
1407 
1408 
1425 STATISTICS *
1426 ComputeStatistics (inT16 N, PARAM_DESC ParamDesc[], CLUSTER * Cluster) {
1427  STATISTICS *Statistics;
1428  int i, j;
1429  FLOAT32 *CoVariance;
1430  FLOAT32 *Distance;
1431  LIST SearchState;
1432  SAMPLE *Sample;
1433  uinT32 SampleCountAdjustedForBias;
1434 
1435  // allocate memory to hold the statistics results
1436  Statistics = (STATISTICS *) Emalloc (sizeof (STATISTICS));
1437  Statistics->CoVariance = (FLOAT32 *) Emalloc (N * N * sizeof (FLOAT32));
1438  Statistics->Min = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
1439  Statistics->Max = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
1440 
1441  // allocate temporary memory to hold the sample to mean distances
1442  Distance = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
1443 
1444  // initialize the statistics
1445  Statistics->AvgVariance = 1.0;
1446  CoVariance = Statistics->CoVariance;
1447  for (i = 0; i < N; i++) {
1448  Statistics->Min[i] = 0.0;
1449  Statistics->Max[i] = 0.0;
1450  for (j = 0; j < N; j++, CoVariance++)
1451  *CoVariance = 0;
1452  }
1453  // find each sample in the cluster and merge it into the statistics
1454  InitSampleSearch(SearchState, Cluster);
1455  while ((Sample = NextSample (&SearchState)) != NULL) {
1456  for (i = 0; i < N; i++) {
1457  Distance[i] = Sample->Mean[i] - Cluster->Mean[i];
1458  if (ParamDesc[i].Circular) {
1459  if (Distance[i] > ParamDesc[i].HalfRange)
1460  Distance[i] -= ParamDesc[i].Range;
1461  if (Distance[i] < -ParamDesc[i].HalfRange)
1462  Distance[i] += ParamDesc[i].Range;
1463  }
1464  if (Distance[i] < Statistics->Min[i])
1465  Statistics->Min[i] = Distance[i];
1466  if (Distance[i] > Statistics->Max[i])
1467  Statistics->Max[i] = Distance[i];
1468  }
1469  CoVariance = Statistics->CoVariance;
1470  for (i = 0; i < N; i++)
1471  for (j = 0; j < N; j++, CoVariance++)
1472  *CoVariance += Distance[i] * Distance[j];
1473  }
1474  // normalize the variances by the total number of samples
1475  // use SampleCount-1 instead of SampleCount to get an unbiased estimate
1476  // also compute the geometic mean of the diagonal variances
1477  // ensure that clusters with only 1 sample are handled correctly
1478  if (Cluster->SampleCount > 1)
1479  SampleCountAdjustedForBias = Cluster->SampleCount - 1;
1480  else
1481  SampleCountAdjustedForBias = 1;
1482  CoVariance = Statistics->CoVariance;
1483  for (i = 0; i < N; i++)
1484  for (j = 0; j < N; j++, CoVariance++) {
1485  *CoVariance /= SampleCountAdjustedForBias;
1486  if (j == i) {
1487  if (*CoVariance < MINVARIANCE)
1488  *CoVariance = MINVARIANCE;
1489  Statistics->AvgVariance *= *CoVariance;
1490  }
1491  }
1492  Statistics->AvgVariance = (float)pow((double)Statistics->AvgVariance,
1493  1.0 / N);
1494 
1495  // release temporary memory and return
1496  memfree(Distance);
1497  return (Statistics);
1498 } // ComputeStatistics
1499 
1500 
                              CLUSTER *Cluster,
                              STATISTICS *Statistics) {
  PROTOTYPE *Proto;

  Proto = NewSimpleProto (N, Cluster);

  // spherical protos use a single variance: the geometric mean of the
  // per-dimension variances, clipped to MINVARIANCE
  Proto->Variance.Spherical = Statistics->AvgVariance;
  if (Proto->Variance.Spherical < MINVARIANCE)
    Proto->Variance.Spherical = MINVARIANCE;

  // normalizing constant of a 1-D normal density with this variance;
  // total magnitude is its N-th power (N independent dimensions)
  Proto->Magnitude.Spherical =
    1.0 / sqrt ((double) (2.0 * PI * Proto->Variance.Spherical));
  Proto->TotalMagnitude = (float)pow((double)Proto->Magnitude.Spherical,
                                     (double) N);
  Proto->Weight.Spherical = 1.0 / Proto->Variance.Spherical;
  Proto->LogMagnitude = log ((double) Proto->TotalMagnitude);

  return (Proto);
}                                // NewSphericalProto
1534 
1535 
                               CLUSTER *Cluster,
                               STATISTICS *Statistics) {
  PROTOTYPE *Proto;
  FLOAT32 *CoVariance;
  int i;

  Proto = NewSimpleProto (N, Cluster);
  Proto->Variance.Elliptical = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
  Proto->Magnitude.Elliptical = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
  Proto->Weight.Elliptical = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));

  // walk the diagonal of the covariance matrix (stride N + 1) to pick
  // up each dimension's variance, clipped to MINVARIANCE
  CoVariance = Statistics->CoVariance;
  Proto->TotalMagnitude = 1.0;
  for (i = 0; i < N; i++, CoVariance += N + 1) {
    Proto->Variance.Elliptical[i] = *CoVariance;
    if (Proto->Variance.Elliptical[i] < MINVARIANCE)
      Proto->Variance.Elliptical[i] = MINVARIANCE;

    // per-dimension normalizing constant of a 1-D normal density;
    // the total magnitude is the product over all dimensions
    Proto->Magnitude.Elliptical[i] =
      1.0 / sqrt ((double) (2.0 * PI * Proto->Variance.Elliptical[i]));
    Proto->Weight.Elliptical[i] = 1.0 / Proto->Variance.Elliptical[i];
    Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
  }
  Proto->LogMagnitude = log ((double) Proto->TotalMagnitude);
  Proto->Style = elliptical;
  return (Proto);
}                                // NewEllipticalProto
1576 
1577 
1593 PROTOTYPE *NewMixedProto(inT16 N, CLUSTER *Cluster, STATISTICS *Statistics) {
1594  PROTOTYPE *Proto;
1595  int i;
1596 
1597  Proto = NewEllipticalProto (N, Cluster, Statistics);
1598  Proto->Distrib = (DISTRIBUTION *) Emalloc (N * sizeof (DISTRIBUTION));
1599 
1600  for (i = 0; i < N; i++) {
1601  Proto->Distrib[i] = normal;
1602  }
1603  Proto->Style = mixed;
1604  return (Proto);
1605 } // NewMixedProto
1606 
1607 
  PROTOTYPE *Proto;
  int i;

  Proto = (PROTOTYPE *) Emalloc (sizeof (PROTOTYPE));
  Proto->Mean = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));

  // the proto mean starts out as a copy of the cluster mean
  for (i = 0; i < N; i++)
    Proto->Mean[i] = Cluster->Mean[i];
  Proto->Distrib = NULL;

  Proto->Significant = TRUE;
  Proto->Merged = FALSE;
  Proto->Style = spherical;      // default style; callers may override
  Proto->NumSamples = Cluster->SampleCount;
  Proto->Cluster = Cluster;
  Proto->Cluster->Prototype = TRUE;   // mark cluster as having a proto
  return (Proto);
}                                // NewSimpleProto
1637 
1638 
BOOL8
inT16 N, FLOAT32 * CoVariance, FLOAT32 Independence) {
  int i, j;
  FLOAT32 *VARii;                // points to ith on-diagonal element
  FLOAT32 *VARjj;                // points to jth on-diagonal element
  FLOAT32 CorrelationCoeff;

  // walk the upper triangle of the covariance matrix, checking each
  // off-diagonal element against the two corresponding variances;
  // non-essential dimensions are skipped entirely
  VARii = CoVariance;
  for (i = 0; i < N; i++, VARii += N + 1) {
    if (ParamDesc[i].NonEssential)
      continue;

    VARjj = VARii + N + 1;
    CoVariance = VARii + 1;
    for (j = i + 1; j < N; j++, CoVariance++, VARjj += N + 1) {
      if (ParamDesc[j].NonEssential)
        continue;

      // a zero variance makes the correlation undefined; treat the
      // pair as independent in that case
      if ((*VARii == 0.0) || (*VARjj == 0.0))
        CorrelationCoeff = 0.0;
      else
        // sqrt of |correlation coefficient|: cov^2 / (varI * varJ) is
        // the squared coefficient, so two sqrt's give its square root
        CorrelationCoeff =
          sqrt (sqrt (*CoVariance * *CoVariance / (*VARii * *VARjj)));
      if (CorrelationCoeff > Independence)
        return (FALSE);
    }
  }
  return (TRUE);
}                                // Independent
1689 
1690 
                    DISTRIBUTION Distribution,
                    uinT32 SampleCount,
                    FLOAT64 Confidence) {
  // Get an old bucket structure with the same number of buckets.
  uinT16 NumberOfBuckets = OptimumNumberOfBuckets(SampleCount);
  BUCKETS *Buckets =
    clusterer->bucket_cache[Distribution][NumberOfBuckets - MINBUCKETS];

  // If a matching bucket structure is not found, make one and save it.
  if (Buckets == NULL) {
    Buckets = MakeBuckets(Distribution, SampleCount, Confidence);
    clusterer->bucket_cache[Distribution][NumberOfBuckets - MINBUCKETS] =
        Buckets;
  } else {
    // Just adjust the existing buckets.
    if (SampleCount != Buckets->SampleCount)
      AdjustBuckets(Buckets, SampleCount);
    if (Confidence != Buckets->Confidence) {
      // the chi-squared threshold depends on the confidence level, so
      // it must be recomputed when the confidence changes
      Buckets->Confidence = Confidence;
      Buckets->ChiSquared = ComputeChiSquared(
          DegreesOfFreedom(Distribution, Buckets->NumberOfBuckets),
          Confidence);
    }
    InitBuckets(Buckets);
  }
  return Buckets;
}                                // GetBuckets
1737 
1738 
                      uinT32 SampleCount,
                      FLOAT64 Confidence) {
  // density functions indexed by DISTRIBUTION (normal, uniform, D_random)
  const DENSITYFUNC DensityFunction[] =
    { NormalDensity, UniformDensity, UniformDensity };
  int i, j;
  BUCKETS *Buckets;
  FLOAT64 BucketProbability;
  FLOAT64 NextBucketBoundary;
  FLOAT64 Probability;
  FLOAT64 ProbabilityDelta;
  FLOAT64 LastProbDensity;
  FLOAT64 ProbDensity;
  uinT16 CurrentBucket;
  BOOL8 Symmetrical;

  // allocate memory needed for data structure
  Buckets = reinterpret_cast<BUCKETS*>(Emalloc(sizeof(BUCKETS)));
  Buckets->NumberOfBuckets = OptimumNumberOfBuckets(SampleCount);
  Buckets->SampleCount = SampleCount;
  Buckets->Confidence = Confidence;
  Buckets->Count = reinterpret_cast<uinT32*>(
      Emalloc(Buckets->NumberOfBuckets * sizeof(uinT32)));
  Buckets->ExpectedCount = reinterpret_cast<FLOAT32*>(
      Emalloc(Buckets->NumberOfBuckets * sizeof(FLOAT32)));

  // initialize simple fields
  Buckets->Distribution = Distribution;
  for (i = 0; i < Buckets->NumberOfBuckets; i++) {
    Buckets->Count[i] = 0;
    Buckets->ExpectedCount[i] = 0.0;
  }

  // all currently defined distributions are symmetrical
  Symmetrical = TRUE;
  Buckets->ChiSquared = ComputeChiSquared(
      DegreesOfFreedom(Distribution, Buckets->NumberOfBuckets), Confidence);

  if (Symmetrical) {
    // allocate buckets so that all have approx. equal probability
    BucketProbability = 1.0 / (FLOAT64) (Buckets->NumberOfBuckets);

    // distribution is symmetric so fill in upper half then copy
    CurrentBucket = Buckets->NumberOfBuckets / 2;
    if (Odd (Buckets->NumberOfBuckets))
      NextBucketBoundary = BucketProbability / 2;
    else
      NextBucketBoundary = BucketProbability;

    // integrate the density function across the upper half of the
    // table, advancing to the next bucket whenever the accumulated
    // probability crosses a bucket boundary
    Probability = 0.0;
    LastProbDensity =
      (*DensityFunction[(int) Distribution]) (BUCKETTABLESIZE / 2);
    for (i = BUCKETTABLESIZE / 2; i < BUCKETTABLESIZE; i++) {
      ProbDensity = (*DensityFunction[(int) Distribution]) (i + 1);
      ProbabilityDelta = Integral (LastProbDensity, ProbDensity, 1.0);
      Probability += ProbabilityDelta;
      if (Probability > NextBucketBoundary) {
        if (CurrentBucket < Buckets->NumberOfBuckets - 1)
          CurrentBucket++;
        NextBucketBoundary += BucketProbability;
      }
      Buckets->Bucket[i] = CurrentBucket;
      Buckets->ExpectedCount[CurrentBucket] +=
        (FLOAT32) (ProbabilityDelta * SampleCount);
      LastProbDensity = ProbDensity;
    }
    // place any leftover probability into the last bucket
    Buckets->ExpectedCount[CurrentBucket] +=
      (FLOAT32) ((0.5 - Probability) * SampleCount);

    // copy upper half of distribution to lower half
    for (i = 0, j = BUCKETTABLESIZE - 1; i < j; i++, j--)
      Buckets->Bucket[i] =
        Mirror(Buckets->Bucket[j], Buckets->NumberOfBuckets);

    // copy upper half of expected counts to lower half
    for (i = 0, j = Buckets->NumberOfBuckets - 1; i <= j; i++, j--)
      Buckets->ExpectedCount[i] += Buckets->ExpectedCount[j];
  }
  return Buckets;
}                                // MakeBuckets
1840 
1841 
1842 //---------------------------------------------------------------------------
/*
 ** Parameters:
 ** SampleCount number of samples to be tested
 ** Operation:
 ** This routine computes the optimum number of histogram
 ** buckets that should be used in a chi-squared goodness of
 ** fit test for the specified number of samples. The optimum
 ** number is computed based on Table 4.1 on pg. 147 of
 ** "Measurement and Analysis of Random Data" by Bendat & Piersol.
 ** Linear interpolation is used to interpolate between table
 ** values. The table is intended for a 0.05 level of
 ** significance (alpha). This routine assumes that it is
 ** equally valid for other alpha's, which may not be true.
 ** Return:
 ** Optimum number of histogram buckets
 ** Exceptions:
 ** None
 ** History:
 ** 6/5/89, DSJ, Created.
 */
  uinT8 Last, Next;
  FLOAT32 Slope;

  // below the smallest tabulated sample count, use the smallest entry
  if (SampleCount < kCountTable[0])
    return kBucketsTable[0];

  // find the bracketing table entries and linearly interpolate
  for (Last = 0, Next = 1; Next < LOOKUPTABLESIZE; Last++, Next++) {
    if (SampleCount <= kCountTable[Next]) {
      Slope = (FLOAT32) (kBucketsTable[Next] - kBucketsTable[Last]) /
          (FLOAT32) (kCountTable[Next] - kCountTable[Last]);
      return ((uinT16) (kBucketsTable[Last] +
                        Slope * (SampleCount - kCountTable[Last])));
    }
  }
  // above the largest tabulated sample count, use the largest entry
  return kBucketsTable[Last];
}                                // OptimumNumberOfBuckets
1880 
1881 
1882 //---------------------------------------------------------------------------
FLOAT64
ComputeChiSquared (uinT16 DegreesOfFreedom, FLOAT64 Alpha)
/*
 ** Parameters:
 ** DegreesOfFreedom determines shape of distribution
 ** Alpha probability of right tail
 ** Operation:
 ** This routine computes the chi-squared value which will
 ** leave a cumulative probability of Alpha in the right tail
 ** of a chi-squared distribution with the specified number of
 ** degrees of freedom. Alpha must be between 0 and 1.
 ** DegreesOfFreedom must be even. The routine maintains an
 ** array of lists. Each list corresponds to a different
 ** number of degrees of freedom. Each entry in the list
 ** corresponds to a different alpha value and its corresponding
 ** chi-squared value. Therefore, once a particular chi-squared
 ** value is computed, it is stored in the list and never
 ** needs to be computed again.
 ** Return: Desired chi-squared value
 ** Exceptions: none
 ** History: 6/5/89, DSJ, Created.
 */
#define CHIACCURACY 0.01
#define MINALPHA (1e-200)
{
  // cache of previously solved chi-squared values, one list per
  // (even) number of degrees of freedom
  static LIST ChiWith[MAXDEGREESOFFREEDOM + 1];

  CHISTRUCT *OldChiSquared;
  CHISTRUCT SearchKey;

  // limit the minimum alpha that can be used - if alpha is too small
  // it may not be possible to compute chi-squared.
  Alpha = ClipToRange(Alpha, MINALPHA, 1.0);
  // the solver only handles even degrees of freedom; round odd values up
  if (Odd (DegreesOfFreedom))
    DegreesOfFreedom++;

  /* find the list of chi-squared values which have already been computed
     for the specified number of degrees of freedom. Search the list for
     the desired chi-squared. */
  SearchKey.Alpha = Alpha;
  OldChiSquared = (CHISTRUCT *) first_node (search (ChiWith[DegreesOfFreedom],
                                                    &SearchKey, AlphaMatch));

  if (OldChiSquared == NULL) {
    // not cached yet: solve for the chi-squared value numerically and
    // push the result onto the cache list
    OldChiSquared = NewChiStruct (DegreesOfFreedom, Alpha);
    OldChiSquared->ChiSquared = Solve (ChiArea, OldChiSquared,
                                       (FLOAT64) DegreesOfFreedom,
                                       (FLOAT64) CHIACCURACY);
    ChiWith[DegreesOfFreedom] = push (ChiWith[DegreesOfFreedom],
                                      OldChiSquared);
  }
  else {
    // further optimization might move OldChiSquared to front of list
  }

  return (OldChiSquared->ChiSquared);

}                                // ComputeChiSquared
1941 
1942 
1943 //---------------------------------------------------------------------------
/*
 ** Parameters:
 ** x number to compute the normal probability density for
 ** Globals:
 ** kNormalMean mean of a discrete normal distribution
 ** kNormalVariance variance of a discrete normal distribution
 ** kNormalMagnitude magnitude of a discrete normal distribution
 ** Operation:
 ** This routine computes the probability density function
 ** of a discrete normal distribution defined by the global
 ** variables kNormalMean, kNormalVariance, and kNormalMagnitude.
 ** Normal magnitude could, of course, be computed in terms of
 ** the normal variance but it is precomputed for efficiency.
 ** Return:
 ** The value of the normal distribution at x.
 ** Exceptions:
 ** None
 ** History:
 ** 6/4/89, DSJ, Created.
 */
  FLOAT64 Distance;

  // standard Gaussian density: magnitude * exp(-(x-mean)^2 / (2*var))
  Distance = x - kNormalMean;
  return kNormalMagnitude * exp(-0.5 * Distance * Distance / kNormalVariance);
}                                // NormalDensity
1970 
1971 
1972 //---------------------------------------------------------------------------
/*
 ** Parameters:
 ** x number to compute the uniform probability density for
 ** Operation:
 ** This routine computes the probability density function
 ** of a uniform distribution at the specified point. The
 ** range of the distribution is from 0 to BUCKETTABLESIZE.
 ** Return:
 ** The value of the uniform distribution at x.
 ** Exceptions:
 ** None
 ** History:
 ** 6/5/89, DSJ, Created.
 */
  // constant density over [0, BUCKETTABLESIZE]; zero elsewhere
  static FLOAT64 UniformDistributionDensity = (FLOAT64) 1.0 / BUCKETTABLESIZE;

  if ((x >= 0.0) && (x <= BUCKETTABLESIZE))
    return UniformDistributionDensity;
  else
    return (FLOAT64) 0.0;
}                                // UniformDensity
1995 
1996 
1997 //---------------------------------------------------------------------------
/*
 ** Parameters:
 ** f1 value of function at x1
 ** f2 value of function at x2
 ** Dx x2 - x1 (should always be positive)
 ** Operation:
 ** This routine computes a trapezoidal approximation to the
 ** integral of a function over a small delta in x.
 ** Return:
 ** Approximation of the integral of the function from x1 to x2.
 ** Exceptions:
 ** None
 ** History:
 ** 6/5/89, DSJ, Created.
 */
  // trapezoid rule: area = average height * width
  return (f1 + f2) * Dx / 2.0;
}                                // Integral
2016 
2017 
2018 //---------------------------------------------------------------------------
2019 void FillBuckets(BUCKETS *Buckets,
2020  CLUSTER *Cluster,
2021  uinT16 Dim,
2022  PARAM_DESC *ParamDesc,
2023  FLOAT32 Mean,
2024  FLOAT32 StdDev) {
2025 /*
2026  ** Parameters:
2027  ** Buckets histogram buckets to count samples
2028  ** Cluster cluster whose samples are being analyzed
2029  ** Dim dimension of samples which is being analyzed
2030  ** ParamDesc description of the dimension
2031  ** Mean "mean" of the distribution
2032  ** StdDev "standard deviation" of the distribution
2033  ** Operation:
2034  ** This routine counts the number of cluster samples which
2035  ** fall within the various histogram buckets in Buckets. Only
2036  ** one dimension of each sample is examined. The exact meaning
2037  ** of the Mean and StdDev parameters depends on the
2038  ** distribution which is being analyzed (this info is in the
2039  ** Buckets data structure). For normal distributions, Mean
2040  ** and StdDev have the expected meanings. For uniform and
2041  ** random distributions the Mean is the center point of the
2042  ** range and the StdDev is 1/2 the range. A dimension with
2043  ** zero standard deviation cannot be statistically analyzed.
2044  ** In this case, a pseudo-analysis is used.
2045  ** Return:
2046  ** None (the Buckets data structure is filled in)
2047  ** Exceptions:
2048  ** None
2049  ** History:
2050  ** 6/5/89, DSJ, Created.
2051  */
2052  uinT16 BucketID;
2053  int i;
2054  LIST SearchState;
2055  SAMPLE *Sample;
2056 
2057  // initialize the histogram bucket counts to 0
2058  for (i = 0; i < Buckets->NumberOfBuckets; i++)
2059  Buckets->Count[i] = 0;
2060 
2061  if (StdDev == 0.0) {
2062  /* if the standard deviation is zero, then we can't statistically
2063  analyze the cluster. Use a pseudo-analysis: samples exactly on
2064  the mean are distributed evenly across all buckets. Samples greater
2065  than the mean are placed in the last bucket; samples less than the
2066  mean are placed in the first bucket. */
2067 
2068  InitSampleSearch(SearchState, Cluster);
2069  i = 0;
2070  while ((Sample = NextSample (&SearchState)) != NULL) {
2071  if (Sample->Mean[Dim] > Mean)
2072  BucketID = Buckets->NumberOfBuckets - 1;
2073  else if (Sample->Mean[Dim] < Mean)
2074  BucketID = 0;
2075  else
2076  BucketID = i;
2077  Buckets->Count[BucketID] += 1;
2078  i++;
2079  if (i >= Buckets->NumberOfBuckets)
2080  i = 0;
2081  }
2082  }
2083  else {
2084  // search for all samples in the cluster and add to histogram buckets
2085  InitSampleSearch(SearchState, Cluster);
2086  while ((Sample = NextSample (&SearchState)) != NULL) {
2087  switch (Buckets->Distribution) {
2088  case normal:
2089  BucketID = NormalBucket (ParamDesc, Sample->Mean[Dim],
2090  Mean, StdDev);
2091  break;
2092  case D_random:
2093  case uniform:
2094  BucketID = UniformBucket (ParamDesc, Sample->Mean[Dim],
2095  Mean, StdDev);
2096  break;
2097  default:
2098  BucketID = 0;
2099  }
2100  Buckets->Count[Buckets->Bucket[BucketID]] += 1;
2101  }
2102  }
2103 } // FillBuckets
2104 
2105 
//---------------------------------------------------------------------------
                         FLOAT32 x,
                         FLOAT32 Mean,
                         FLOAT32 StdDev) {
/*
 ** Parameters:
 ** ParamDesc used to identify circular dimensions
 ** x value to be normalized
 ** Mean mean of normal distribution
 ** StdDev standard deviation of normal distribution
 ** Operation:
 ** This routine determines which bucket x falls into in the
 ** discrete normal distribution defined by kNormalMean
 ** and kNormalStdDev. x values which exceed the range of
 ** the discrete distribution are clipped.
 ** Return:
 ** Bucket number into which x falls
 ** Exceptions:
 ** None
 ** History:
 ** 6/5/89, DSJ, Created.
 */
  FLOAT32 X;

  // wraparound circular parameters if necessary
  if (ParamDesc->Circular) {
    if (x - Mean > ParamDesc->HalfRange)
      x -= ParamDesc->Range;
    else if (x - Mean < -ParamDesc->HalfRange)
      x += ParamDesc->Range;
  }

  // map x into the domain of the discrete normal table, then clip to
  // [0, BUCKETTABLESIZE - 1]
  X = ((x - Mean) / StdDev) * kNormalStdDev + kNormalMean;
  if (X < 0)
    return 0;
  if (X > BUCKETTABLESIZE - 1)
    return ((uinT16) (BUCKETTABLESIZE - 1));
  return (uinT16) floor((FLOAT64) X);
}                                // NormalBucket
2146 
2147 
2148 //---------------------------------------------------------------------------
2150  FLOAT32 x,
2151  FLOAT32 Mean,
2152  FLOAT32 StdDev) {
2153 /*
2154  ** Parameters:
2155  ** ParamDesc used to identify circular dimensions
2156  ** x value to be normalized
2157  ** Mean center of range of uniform distribution
2158  ** StdDev 1/2 the range of the uniform distribution
2159  ** Operation:
2160  ** This routine determines which bucket x falls into in the
2161  ** discrete uniform distribution defined by
2162  ** BUCKETTABLESIZE. x values which exceed the range of
2163  ** the discrete distribution are clipped.
2164  ** Return:
2165  ** Bucket number into which x falls
2166  ** Exceptions:
2167  ** None
2168  ** History:
2169  ** 6/5/89, DSJ, Created.
2170  */
2171  FLOAT32 X;
2172 
2173  // wraparound circular parameters if necessary
2174  if (ParamDesc->Circular) {
2175  if (x - Mean > ParamDesc->HalfRange)
2176  x -= ParamDesc->Range;
2177  else if (x - Mean < -ParamDesc->HalfRange)
2178  x += ParamDesc->Range;
2179  }
2180 
2181  X = ((x - Mean) / (2 * StdDev) * BUCKETTABLESIZE + BUCKETTABLESIZE / 2.0);
2182  if (X < 0)
2183  return 0;
2184  if (X > BUCKETTABLESIZE - 1)
2185  return (uinT16) (BUCKETTABLESIZE - 1);
2186  return (uinT16) floor((FLOAT64) X);
2187 } // UniformBucket
2188 
2189 
2190 //---------------------------------------------------------------------------
2192 /*
2193  ** Parameters:
2194  ** Buckets histogram data to perform chi-square test on
2195  ** Operation:
2196  ** This routine performs a chi-square goodness of fit test
2197  ** on the histogram data in the Buckets data structure. TRUE
2198  ** is returned if the histogram matches the probability
2199  ** distribution which was specified when the Buckets
2200  ** structure was originally created. Otherwise FALSE is
2201  ** returned.
2202  ** Return:
2203  ** TRUE if samples match distribution, FALSE otherwise
2204  ** Exceptions:
2205  ** None
2206  ** History:
2207  ** 6/5/89, DSJ, Created.
2208  */
2209  FLOAT32 FrequencyDifference;
2210  FLOAT32 TotalDifference;
2211  int i;
2212 
2213  // compute how well the histogram matches the expected histogram
2214  TotalDifference = 0.0;
2215  for (i = 0; i < Buckets->NumberOfBuckets; i++) {
2216  FrequencyDifference = Buckets->Count[i] - Buckets->ExpectedCount[i];
2217  TotalDifference += (FrequencyDifference * FrequencyDifference) /
2218  Buckets->ExpectedCount[i];
2219  }
2220 
2221  // test to see if the difference is more than expected
2222  if (TotalDifference > Buckets->ChiSquared)
2223  return FALSE;
2224  else
2225  return TRUE;
2226 } // DistributionOK
2227 
2228 
2229 //---------------------------------------------------------------------------
2230 void FreeStatistics(STATISTICS *Statistics) {
2231 /*
2232  ** Parameters:
2233  ** Statistics pointer to data structure to be freed
2234  ** Operation:
2235  ** This routine frees the memory used by the statistics
2236  ** data structure.
2237  ** Return:
2238  ** None
2239  ** Exceptions:
2240  ** None
2241  ** History:
2242  ** 6/5/89, DSJ, Created.
2243  */
2244  memfree (Statistics->CoVariance);
2245  memfree (Statistics->Min);
2246  memfree (Statistics->Max);
2247  memfree(Statistics);
2248 } // FreeStatistics
2249 
2250 
2251 //---------------------------------------------------------------------------
2252 void FreeBuckets(BUCKETS *buckets) {
2253 /*
2254  ** Parameters:
2255  ** buckets pointer to data structure to be freed
2256  ** Operation:
2257  ** This routine properly frees the memory used by a BUCKETS.
2258  */
2259  Efree(buckets->Count);
2260  Efree(buckets->ExpectedCount);
2261  Efree(buckets);
2262 } // FreeBuckets
2263 
2264 
2265 //---------------------------------------------------------------------------
2266 void FreeCluster(CLUSTER *Cluster) {
2267 /*
2268  ** Parameters:
2269  ** Cluster pointer to cluster to be freed
2270  ** Operation:
2271  ** This routine frees the memory consumed by the specified
2272  ** cluster and all of its subclusters. This is done by
2273  ** recursive calls to FreeCluster().
2274  ** Return:
2275  ** None
2276  ** Exceptions:
2277  ** None
2278  ** History:
2279  ** 6/6/89, DSJ, Created.
2280  */
2281  if (Cluster != NULL) {
2282  FreeCluster (Cluster->Left);
2283  FreeCluster (Cluster->Right);
2284  memfree(Cluster);
2285  }
2286 } // FreeCluster
2287 
2288 
2289 //---------------------------------------------------------------------------
2290 uinT16 DegreesOfFreedom(DISTRIBUTION Distribution, uinT16 HistogramBuckets) {
2291 /*
2292  ** Parameters:
2293  ** Distribution distribution being tested for
2294  ** HistogramBuckets number of buckets in chi-square test
2295  ** Operation:
2296  ** This routine computes the degrees of freedom that should
2297  ** be used in a chi-squared test with the specified number of
2298  ** histogram buckets. The result is always rounded up to
2299  ** the next even number so that the value of chi-squared can be
2300  ** computed more easily. This will cause the value of
2301  ** chi-squared to be higher than the optimum value, resulting
2302  ** in the chi-square test being more lenient than optimum.
2303  ** Return: The number of degrees of freedom for a chi-square test
2304  ** Exceptions: none
2305  ** History: Thu Aug 3 14:04:18 1989, DSJ, Created.
2306  */
2307  static uinT8 DegreeOffsets[] = { 3, 3, 1 };
2308 
2309  uinT16 AdjustedNumBuckets;
2310 
2311  AdjustedNumBuckets = HistogramBuckets - DegreeOffsets[(int) Distribution];
2312  if (Odd (AdjustedNumBuckets))
2313  AdjustedNumBuckets++;
2314  return (AdjustedNumBuckets);
2315 
2316 } // DegreesOfFreedom
2317 
2318 
2319 //---------------------------------------------------------------------------
2320 int NumBucketsMatch(void *arg1, // BUCKETS *Histogram,
2321  void *arg2) { // uinT16 *DesiredNumberOfBuckets)
2322 /*
2323  ** Parameters:
2324  ** Histogram current histogram being tested for a match
2325  ** DesiredNumberOfBuckets match key
2326  ** Operation:
2327  ** This routine is used to search a list of histogram data
2328  ** structures to find one with the specified number of
2329  ** buckets. It is called by the list search routines.
2330  ** Return: TRUE if Histogram matches DesiredNumberOfBuckets
2331  ** Exceptions: none
2332  ** History: Thu Aug 3 14:17:33 1989, DSJ, Created.
2333  */
2334  BUCKETS *Histogram = (BUCKETS *) arg1;
2335  uinT16 *DesiredNumberOfBuckets = (uinT16 *) arg2;
2336 
2337  return (*DesiredNumberOfBuckets == Histogram->NumberOfBuckets);
2338 
2339 } // NumBucketsMatch
2340 
2341 
2342 //---------------------------------------------------------------------------
int ListEntryMatch(void *arg1,    // ListNode
                   void *arg2) {  // Key
/*
 ** Parameters: none documented; arg1 is the node contents, arg2 the key
 ** Operation:
 ** List-search predicate used by the list delete_d routine:
 ** a node matches when its contents pointer is exactly the key
 ** pointer (identity comparison, not structural equality).
 ** Return: TRUE (non-zero) if the node pointer equals the key
 ** Exceptions: none
 ** History: Thu Aug 3 14:23:58 1989, DSJ, Created.
 */
  return arg1 == arg2;
} // ListEntryMatch
2358 
2359 
2360 //---------------------------------------------------------------------------
2361 void AdjustBuckets(BUCKETS *Buckets, uinT32 NewSampleCount) {
2362 /*
2363  ** Parameters:
2364  ** Buckets histogram data structure to adjust
2365  ** NewSampleCount new sample count to adjust to
2366  ** Operation:
2367  ** This routine multiplies each ExpectedCount histogram entry
2368  ** by NewSampleCount/OldSampleCount so that the histogram
2369  ** is now adjusted to the new sample count.
2370  ** Return: none
2371  ** Exceptions: none
2372  ** History: Thu Aug 3 14:31:14 1989, DSJ, Created.
2373  */
2374  int i;
2375  FLOAT64 AdjustFactor;
2376 
2377  AdjustFactor = (((FLOAT64) NewSampleCount) /
2378  ((FLOAT64) Buckets->SampleCount));
2379 
2380  for (i = 0; i < Buckets->NumberOfBuckets; i++) {
2381  Buckets->ExpectedCount[i] *= AdjustFactor;
2382  }
2383 
2384  Buckets->SampleCount = NewSampleCount;
2385 
2386 } // AdjustBuckets
2387 
2388 
2389 //---------------------------------------------------------------------------
2390 void InitBuckets(BUCKETS *Buckets) {
2391 /*
2392  ** Parameters:
2393  ** Buckets histogram data structure to init
2394  ** Operation:
2395  ** This routine sets the bucket counts in the specified histogram
2396  ** to zero.
2397  ** Return: none
2398  ** Exceptions: none
2399  ** History: Thu Aug 3 14:31:14 1989, DSJ, Created.
2400  */
2401  int i;
2402 
2403  for (i = 0; i < Buckets->NumberOfBuckets; i++) {
2404  Buckets->Count[i] = 0;
2405  }
2406 
2407 } // InitBuckets
2408 
2409 
2410 //---------------------------------------------------------------------------
2411 int AlphaMatch(void *arg1, //CHISTRUCT *ChiStruct,
2412  void *arg2) { //CHISTRUCT *SearchKey)
2413 /*
2414  ** Parameters:
2415  ** ChiStruct chi-squared struct being tested for a match
2416  ** SearchKey chi-squared struct that is the search key
2417  ** Operation:
2418  ** This routine is used to search a list of structures which
2419  ** hold pre-computed chi-squared values for a chi-squared
2420  ** value whose corresponding alpha field matches the alpha
2421  ** field of SearchKey.
2422  ** It is called by the list search routines.
2423  ** Return: TRUE if ChiStruct's Alpha matches SearchKey's Alpha
2424  ** Exceptions: none
2425  ** History: Thu Aug 3 14:17:33 1989, DSJ, Created.
2426  */
2427  CHISTRUCT *ChiStruct = (CHISTRUCT *) arg1;
2428  CHISTRUCT *SearchKey = (CHISTRUCT *) arg2;
2429 
2430  return (ChiStruct->Alpha == SearchKey->Alpha);
2431 
2432 } // AlphaMatch
2433 
2434 
2435 //---------------------------------------------------------------------------
2436 CHISTRUCT *NewChiStruct(uinT16 DegreesOfFreedom, FLOAT64 Alpha) {
2437 /*
2438  ** Parameters:
2439  ** DegreesOfFreedom degrees of freedom for new chi value
2440  ** Alpha confidence level for new chi value
2441  ** Operation:
2442  ** This routine allocates a new data structure which is used
2443  ** to hold a chi-squared value along with its associated
2444  ** number of degrees of freedom and alpha value.
2445  ** Return: none
2446  ** Exceptions: none
2447  ** History: Fri Aug 4 11:04:59 1989, DSJ, Created.
2448  */
2450 
2451  NewChiStruct = (CHISTRUCT *) Emalloc (sizeof (CHISTRUCT));
2452  NewChiStruct->DegreesOfFreedom = DegreesOfFreedom;
2453  NewChiStruct->Alpha = Alpha;
2454  return (NewChiStruct);
2455 
2456 } // NewChiStruct
2457 
2458 
2459 //---------------------------------------------------------------------------
FLOAT64
Solve (SOLVEFUNC Function,
void *FunctionParams, FLOAT64 InitialGuess, FLOAT64 Accuracy)
/*
 ** Parameters:
 ** Function function whose zero is to be found
 ** FunctionParams arbitrary data to pass to function
 ** InitialGuess point to start solution search at
 ** Accuracy maximum allowed error
 ** Operation:
 ** This routine attempts to find an x value at which Function
 ** goes to zero (i.e. a root of the function). It will only
 ** work correctly if a solution actually exists and there
 ** are no extrema between the solution and the InitialGuess.
 ** The iteration steps by f(x)/slope, where the slope is
 ** estimated with a finite difference whose width shrinks as
 ** the steps get smaller. The algorithms used are extremely
 ** primitive.
 ** Return: Solution of function ( x for which f(x) = 0 ).
 ** Exceptions: none
 ** History: Fri Aug 4 11:08:59 1989, DSJ, Created.
 */
#define INITIALDELTA 0.1
#define DELTARATIO 0.1
{
  FLOAT64 x;
  FLOAT64 f;
  FLOAT64 Slope;
  FLOAT64 Delta;               // finite-difference width for the slope estimate
  FLOAT64 NewDelta;
  FLOAT64 xDelta;              // step taken toward the root this iteration
  FLOAT64 LastPosX, LastNegX;  // tightest x seen so far with f >= 0 / f < 0

  x = InitialGuess;
  Delta = INITIALDELTA;
  LastPosX = MAX_FLOAT32;
  LastNegX = -MAX_FLOAT32;
  f = (*Function) ((CHISTRUCT *) FunctionParams, x);
  // iterate until the two bounds bracket the root to within Accuracy
  while (Abs (LastPosX - LastNegX) > Accuracy) {
    // keep track of outer bounds of current estimate
    if (f < 0)
      LastNegX = x;
    else
      LastPosX = x;

    // compute the approx. slope of f(x) at the current point
    Slope =
      ((*Function) ((CHISTRUCT *) FunctionParams, x + Delta) - f) / Delta;

    // compute the next solution guess (Newton/secant style step)
    xDelta = f / Slope;
    x -= xDelta;

    // reduce the delta used for computing slope to be a fraction of
    // the amount moved to get to the new guess
    NewDelta = Abs (xDelta) * DELTARATIO;
    if (NewDelta < Delta)
      Delta = NewDelta;

    // compute the value of the function at the new guess
    f = (*Function) ((CHISTRUCT *) FunctionParams, x);
  }
  return (x);

} // Solve
2522 
2523 
2524 //---------------------------------------------------------------------------
2526 /*
2527  ** Parameters:
2528  ** ChiParams contains degrees of freedom and alpha
2529  ** x value of chi-squared to evaluate
2530  ** Operation:
2531  ** This routine computes the area under a chi density curve
2532  ** from 0 to x, minus the desired area under the curve. The
2533  ** number of degrees of freedom of the chi curve is specified
2534  ** in the ChiParams structure. The desired area is also
2535  ** specified in the ChiParams structure as Alpha ( or 1 minus
2536  ** the desired area ). This routine is intended to be passed
2537  ** to the Solve() function to find the value of chi-squared
2538  ** which will yield a desired area under the right tail of
2539  ** the chi density curve. The function will only work for
2540  ** even degrees of freedom. The equations are based on
2541  ** integrating the chi density curve in parts to obtain
2542  ** a series that can be used to compute the area under the
2543  ** curve.
2544  ** Return: Error between actual and desired area under the chi curve.
2545  ** Exceptions: none
2546  ** History: Fri Aug 4 12:48:41 1989, DSJ, Created.
2547  */
2548  int i, N;
2549  FLOAT64 SeriesTotal;
2550  FLOAT64 Denominator;
2551  FLOAT64 PowerOfx;
2552 
2553  N = ChiParams->DegreesOfFreedom / 2 - 1;
2554  SeriesTotal = 1;
2555  Denominator = 1;
2556  PowerOfx = 1;
2557  for (i = 1; i <= N; i++) {
2558  Denominator *= 2 * i;
2559  PowerOfx *= x;
2560  SeriesTotal += PowerOfx / Denominator;
2561  }
2562  return ((SeriesTotal * exp (-0.5 * x)) - ChiParams->Alpha);
2563 
2564 } // ChiArea
2565 
2566 
2567 //---------------------------------------------------------------------------
2568 BOOL8
2570 CLUSTER * Cluster, FLOAT32 MaxIllegal)
2571 /*
2572  ** Parameters:
2573  ** Clusterer data structure holding cluster tree
2574  ** Cluster cluster containing samples to be tested
2575  ** MaxIllegal max percentage of samples allowed to have
2576  ** more than 1 feature in the cluster
2577  ** Operation:
2578  ** This routine looks at all samples in the specified cluster.
2579  ** It computes a running estimate of the percentage of the
2580  ** charaters which have more than 1 sample in the cluster.
2581  ** When this percentage exceeds MaxIllegal, TRUE is returned.
2582  ** Otherwise FALSE is returned. The CharID
2583  ** fields must contain integers which identify the training
2584  ** characters which were used to generate the sample. One
2585  ** integer is used for each sample. The NumChar field in
2586  ** the Clusterer must contain the number of characters in the
2587  ** training set. All CharID fields must be between 0 and
2588  ** NumChar-1. The main function of this routine is to help
2589  ** identify clusters which need to be split further, i.e. if
2590  ** numerous training characters have 2 or more features which are
2591  ** contained in the same cluster, then the cluster should be
2592  ** split.
2593  ** Return: TRUE if the cluster should be split, FALSE otherwise.
2594  ** Exceptions: none
2595  ** History: Wed Aug 30 11:13:05 1989, DSJ, Created.
2596  ** 2/22/90, DSJ, Added MaxIllegal control rather than always
2597  ** splitting illegal clusters.
2598  */
2599 #define ILLEGAL_CHAR 2
2600 {
2601  static BOOL8 *CharFlags = NULL;
2602  static inT32 NumFlags = 0;
2603  int i;
2604  LIST SearchState;
2605  SAMPLE *Sample;
2606  inT32 CharID;
2607  inT32 NumCharInCluster;
2608  inT32 NumIllegalInCluster;
2609  FLOAT32 PercentIllegal;
2610 
2611  // initial estimate assumes that no illegal chars exist in the cluster
2612  NumCharInCluster = Cluster->SampleCount;
2613  NumIllegalInCluster = 0;
2614 
2615  if (Clusterer->NumChar > NumFlags) {
2616  if (CharFlags != NULL)
2617  memfree(CharFlags);
2618  NumFlags = Clusterer->NumChar;
2619  CharFlags = (BOOL8 *) Emalloc (NumFlags * sizeof (BOOL8));
2620  }
2621 
2622  for (i = 0; i < NumFlags; i++)
2623  CharFlags[i] = FALSE;
2624 
2625  // find each sample in the cluster and check if we have seen it before
2626  InitSampleSearch(SearchState, Cluster);
2627  while ((Sample = NextSample (&SearchState)) != NULL) {
2628  CharID = Sample->CharID;
2629  if (CharFlags[CharID] == FALSE) {
2630  CharFlags[CharID] = TRUE;
2631  }
2632  else {
2633  if (CharFlags[CharID] == TRUE) {
2634  NumIllegalInCluster++;
2635  CharFlags[CharID] = ILLEGAL_CHAR;
2636  }
2637  NumCharInCluster--;
2638  PercentIllegal = (FLOAT32) NumIllegalInCluster / NumCharInCluster;
2639  if (PercentIllegal > MaxIllegal) {
2640  destroy(SearchState);
2641  return (TRUE);
2642  }
2643  }
2644  }
2645  return (FALSE);
2646 
2647 } // MultipleCharSamples
2648 
// Compute the inverse of a matrix using LU decomposition with partial
// pivoting: factor input = L.U (row-swapped), invert the triangular U by
// back-substitution, then form inv = U_inv.L.
// The return value is the sum of norms of the off-diagonal terms of the
// product of a and inv. (A measure of the error.)
double InvertMatrix(const float* input, int size, float* inv) {
  // Allocate memory for the 2D arrays.
  GENERIC_2D_ARRAY<double> U(size, size, 0.0);
  GENERIC_2D_ARRAY<double> U_inv(size, size, 0.0);
  GENERIC_2D_ARRAY<double> L(size, size, 0.0);

  // Initialize the working matrices. U starts as input, L as I and U_inv as O.
  int row;
  int col;
  for (row = 0; row < size; row++) {
    for (col = 0; col < size; col++) {
      U[row][col] = input[row*size + col];
      L[row][col] = row == col ? 1.0 : 0.0;
      U_inv[row][col] = 0.0;
    }
  }

  // Compute forward matrix by inversion by LU decomposition of input.
  for (col = 0; col < size; ++col) {
    // Find best pivot: the row at or below the diagonal with the largest
    // magnitude in this column (partial pivoting for numerical stability).
    int best_row = 0;
    double best_pivot = -1.0;
    for (row = col; row < size; ++row) {
      if (Abs(U[row][col]) > best_pivot) {
        best_pivot = Abs(U[row][col]);
        best_row = row;
      }
    }
    // Exchange pivot rows (mirror the swap in L to keep L.U consistent).
    if (best_row != col) {
      for (int k = 0; k < size; ++k) {
        double tmp = U[best_row][k];
        U[best_row][k] = U[col][k];
        U[col][k] = tmp;
        tmp = L[best_row][k];
        L[best_row][k] = L[col][k];
        L[col][k] = tmp;
      }
    }
    // Now do the pivot itself: eliminate the column below the diagonal.
    // NOTE(review): no singularity guard — if U[col][col] is 0 the ratio is
    // a division by zero; callers appear to rely on the error_sum result.
    for (row = col + 1; row < size; ++row) {
      double ratio = -U[row][col] / U[col][col];
      for (int j = col; j < size; ++j) {
        U[row][j] += U[col][j] * ratio;
      }
      for (int k = 0; k < size; ++k) {
        L[row][k] += L[col][k] * ratio;
      }
    }
  }
  // Next invert U (upper triangular) by back-substitution, column by column.
  for (col = 0; col < size; ++col) {
    U_inv[col][col] = 1.0 / U[col][col];
    for (row = col - 1; row >= 0; --row) {
      double total = 0.0;
      for (int k = col; k > row; --k) {
        total += U[row][k] * U_inv[k][col];
      }
      U_inv[row][col] = -total / U[row][row];
    }
  }
  // Now the answer is U_inv.L.
  for (row = 0; row < size; row++) {
    for (col = 0; col < size; col++) {
      double sum = 0.0;
      for (int k = row; k < size; ++k) {
        sum += U_inv[row][k] * L[k][col];
      }
      inv[row*size + col] = sum;
    }
  }
  // Check matrix product: accumulate the off-diagonal magnitude of
  // input.inv, which would be exactly 0 for a perfect inverse.
  double error_sum = 0.0;
  for (row = 0; row < size; row++) {
    for (col = 0; col < size; col++) {
      double sum = 0.0;
      for (int k = 0; k < size; ++k) {
        sum += input[row*size + k] * inv[k *size + col];
      }
      if (row != col) {
        error_sum += Abs(sum);
      }
    }
  }
  return error_sum;
}
CLUSTERER * MakeClusterer(inT16 SampleSize, const PARAM_DESC ParamDesc[])
Definition: cluster.cpp:399
int inT32
Definition: host.h:102
CLUSTER * FindNearestNeighbor(KDTREE *Tree, CLUSTER *Cluster, FLOAT32 *Distance)
Definition: cluster.cpp:802
FLOAT32 MinSamples
Definition: cluster.h:50
tesseract::KDPairInc< float, TEMPCLUSTER * > ClusterPair
Definition: cluster.cpp:168
uinT16 OptimumNumberOfBuckets(uinT32 SampleCount)
Definition: cluster.cpp:1843
FLOAT32 * Mean
Definition: cluster.h:78
void CreateClusterTree(CLUSTERER *Clusterer)
Definition: cluster.cpp:698
CLUSTER * Cluster
Definition: cluster.cpp:164
uinT16 NumberOfBuckets
Definition: cluster.cpp:183
void FreePrototype(void *arg)
Definition: cluster.cpp:579
#define MAXNEIGHBORS
uinT16 NormalBucket(PARAM_DESC *ParamDesc, FLOAT32 x, FLOAT32 Mean, FLOAT32 StdDev)
Definition: cluster.cpp:2107
KDTREE * KDTree
Definition: cluster.h:90
CLUSTER * Cluster
Definition: cluster.h:76
void KDNearestNeighborSearch(KDTREE *Tree, FLOAT32 Query[], int QuerySize, FLOAT32 MaxDistance, int *NumberOfResults, void **NBuffer, FLOAT32 DBuffer[])
Definition: kdtree.cpp:307
LIST ClusterSamples(CLUSTERER *Clusterer, CLUSTERCONFIG *Config)
Definition: cluster.cpp:508
void destroy_nodes(LIST list, void_dest destructor)
Definition: oldlist.cpp:204
FLOAT32 Mean[1]
Definition: cluster.h:39
DISTRIBUTION * Distrib
Definition: cluster.h:77
#define MAXDEGREESOFFREEDOM
Definition: cluster.cpp:229
#define tprintf(...)
Definition: tprintf.h:31
#define CHIACCURACY
FLOATUNION Variance
Definition: cluster.h:81
inT16 SampleSize
Definition: cluster.h:87
FLOAT64 Confidence
Definition: cluster.h:54
PROTOTYPE * MakeSphericalProto(CLUSTERER *Clusterer, CLUSTER *Cluster, STATISTICS *Statistics, BUCKETS *Buckets)
Definition: cluster.cpp:1220
FLOAT32 TotalMagnitude
Definition: cluster.h:79
FLOAT32 MaxIllegal
Definition: cluster.h:51
PROTOTYPE * NewSphericalProto(uinT16 N, CLUSTER *Cluster, STATISTICS *Statistics)
Definition: cluster.cpp:1514
tesseract::GenericHeap< ClusterPair > ClusterHeap
Definition: cluster.cpp:169
PROTOTYPE * MakePrototype(CLUSTERER *Clusterer, CLUSTERCONFIG *Config, CLUSTER *Cluster)
Definition: cluster.cpp:978
#define MAX_FLOAT32
Definition: host.h:124
DISTRIBUTION
Definition: cluster.h:58
LIST push(LIST list, void *element)
Definition: oldlist.cpp:323
CLUSTERCONFIG Config
#define DELTARATIO
BUCKETS * bucket_cache[DISTRIBUTION_COUNT][MAXBUCKETS+1-MINBUCKETS]
Definition: cluster.h:95
FLOAT64 ChiSquared
Definition: cluster.cpp:192
#define NULL
Definition: host.h:144
void MakePotentialClusters(ClusteringContext *context, CLUSTER *Cluster, inT32 Level)
Definition: cluster.cpp:768
unsigned int uinT32
Definition: host.h:103
BUCKETS * MakeBuckets(DISTRIBUTION Distribution, uinT32 SampleCount, FLOAT64 Confidence)
Definition: cluster.cpp:1759
PROTOTYPE * MakeEllipticalProto(CLUSTERER *Clusterer, CLUSTER *Cluster, STATISTICS *Statistics, BUCKETS *Buckets)
Definition: cluster.cpp:1258
FLOAT32 HalfRange
Definition: ocrfeatures.h:52
void ComputePrototypes(CLUSTERER *Clusterer, CLUSTERCONFIG *Config)
Definition: cluster.cpp:931
#define FTABLE_Y
Definition: cluster.cpp:32
FLOAT32 MidRange
Definition: ocrfeatures.h:53
PROTOTYPE * MakeMixedProto(CLUSTERER *Clusterer, CLUSTER *Cluster, STATISTICS *Statistics, BUCKETS *NormalBuckets, FLOAT64 Confidence)
Definition: cluster.cpp:1302
LIST search(LIST list, void *key, int_compare is_equal)
Definition: oldlist.cpp:413
FLOAT64 Confidence
Definition: cluster.cpp:181
#define ILLEGAL_CHAR
unsigned Prototype
Definition: cluster.h:34
LIST pop(LIST list)
Definition: oldlist.cpp:305
uinT16 DegreesOfFreedom
Definition: cluster.cpp:190
#define FALSE
Definition: capi.h:29
PROTOTYPE * NewMixedProto(inT16 N, CLUSTER *Cluster, STATISTICS *Statistics)
Definition: cluster.cpp:1593
CLUSTER * MakeNewCluster(CLUSTERER *Clusterer, TEMPCLUSTER *TempCluster)
Definition: cluster.cpp:841
inT32 NumberOfSamples
Definition: cluster.h:89
BOOL8 Independent(PARAM_DESC ParamDesc[], inT16 N, FLOAT32 *CoVariance, FLOAT32 Independence)
Definition: cluster.cpp:1660
void InitBuckets(BUCKETS *Buckets)
Definition: cluster.cpp:2390
uinT16 Bucket[BUCKETTABLESIZE]
Definition: cluster.cpp:184
PROTOTYPE * NewSimpleProto(inT16 N, CLUSTER *Cluster)
Definition: cluster.cpp:1618
inT32 MergeClusters(inT16 N, register PARAM_DESC ParamDesc[], register inT32 n1, register inT32 n2, register FLOAT32 m[], register FLOAT32 m1[], register FLOAT32 m2[])
FLOAT64 Integral(FLOAT64 f1, FLOAT64 f2, FLOAT64 Dx)
Definition: cluster.cpp:1998
STATISTICS * ComputeStatistics(inT16 N, PARAM_DESC ParamDesc[], CLUSTER *Cluster)
Definition: cluster.cpp:1426
CLUSTER * NextSample(LIST *SearchState)
Definition: cluster.cpp:618
uinT16 DegreesOfFreedom(DISTRIBUTION Distribution, uinT16 HistogramBuckets)
Definition: cluster.cpp:2290
void FreeCluster(CLUSTER *Cluster)
Definition: cluster.cpp:2266
BOOL8 MultipleCharSamples(CLUSTERER *Clusterer, CLUSTER *Cluster, FLOAT32 MaxIllegal)
Definition: cluster.cpp:2569
#define MINALPHA
void AdjustBuckets(BUCKETS *Buckets, uinT32 NewSampleCount)
Definition: cluster.cpp:2361
const double FTable[FTABLE_Y][FTABLE_X]
Definition: cluster.cpp:35
int NumBucketsMatch(void *arg1, void *arg2)
Definition: cluster.cpp:2320
void FreeClusterer(CLUSTERER *Clusterer)
Definition: cluster.cpp:536
#define BUCKETTABLESIZE
Definition: cluster.cpp:160
PROTOSTYLE ProtoStyle
Definition: cluster.h:49
#define LOOKUPTABLESIZE
Definition: cluster.cpp:228
BOOL8 DistributionOK(BUCKETS *Buckets)
Definition: cluster.cpp:2191
#define PI
Definition: const.h:19
BUCKETS * GetBuckets(CLUSTERER *clusterer, DISTRIBUTION Distribution, uinT32 SampleCount, FLOAT64 Confidence)
Definition: cluster.cpp:1709
FLOAT64 NormalDensity(inT32 x)
Definition: cluster.cpp:1944
unsigned Significant
Definition: cluster.h:68
FLOAT32 * Elliptical
Definition: cluster.h:64
CHISTRUCT * NewChiStruct(uinT16 DegreesOfFreedom, FLOAT64 Alpha)
Definition: cluster.cpp:2436
SAMPLE * MakeSample(CLUSTERER *Clusterer, const FLOAT32 *Feature, inT32 CharID)
Definition: cluster.cpp:454
void FreeKDTree(KDTREE *Tree)
Definition: kdtree.cpp:346
#define Abs(N)
Definition: cluster.cpp:208
#define MAXBUCKETS
Definition: cluster.h:27
TEMPCLUSTER * candidates
Definition: cluster.cpp:198
FLOAT32 StandardDeviation(PROTOTYPE *Proto, uinT16 Dimension)
Definition: cluster.cpp:657
FLOAT64(* DENSITYFUNC)(inT32)
Definition: cluster.cpp:203
void FreeStatistics(STATISTICS *Statistics)
Definition: cluster.cpp:2230
FLOAT32 Mean(PROTOTYPE *Proto, uinT16 Dimension)
Definition: cluster.cpp:643
PARAM_DESC * ParamDesc
Definition: cluster.h:88
Definition: cluster.h:32
#define NORMALEXTENT
Definition: cluster.cpp:161
FLOAT32 * ExpectedCount
Definition: cluster.cpp:186
FLOAT64 ChiSquared
Definition: cluster.cpp:182
void Efree(void *ptr)
Definition: emalloc.cpp:85
short inT16
Definition: host.h:100
CLUSTER * Root
Definition: cluster.h:91
unsigned Merged
Definition: cluster.h:69
#define MINVARIANCE
Definition: cluster.cpp:142
inT8 NonEssential
Definition: ocrfeatures.h:48
uinT32 SampleCount
Definition: cluster.cpp:180
inT32 CharID
Definition: cluster.h:38
#define MINSAMPLESNEEDED
Definition: cluster.cpp:152
ClusterHeap * heap
Definition: cluster.cpp:197
FLOAT32 * Min
Definition: cluster.cpp:174
FLOATUNION Weight
Definition: cluster.h:83
#define MINBUCKETS
Definition: cluster.h:26
#define MAXDISTANCE
FLOATUNION Magnitude
Definition: cluster.h:82
struct sample * Right
Definition: cluster.h:37
#define INITIALDELTA
FLOAT64 ComputeChiSquared(uinT16 DegreesOfFreedom, FLOAT64 Alpha)
Definition: cluster.cpp:1884
void DoError(int Error, const char *Message)
Definition: danerror.cpp:32
struct sample * Left
Definition: cluster.h:36
FLOAT32 AvgVariance
Definition: cluster.cpp:172
double FLOAT64
Definition: host.h:112
unsigned NumSamples
Definition: cluster.h:75
unsigned short uinT16
Definition: host.h:101
FLOAT32 Spherical
Definition: cluster.h:63
#define NIL_LIST
Definition: oldlist.h:126
PROTOTYPE * MakeDegenerateProto(uinT16 N, CLUSTER *Cluster, STATISTICS *Statistics, PROTOSTYLE Style, inT32 MinSamples)
Definition: cluster.cpp:1071
void MakeDimUniform(uinT16 i, PROTOTYPE *Proto, STATISTICS *Statistics)
Definition: cluster.cpp:1389
void FreeBuckets(BUCKETS *Buckets)
Definition: cluster.cpp:2252
int ListEntryMatch(void *arg1, void *arg2)
Definition: cluster.cpp:2343
PROTOSTYLE
Definition: cluster.h:44
FLOAT64 Alpha
Definition: cluster.cpp:191
inT32 NumChar
Definition: cluster.h:93
FLOAT32 * CoVariance
Definition: cluster.cpp:173
inT8 Circular
Definition: ocrfeatures.h:47
#define MINSAMPLES
Definition: cluster.cpp:151
unsigned Clustered
Definition: cluster.h:33
unsigned char BOOL8
Definition: host.h:113
#define TRUE
Definition: capi.h:28
Definition: kdtree.h:49
void MakeDimRandom(uinT16 i, PROTOTYPE *Proto, PARAM_DESC *ParamDesc)
Definition: cluster.cpp:1364
Definition: cluster.h:45
CLUSTER * Neighbor
Definition: cluster.cpp:165
PROTOTYPE * TestEllipticalProto(CLUSTERER *Clusterer, CLUSTERCONFIG *Config, CLUSTER *Cluster, STATISTICS *Statistics)
Definition: cluster.cpp:1113
void * Emalloc(int Size)
Definition: emalloc.cpp:35
FLOAT64 ChiArea(CHISTRUCT *ChiParams, FLOAT64 x)
Definition: cluster.cpp:2525
PROTOTYPE * NewEllipticalProto(inT16 N, CLUSTER *Cluster, STATISTICS *Statistics)
Definition: cluster.cpp:1548
#define Odd(N)
Definition: cluster.cpp:206
KDTREE * MakeKDTree(inT16 KeySize, const PARAM_DESC KeyDesc[])
Definition: kdtree.cpp:184
void KDWalk(KDTREE *Tree, void_proc action, void *context)
Definition: kdtree.cpp:339
FLOAT32 * Max
Definition: cluster.cpp:175
bool Pop(Pair *entry)
Definition: genericheap.h:116
FLOAT32 Range
Definition: ocrfeatures.h:51
unsigned Style
Definition: cluster.h:74
void memfree(void *element)
Definition: freelist.cpp:30
FLOAT32 Min
Definition: ocrfeatures.h:49
void Push(Pair *entry)
Definition: genericheap.h:95
uinT16 UniformBucket(PARAM_DESC *ParamDesc, FLOAT32 x, FLOAT32 Mean, FLOAT32 StdDev)
Definition: cluster.cpp:2149
#define ALREADYCLUSTERED
Definition: cluster.h:133
T ClipToRange(const T &x, const T &lower_bound, const T &upper_bound)
Definition: helpers.h:115
#define first_node(l)
Definition: oldlist.h:139
#define InitSampleSearch(S, C)
Definition: cluster.h:105
#define SqrtOf2Pi
Definition: cluster.cpp:218
unsigned char uinT8
Definition: host.h:99
DISTRIBUTION Distribution
Definition: cluster.cpp:179
LIST destroy(LIST list)
Definition: oldlist.cpp:187
int MagicSamples
Definition: cluster.h:55
FLOAT64 UniformDensity(inT32 x)
Definition: cluster.cpp:1973
FLOAT64(* SOLVEFUNC)(CHISTRUCT *, double)
Definition: cluster.cpp:204
void FillBuckets(BUCKETS *Buckets, CLUSTER *Cluster, uinT16 Dim, PARAM_DESC *ParamDesc, FLOAT32 Mean, FLOAT32 StdDev)
Definition: cluster.cpp:2019
FLOAT32 Max
Definition: ocrfeatures.h:50
LIST ProtoList
Definition: cluster.h:92
void KDStore(KDTREE *Tree, FLOAT32 *Key, void *Data)
Definition: kdtree.cpp:209
int AlphaMatch(void *arg1, void *arg2)
Definition: cluster.cpp:2411
#define HOTELLING
Definition: cluster.cpp:30
void KDDelete(KDTREE *Tree, FLOAT32 Key[], void *Data)
Definition: kdtree.cpp:269
FLOAT32 LogMagnitude
Definition: cluster.h:80
void FreeProtoList(LIST *ProtoList)
Definition: cluster.cpp:564
float FLOAT32
Definition: host.h:111
#define FTABLE_X
Definition: cluster.cpp:31
#define ASSERT_HOST(x)
Definition: errcode.h:84
void(* void_proc)(...)
Definition: cutil.h:66
unsigned SampleCount
Definition: cluster.h:35
FLOAT64 Solve(SOLVEFUNC Function, void *FunctionParams, FLOAT64 InitialGuess, FLOAT64 Accuracy)
Definition: cluster.cpp:2461
Definition: cluster.h:59
#define Mirror(N, R)
Definition: cluster.cpp:207
uinT32 * Count
Definition: cluster.cpp:185
FLOAT32 Independence
Definition: cluster.h:53
#define RootOf(T)
Definition: kdtree.h:58
double InvertMatrix(const float *input, int size, float *inv)
Definition: cluster.cpp:2652