tesseract  3.04.00
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
cluster.cpp
Go to the documentation of this file.
1 /******************************************************************************
2  ** Filename: cluster.c
3  ** Purpose: Routines for clustering points in N-D space
4  ** Author: Dan Johnson
5  ** History: 5/29/89, DSJ, Created.
6  **
7  ** (c) Copyright Hewlett-Packard Company, 1988.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  ******************************************************************************/
18 #include "const.h"
19 #include "cluster.h"
20 #include "emalloc.h"
21 #include "genericheap.h"
22 #include "helpers.h"
23 #include "kdpair.h"
24 #include "matrix.h"
25 #include "tprintf.h"
26 #include "danerror.h"
27 #include "freelist.h"
28 #include <math.h>
29 
30 #define HOTELLING 1 // If true use Hotelling's test to decide where to split.
31 #define FTABLE_X 10 // Size of FTable.
32 #define FTABLE_Y 100 // Size of FTable.
33 
34 // Table of values approximating the cumulative F-distribution for a confidence of 1%.
// Upper critical values of the F-distribution at a confidence of 1%
// (alpha = 0.01). Indexed as FTable[d - 1][n - 1], where d is the
// denominator degrees of freedom (1..FTABLE_Y) and n is the numerator
// degrees of freedom (1..FTABLE_X). Used by the Hotelling's test in
// TestEllipticalProto to decide whether to split a cluster.
const double FTable[FTABLE_Y][FTABLE_X] = {
 {4052.19, 4999.52, 5403.34, 5624.62, 5763.65, 5858.97, 5928.33, 5981.10, 6022.50, 6055.85,},
 {98.502, 99.000, 99.166, 99.249, 99.300, 99.333, 99.356, 99.374, 99.388, 99.399,},
 {34.116, 30.816, 29.457, 28.710, 28.237, 27.911, 27.672, 27.489, 27.345, 27.229,},
 {21.198, 18.000, 16.694, 15.977, 15.522, 15.207, 14.976, 14.799, 14.659, 14.546,},
 {16.258, 13.274, 12.060, 11.392, 10.967, 10.672, 10.456, 10.289, 10.158, 10.051,},
 {13.745, 10.925, 9.780, 9.148, 8.746, 8.466, 8.260, 8.102, 7.976, 7.874,},
 {12.246, 9.547, 8.451, 7.847, 7.460, 7.191, 6.993, 6.840, 6.719, 6.620,},
 {11.259, 8.649, 7.591, 7.006, 6.632, 6.371, 6.178, 6.029, 5.911, 5.814,},
 {10.561, 8.022, 6.992, 6.422, 6.057, 5.802, 5.613, 5.467, 5.351, 5.257,},
 {10.044, 7.559, 6.552, 5.994, 5.636, 5.386, 5.200, 5.057, 4.942, 4.849,},
 { 9.646, 7.206, 6.217, 5.668, 5.316, 5.069, 4.886, 4.744, 4.632, 4.539,},
 { 9.330, 6.927, 5.953, 5.412, 5.064, 4.821, 4.640, 4.499, 4.388, 4.296,},
 { 9.074, 6.701, 5.739, 5.205, 4.862, 4.620, 4.441, 4.302, 4.191, 4.100,},
 { 8.862, 6.515, 5.564, 5.035, 4.695, 4.456, 4.278, 4.140, 4.030, 3.939,},
 { 8.683, 6.359, 5.417, 4.893, 4.556, 4.318, 4.142, 4.004, 3.895, 3.805,},
 { 8.531, 6.226, 5.292, 4.773, 4.437, 4.202, 4.026, 3.890, 3.780, 3.691,},
 { 8.400, 6.112, 5.185, 4.669, 4.336, 4.102, 3.927, 3.791, 3.682, 3.593,},
 { 8.285, 6.013, 5.092, 4.579, 4.248, 4.015, 3.841, 3.705, 3.597, 3.508,},
 { 8.185, 5.926, 5.010, 4.500, 4.171, 3.939, 3.765, 3.631, 3.523, 3.434,},
 { 8.096, 5.849, 4.938, 4.431, 4.103, 3.871, 3.699, 3.564, 3.457, 3.368,},
 { 8.017, 5.780, 4.874, 4.369, 4.042, 3.812, 3.640, 3.506, 3.398, 3.310,},
 { 7.945, 5.719, 4.817, 4.313, 3.988, 3.758, 3.587, 3.453, 3.346, 3.258,},
 { 7.881, 5.664, 4.765, 4.264, 3.939, 3.710, 3.539, 3.406, 3.299, 3.211,},
 { 7.823, 5.614, 4.718, 4.218, 3.895, 3.667, 3.496, 3.363, 3.256, 3.168,},
 { 7.770, 5.568, 4.675, 4.177, 3.855, 3.627, 3.457, 3.324, 3.217, 3.129,},
 { 7.721, 5.526, 4.637, 4.140, 3.818, 3.591, 3.421, 3.288, 3.182, 3.094,},
 { 7.677, 5.488, 4.601, 4.106, 3.785, 3.558, 3.388, 3.256, 3.149, 3.062,},
 { 7.636, 5.453, 4.568, 4.074, 3.754, 3.528, 3.358, 3.226, 3.120, 3.032,},
 { 7.598, 5.420, 4.538, 4.045, 3.725, 3.499, 3.330, 3.198, 3.092, 3.005,},
 { 7.562, 5.390, 4.510, 4.018, 3.699, 3.473, 3.305, 3.173, 3.067, 2.979,},
 { 7.530, 5.362, 4.484, 3.993, 3.675, 3.449, 3.281, 3.149, 3.043, 2.955,},
 { 7.499, 5.336, 4.459, 3.969, 3.652, 3.427, 3.258, 3.127, 3.021, 2.934,},
 { 7.471, 5.312, 4.437, 3.948, 3.630, 3.406, 3.238, 3.106, 3.000, 2.913,},
 { 7.444, 5.289, 4.416, 3.927, 3.611, 3.386, 3.218, 3.087, 2.981, 2.894,},
 { 7.419, 5.268, 4.396, 3.908, 3.592, 3.368, 3.200, 3.069, 2.963, 2.876,},
 { 7.396, 5.248, 4.377, 3.890, 3.574, 3.351, 3.183, 3.052, 2.946, 2.859,},
 { 7.373, 5.229, 4.360, 3.873, 3.558, 3.334, 3.167, 3.036, 2.930, 2.843,},
 { 7.353, 5.211, 4.343, 3.858, 3.542, 3.319, 3.152, 3.021, 2.915, 2.828,},
 { 7.333, 5.194, 4.327, 3.843, 3.528, 3.305, 3.137, 3.006, 2.901, 2.814,},
 { 7.314, 5.179, 4.313, 3.828, 3.514, 3.291, 3.124, 2.993, 2.888, 2.801,},
 { 7.296, 5.163, 4.299, 3.815, 3.501, 3.278, 3.111, 2.980, 2.875, 2.788,},
 { 7.280, 5.149, 4.285, 3.802, 3.488, 3.266, 3.099, 2.968, 2.863, 2.776,},
 { 7.264, 5.136, 4.273, 3.790, 3.476, 3.254, 3.087, 2.957, 2.851, 2.764,},
 { 7.248, 5.123, 4.261, 3.778, 3.465, 3.243, 3.076, 2.946, 2.840, 2.754,},
 { 7.234, 5.110, 4.249, 3.767, 3.454, 3.232, 3.066, 2.935, 2.830, 2.743,},
 { 7.220, 5.099, 4.238, 3.757, 3.444, 3.222, 3.056, 2.925, 2.820, 2.733,},
 { 7.207, 5.087, 4.228, 3.747, 3.434, 3.213, 3.046, 2.916, 2.811, 2.724,},
 { 7.194, 5.077, 4.218, 3.737, 3.425, 3.204, 3.037, 2.907, 2.802, 2.715,},
 { 7.182, 5.066, 4.208, 3.728, 3.416, 3.195, 3.028, 2.898, 2.793, 2.706,},
 { 7.171, 5.057, 4.199, 3.720, 3.408, 3.186, 3.020, 2.890, 2.785, 2.698,},
 { 7.159, 5.047, 4.191, 3.711, 3.400, 3.178, 3.012, 2.882, 2.777, 2.690,},
 { 7.149, 5.038, 4.182, 3.703, 3.392, 3.171, 3.005, 2.874, 2.769, 2.683,},
 { 7.139, 5.030, 4.174, 3.695, 3.384, 3.163, 2.997, 2.867, 2.762, 2.675,},
 { 7.129, 5.021, 4.167, 3.688, 3.377, 3.156, 2.990, 2.860, 2.755, 2.668,},
 { 7.119, 5.013, 4.159, 3.681, 3.370, 3.149, 2.983, 2.853, 2.748, 2.662,},
 { 7.110, 5.006, 4.152, 3.674, 3.363, 3.143, 2.977, 2.847, 2.742, 2.655,},
 { 7.102, 4.998, 4.145, 3.667, 3.357, 3.136, 2.971, 2.841, 2.736, 2.649,},
 { 7.093, 4.991, 4.138, 3.661, 3.351, 3.130, 2.965, 2.835, 2.730, 2.643,},
 { 7.085, 4.984, 4.132, 3.655, 3.345, 3.124, 2.959, 2.829, 2.724, 2.637,},
 { 7.077, 4.977, 4.126, 3.649, 3.339, 3.119, 2.953, 2.823, 2.718, 2.632,},
 { 7.070, 4.971, 4.120, 3.643, 3.333, 3.113, 2.948, 2.818, 2.713, 2.626,},
 { 7.062, 4.965, 4.114, 3.638, 3.328, 3.108, 2.942, 2.813, 2.708, 2.621,},
 { 7.055, 4.959, 4.109, 3.632, 3.323, 3.103, 2.937, 2.808, 2.703, 2.616,},
 { 7.048, 4.953, 4.103, 3.627, 3.318, 3.098, 2.932, 2.803, 2.698, 2.611,},
 { 7.042, 4.947, 4.098, 3.622, 3.313, 3.093, 2.928, 2.798, 2.693, 2.607,},
 { 7.035, 4.942, 4.093, 3.618, 3.308, 3.088, 2.923, 2.793, 2.689, 2.602,},
 { 7.029, 4.937, 4.088, 3.613, 3.304, 3.084, 2.919, 2.789, 2.684, 2.598,},
 { 7.023, 4.932, 4.083, 3.608, 3.299, 3.080, 2.914, 2.785, 2.680, 2.593,},
 { 7.017, 4.927, 4.079, 3.604, 3.295, 3.075, 2.910, 2.781, 2.676, 2.589,},
 { 7.011, 4.922, 4.074, 3.600, 3.291, 3.071, 2.906, 2.777, 2.672, 2.585,},
 { 7.006, 4.917, 4.070, 3.596, 3.287, 3.067, 2.902, 2.773, 2.668, 2.581,},
 { 7.001, 4.913, 4.066, 3.591, 3.283, 3.063, 2.898, 2.769, 2.664, 2.578,},
 { 6.995, 4.908, 4.062, 3.588, 3.279, 3.060, 2.895, 2.765, 2.660, 2.574,},
 { 6.990, 4.904, 4.058, 3.584, 3.275, 3.056, 2.891, 2.762, 2.657, 2.570,},
 { 6.985, 4.900, 4.054, 3.580, 3.272, 3.052, 2.887, 2.758, 2.653, 2.567,},
 { 6.981, 4.896, 4.050, 3.577, 3.268, 3.049, 2.884, 2.755, 2.650, 2.563,},
 { 6.976, 4.892, 4.047, 3.573, 3.265, 3.046, 2.881, 2.751, 2.647, 2.560,},
 { 6.971, 4.888, 4.043, 3.570, 3.261, 3.042, 2.877, 2.748, 2.644, 2.557,},
 { 6.967, 4.884, 4.040, 3.566, 3.258, 3.039, 2.874, 2.745, 2.640, 2.554,},
 { 6.963, 4.881, 4.036, 3.563, 3.255, 3.036, 2.871, 2.742, 2.637, 2.551,},
 { 6.958, 4.877, 4.033, 3.560, 3.252, 3.033, 2.868, 2.739, 2.634, 2.548,},
 { 6.954, 4.874, 4.030, 3.557, 3.249, 3.030, 2.865, 2.736, 2.632, 2.545,},
 { 6.950, 4.870, 4.027, 3.554, 3.246, 3.027, 2.863, 2.733, 2.629, 2.542,},
 { 6.947, 4.867, 4.024, 3.551, 3.243, 3.025, 2.860, 2.731, 2.626, 2.539,},
 { 6.943, 4.864, 4.021, 3.548, 3.240, 3.022, 2.857, 2.728, 2.623, 2.537,},
 { 6.939, 4.861, 4.018, 3.545, 3.238, 3.019, 2.854, 2.725, 2.621, 2.534,},
 { 6.935, 4.858, 4.015, 3.543, 3.235, 3.017, 2.852, 2.723, 2.618, 2.532,},
 { 6.932, 4.855, 4.012, 3.540, 3.233, 3.014, 2.849, 2.720, 2.616, 2.529,},
 { 6.928, 4.852, 4.010, 3.538, 3.230, 3.012, 2.847, 2.718, 2.613, 2.527,},
 { 6.925, 4.849, 4.007, 3.535, 3.228, 3.009, 2.845, 2.715, 2.611, 2.524,},
 { 6.922, 4.846, 4.004, 3.533, 3.225, 3.007, 2.842, 2.713, 2.609, 2.522,},
 { 6.919, 4.844, 4.002, 3.530, 3.223, 3.004, 2.840, 2.711, 2.606, 2.520,},
 { 6.915, 4.841, 3.999, 3.528, 3.221, 3.002, 2.838, 2.709, 2.604, 2.518,},
 { 6.912, 4.838, 3.997, 3.525, 3.218, 3.000, 2.835, 2.706, 2.602, 2.515,},
 { 6.909, 4.836, 3.995, 3.523, 3.216, 2.998, 2.833, 2.704, 2.600, 2.513,},
 { 6.906, 4.833, 3.992, 3.521, 3.214, 2.996, 2.831, 2.702, 2.598, 2.511,},
 { 6.904, 4.831, 3.990, 3.519, 3.212, 2.994, 2.829, 2.700, 2.596, 2.509,},
 { 6.901, 4.829, 3.988, 3.517, 3.210, 2.992, 2.827, 2.698, 2.594, 2.507,},
 { 6.898, 4.826, 3.986, 3.515, 3.208, 2.990, 2.825, 2.696, 2.592, 2.505,},
 { 6.895, 4.824, 3.984, 3.513, 3.206, 2.988, 2.823, 2.694, 2.590, 2.503}
};
137 
138 /* define the variance which will be used as a minimum variance for any
139  dimension of any feature. Since most features are calculated from numbers
140  with a precision no better than 1 in 128, the variance should never be
141  less than the square of this number for parameters whose range is 1. */
142 #define MINVARIANCE 0.0004
143 
144 /* define the absolute minimum number of samples which must be present in
145  order to accurately test hypotheses about underlying probability
146  distributions. Define separately the minimum samples that are needed
147  before a statistical analysis is attempted; this number should be
148  equal to MINSAMPLES but can be set to a lower number for early testing
149  when very few samples are available. */
150 #define MINSAMPLESPERBUCKET 5
151 #define MINSAMPLES (MINBUCKETS * MINSAMPLESPERBUCKET)
152 #define MINSAMPLESNEEDED 1
153 
154 /* define the size of the table which maps normalized samples to
155  histogram buckets. Also define the number of standard deviations
156  in a normal distribution which are considered to be significant.
157  The mapping table will be defined in such a way that it covers
158  the specified number of standard deviations on either side of
159  the mean. BUCKETTABLESIZE should always be even. */
160 #define BUCKETTABLESIZE 1024
161 #define NORMALEXTENT 3.0
162 
163 struct TEMPCLUSTER {
166 };
167 
170 
171 struct STATISTICS {
174  FLOAT32 *Min; // largest negative distance from the mean
175  FLOAT32 *Max; // largest positive distance from the mean
176 };
177 
// Histogram used to test whether the samples of a cluster fit a given
// statistical distribution (see the ChiSquared test threshold below).
// The Bucket[] table maps normalized sample values onto histogram cells.
struct BUCKETS {
  DISTRIBUTION Distribution;      // distribution being tested for
  uinT32 SampleCount;             // # of samples in histogram
  FLOAT64 Confidence;             // confidence level of test
  FLOAT64 ChiSquared;             // test threshold
  uinT16 NumberOfBuckets;         // number of cells in histogram
  uinT16 Bucket[BUCKETTABLESIZE]; // mapping to histogram buckets
  uinT32 *Count;                  // frequency of occurrence histogram
  FLOAT32 *ExpectedCount;         // expected histogram
};
188 
189 struct CHISTRUCT{
193 };
194 
195 // For use with KDWalk / MakePotentialClusters
197  ClusterHeap *heap; // heap used to hold temp clusters, "best" on top
198  TEMPCLUSTER *candidates; // array of potential clusters
199  KDTREE *tree; // kd-tree to be searched for neighbors
200  inT32 next; // next candidate to be used
201 };
202 
203 typedef FLOAT64 (*DENSITYFUNC) (inT32);
204 typedef FLOAT64 (*SOLVEFUNC) (CHISTRUCT *, double);
205 
206 #define Odd(N) ((N)%2)
207 #define Mirror(N,R) ((R) - (N) - 1)
208 #define Abs(N) ( ( (N) < 0 ) ? ( -(N) ) : (N) )
209 
210 //--------------Global Data Definitions and Declarations----------------------
211 /* the following variables describe a discrete normal distribution
212  which is used by NormalDensity() and NormalBucket(). The
213  constant NORMALEXTENT determines how many standard
214  deviations of the distribution are mapped onto the fixed
215  discrete range of x. x=0 is mapped to -NORMALEXTENT standard
216  deviations and x=BUCKETTABLESIZE is mapped to
217  +NORMALEXTENT standard deviations. */
218 #define SqrtOf2Pi 2.506628275
219 static const FLOAT64 kNormalStdDev = BUCKETTABLESIZE / (2.0 * NORMALEXTENT);
220 static const FLOAT64 kNormalVariance =
222 static const FLOAT64 kNormalMagnitude =
223  (2.0 * NORMALEXTENT) / (SqrtOf2Pi * BUCKETTABLESIZE);
224 static const FLOAT64 kNormalMean = BUCKETTABLESIZE / 2;
225 
226 /* define lookup tables used to compute the number of histogram buckets
227  that should be used for a given number of samples. */
228 #define LOOKUPTABLESIZE 8
229 #define MAXDEGREESOFFREEDOM MAXBUCKETS
230 
231 static const uinT32 kCountTable[LOOKUPTABLESIZE] = {
232  MINSAMPLES, 200, 400, 600, 800, 1000, 1500, 2000
233 }; // number of samples
234 
235 static const uinT16 kBucketsTable[LOOKUPTABLESIZE] = {
236  MINBUCKETS, 16, 20, 24, 27, 30, 35, MAXBUCKETS
237 }; // number of buckets
238 
239 /*-------------------------------------------------------------------------
240  Private Function Prototypes
241 --------------------------------------------------------------------------*/
242 void CreateClusterTree(CLUSTERER *Clusterer);
243 
244 void MakePotentialClusters(ClusteringContext *context, CLUSTER *Cluster,
245  inT32 Level);
246 
248  CLUSTER *Cluster,
249  FLOAT32 *Distance);
250 
251 CLUSTER *MakeNewCluster(CLUSTERER *Clusterer, TEMPCLUSTER *TempCluster);
252 
254 register PARAM_DESC ParamDesc[],
255 register inT32 n1,
256 register inT32 n2,
257 register FLOAT32 m[],
258 register FLOAT32 m1[], register FLOAT32 m2[]);
259 
261 
262 PROTOTYPE *MakePrototype(CLUSTERER *Clusterer,
264  CLUSTER *Cluster);
265 
267  CLUSTER *Cluster,
268  STATISTICS *Statistics,
269  PROTOSTYLE Style,
270  inT32 MinSamples);
271 
274  CLUSTER *Cluster,
275  STATISTICS *Statistics);
276 
278  CLUSTER *Cluster,
279  STATISTICS *Statistics,
280  BUCKETS *Buckets);
281 
283  CLUSTER *Cluster,
284  STATISTICS *Statistics,
285  BUCKETS *Buckets);
286 
288  CLUSTER *Cluster,
289  STATISTICS *Statistics,
290  BUCKETS *NormalBuckets,
291  FLOAT64 Confidence);
292 
293 void MakeDimRandom(uinT16 i, PROTOTYPE *Proto, PARAM_DESC *ParamDesc);
294 
295 void MakeDimUniform(uinT16 i, PROTOTYPE *Proto, STATISTICS *Statistics);
296 
298 PARAM_DESC ParamDesc[], CLUSTER * Cluster);
299 
301  CLUSTER *Cluster,
302  STATISTICS *Statistics);
303 
305  CLUSTER *Cluster,
306  STATISTICS *Statistics);
307 
308 PROTOTYPE *NewMixedProto(inT16 N, CLUSTER *Cluster, STATISTICS *Statistics);
309 
310 PROTOTYPE *NewSimpleProto(inT16 N, CLUSTER *Cluster);
311 
312 BOOL8 Independent (PARAM_DESC ParamDesc[],
313 inT16 N, FLOAT32 * CoVariance, FLOAT32 Independence);
314 
315 BUCKETS *GetBuckets(CLUSTERER* clusterer,
316  DISTRIBUTION Distribution,
317  uinT32 SampleCount,
318  FLOAT64 Confidence);
319 
320 BUCKETS *MakeBuckets(DISTRIBUTION Distribution,
321  uinT32 SampleCount,
322  FLOAT64 Confidence);
323 
325 
327 
329 
331 
333 
334 void FillBuckets(BUCKETS *Buckets,
335  CLUSTER *Cluster,
336  uinT16 Dim,
337  PARAM_DESC *ParamDesc,
338  FLOAT32 Mean,
339  FLOAT32 StdDev);
340 
341 uinT16 NormalBucket(PARAM_DESC *ParamDesc,
342  FLOAT32 x,
343  FLOAT32 Mean,
344  FLOAT32 StdDev);
345 
346 uinT16 UniformBucket(PARAM_DESC *ParamDesc,
347  FLOAT32 x,
348  FLOAT32 Mean,
349  FLOAT32 StdDev);
350 
351 BOOL8 DistributionOK(BUCKETS *Buckets);
352 
353 void FreeStatistics(STATISTICS *Statistics);
354 
355 void FreeBuckets(BUCKETS *Buckets);
356 
357 void FreeCluster(CLUSTER *Cluster);
358 
359 uinT16 DegreesOfFreedom(DISTRIBUTION Distribution, uinT16 HistogramBuckets);
360 
361 int NumBucketsMatch(void *arg1, // BUCKETS *Histogram,
362  void *arg2); // uinT16 *DesiredNumberOfBuckets);
363 
364 int ListEntryMatch(void *arg1, void *arg2);
365 
366 void AdjustBuckets(BUCKETS *Buckets, uinT32 NewSampleCount);
367 
368 void InitBuckets(BUCKETS *Buckets);
369 
370 int AlphaMatch(void *arg1, // CHISTRUCT *ChiStruct,
371  void *arg2); // CHISTRUCT *SearchKey);
372 
374 
375 FLOAT64 Solve(SOLVEFUNC Function,
376  void *FunctionParams,
377  FLOAT64 InitialGuess,
378  FLOAT64 Accuracy);
379 
380 FLOAT64 ChiArea(CHISTRUCT *ChiParams, FLOAT64 x);
381 
383  CLUSTER *Cluster,
384  FLOAT32 MaxIllegal);
385 
386 double InvertMatrix(const float* input, int size, float* inv);
387 
388 //--------------------------Public Code--------------------------------------
398 CLUSTERER *
399 MakeClusterer (inT16 SampleSize, const PARAM_DESC ParamDesc[]) {
400  CLUSTERER *Clusterer;
401  int i;
402 
403  // allocate main clusterer data structure and init simple fields
404  Clusterer = (CLUSTERER *) Emalloc (sizeof (CLUSTERER));
405  Clusterer->SampleSize = SampleSize;
406  Clusterer->NumberOfSamples = 0;
407  Clusterer->NumChar = 0;
408 
409  // init fields which will not be used initially
410  Clusterer->Root = NULL;
411  Clusterer->ProtoList = NIL_LIST;
412 
413  // maintain a copy of param descriptors in the clusterer data structure
414  Clusterer->ParamDesc =
415  (PARAM_DESC *) Emalloc (SampleSize * sizeof (PARAM_DESC));
416  for (i = 0; i < SampleSize; i++) {
417  Clusterer->ParamDesc[i].Circular = ParamDesc[i].Circular;
418  Clusterer->ParamDesc[i].NonEssential = ParamDesc[i].NonEssential;
419  Clusterer->ParamDesc[i].Min = ParamDesc[i].Min;
420  Clusterer->ParamDesc[i].Max = ParamDesc[i].Max;
421  Clusterer->ParamDesc[i].Range = ParamDesc[i].Max - ParamDesc[i].Min;
422  Clusterer->ParamDesc[i].HalfRange = Clusterer->ParamDesc[i].Range / 2;
423  Clusterer->ParamDesc[i].MidRange =
424  (ParamDesc[i].Max + ParamDesc[i].Min) / 2;
425  }
426 
427  // allocate a kd tree to hold the samples
428  Clusterer->KDTree = MakeKDTree (SampleSize, ParamDesc);
429 
430  // Initialize cache of histogram buckets to minimize recomputing them.
431  for (int d = 0; d < DISTRIBUTION_COUNT; ++d) {
432  for (int c = 0; c < MAXBUCKETS + 1 - MINBUCKETS; ++c)
433  Clusterer->bucket_cache[d][c] = NULL;
434  }
435 
436  return Clusterer;
437 } // MakeClusterer
438 
439 
454 SAMPLE* MakeSample(CLUSTERER * Clusterer, const FLOAT32* Feature,
455  inT32 CharID) {
456  SAMPLE *Sample;
457  int i;
458 
459  // see if the samples have already been clustered - if so trap an error
460  if (Clusterer->Root != NULL)
462  "Can't add samples after they have been clustered");
463 
464  // allocate the new sample and initialize it
465  Sample = (SAMPLE *) Emalloc (sizeof (SAMPLE) +
466  (Clusterer->SampleSize -
467  1) * sizeof (FLOAT32));
468  Sample->Clustered = FALSE;
469  Sample->Prototype = FALSE;
470  Sample->SampleCount = 1;
471  Sample->Left = NULL;
472  Sample->Right = NULL;
473  Sample->CharID = CharID;
474 
475  for (i = 0; i < Clusterer->SampleSize; i++)
476  Sample->Mean[i] = Feature[i];
477 
478  // add the sample to the KD tree - keep track of the total # of samples
479  Clusterer->NumberOfSamples++;
480  KDStore (Clusterer->KDTree, Sample->Mean, (char *) Sample);
481  if (CharID >= Clusterer->NumChar)
482  Clusterer->NumChar = CharID + 1;
483 
484  // execute hook for monitoring clustering operation
485  // (*SampleCreationHook)( Sample );
486 
487  return (Sample);
488 } // MakeSample
489 
490 
509  //only create cluster tree if samples have never been clustered before
510  if (Clusterer->Root == NULL)
511  CreateClusterTree(Clusterer);
512 
513  //deallocate the old prototype list if one exists
514  FreeProtoList (&Clusterer->ProtoList);
515  Clusterer->ProtoList = NIL_LIST;
516 
517  //compute prototypes starting at the root node in the tree
518  ComputePrototypes(Clusterer, Config);
519  return (Clusterer->ProtoList);
520 } // ClusterSamples
521 
522 
536 void FreeClusterer(CLUSTERER *Clusterer) {
537  if (Clusterer != NULL) {
538  memfree (Clusterer->ParamDesc);
539  if (Clusterer->KDTree != NULL)
540  FreeKDTree (Clusterer->KDTree);
541  if (Clusterer->Root != NULL)
542  FreeCluster (Clusterer->Root);
543  // Free up all used buckets structures.
544  for (int d = 0; d < DISTRIBUTION_COUNT; ++d) {
545  for (int c = 0; c < MAXBUCKETS + 1 - MINBUCKETS; ++c)
546  if (Clusterer->bucket_cache[d][c] != NULL)
547  FreeBuckets(Clusterer->bucket_cache[d][c]);
548  }
549 
550  memfree(Clusterer);
551  }
552 } // FreeClusterer
553 
554 
/*
 * Free an entire prototype list: the list nodes are destroyed and
 * FreePrototype is invoked on each prototype they held.
 */
void FreeProtoList(LIST *ProtoList) {
  destroy_nodes(*ProtoList, FreePrototype);
} // FreeProtoList
567 
568 
579 void FreePrototype(void *arg) { //PROTOTYPE *Prototype)
580  PROTOTYPE *Prototype = (PROTOTYPE *) arg;
581 
582  // unmark the corresponding cluster (if there is one
583  if (Prototype->Cluster != NULL)
584  Prototype->Cluster->Prototype = FALSE;
585 
586  // deallocate the prototype statistics and then the prototype itself
587  if (Prototype->Distrib != NULL)
588  memfree (Prototype->Distrib);
589  if (Prototype->Mean != NULL)
590  memfree (Prototype->Mean);
591  if (Prototype->Style != spherical) {
592  if (Prototype->Variance.Elliptical != NULL)
593  memfree (Prototype->Variance.Elliptical);
594  if (Prototype->Magnitude.Elliptical != NULL)
595  memfree (Prototype->Magnitude.Elliptical);
596  if (Prototype->Weight.Elliptical != NULL)
597  memfree (Prototype->Weight.Elliptical);
598  }
599  memfree(Prototype);
600 } // FreePrototype
601 
602 
618 CLUSTER *NextSample(LIST *SearchState) {
619  CLUSTER *Cluster;
620 
621  if (*SearchState == NIL_LIST)
622  return (NULL);
623  Cluster = (CLUSTER *) first_node (*SearchState);
624  *SearchState = pop (*SearchState);
625  while (TRUE) {
626  if (Cluster->Left == NULL)
627  return (Cluster);
628  *SearchState = push (*SearchState, Cluster->Right);
629  Cluster = Cluster->Left;
630  }
631 } // NextSample
632 
633 
/*
 * Return the mean of prototype Proto along the specified Dimension.
 */
FLOAT32 Mean(PROTOTYPE *Proto, uinT16 Dimension) {
  return (Proto->Mean[Dimension]);
} // Mean
646 
647 
658  switch (Proto->Style) {
659  case spherical:
660  return ((FLOAT32) sqrt ((double) Proto->Variance.Spherical));
661  case elliptical:
662  return ((FLOAT32)
663  sqrt ((double) Proto->Variance.Elliptical[Dimension]));
664  case mixed:
665  switch (Proto->Distrib[Dimension]) {
666  case normal:
667  return ((FLOAT32)
668  sqrt ((double) Proto->Variance.Elliptical[Dimension]));
669  case uniform:
670  case D_random:
671  return (Proto->Variance.Elliptical[Dimension]);
672  case DISTRIBUTION_COUNT:
673  ASSERT_HOST(!"Distribution count not allowed!");
674  }
675  }
676  return 0.0f;
677 } // StandardDeviation
678 
679 
680 /*---------------------------------------------------------------------------
681  Private Code
682 ----------------------------------------------------------------------------*/
/*
 * Perform bottom-up agglomerative clustering on the samples in the
 * clusterer's kd-tree: repeatedly merge the closest pair of clusters
 * until a single root cluster remains, which becomes Clusterer->Root.
 * The kd-tree is consumed by this process (freed and set to NULL).
 * Note: TEMPCLUSTER slots in context.candidates are reused — after a
 * merge the slot's Cluster pointer is redirected to the new cluster
 * and the entry is re-pushed with a freshly computed distance key, so
 * the exact order of operations below matters.
 */
void CreateClusterTree(CLUSTERER *Clusterer) {
  ClusteringContext context;
  ClusterPair HeapEntry;
  TEMPCLUSTER *PotentialCluster;

  // each sample and its nearest neighbor form a "potential" cluster
  // save these in a heap with the "best" potential clusters on top
  context.tree = Clusterer->KDTree;
  context.candidates = (TEMPCLUSTER *)
    Emalloc(Clusterer->NumberOfSamples * sizeof(TEMPCLUSTER));
  context.next = 0;
  context.heap = new ClusterHeap(Clusterer->NumberOfSamples);
  KDWalk(context.tree, (void_proc)MakePotentialClusters, &context);

  // form potential clusters into actual clusters - always do "best" first
  while (context.heap->Pop(&HeapEntry)) {
    PotentialCluster = HeapEntry.data;

    // if main cluster of potential cluster is already in another cluster
    // then we don't need to worry about it
    if (PotentialCluster->Cluster->Clustered) {
      continue;
    }

    // if main cluster is not yet clustered, but its nearest neighbor is
    // then we must find a new nearest neighbor
    else if (PotentialCluster->Neighbor->Clustered) {
      PotentialCluster->Neighbor =
        FindNearestNeighbor(context.tree, PotentialCluster->Cluster,
                            &HeapEntry.key);
      // re-queue the entry with the newly computed distance key
      if (PotentialCluster->Neighbor != NULL) {
        context.heap->Push(&HeapEntry);
      }
    }

    // if neither cluster is already clustered, form permanent cluster
    else {
      PotentialCluster->Cluster =
        MakeNewCluster(Clusterer, PotentialCluster);
      // the merged cluster itself becomes a candidate for further merging
      PotentialCluster->Neighbor =
        FindNearestNeighbor(context.tree, PotentialCluster->Cluster,
                            &HeapEntry.key);
      if (PotentialCluster->Neighbor != NULL) {
        context.heap->Push(&HeapEntry);
      }
    }
  }

  // the root node in the cluster tree is now the only node in the kd-tree
  Clusterer->Root = (CLUSTER *) RootOf(Clusterer->KDTree);

  // free up the memory used by the K-D tree, heap, and temp clusters
  FreeKDTree(context.tree);
  Clusterer->KDTree = NULL;
  delete context.heap;
  memfree(context.candidates);
} // CreateClusterTree
755 
756 
769  CLUSTER *Cluster, inT32 Level) {
770  ClusterPair HeapEntry;
771  int next = context->next;
772  context->candidates[next].Cluster = Cluster;
773  HeapEntry.data = &(context->candidates[next]);
774  context->candidates[next].Neighbor =
775  FindNearestNeighbor(context->tree,
776  context->candidates[next].Cluster,
777  &HeapEntry.key);
778  if (context->candidates[next].Neighbor != NULL) {
779  context->heap->Push(&HeapEntry);
780  context->next++;
781  }
782 } // MakePotentialClusters
783 
784 
801 CLUSTER *
802 FindNearestNeighbor(KDTREE * Tree, CLUSTER * Cluster, FLOAT32 * Distance)
803 #define MAXNEIGHBORS 2
804 #define MAXDISTANCE MAX_FLOAT32
805 {
806  CLUSTER *Neighbor[MAXNEIGHBORS];
807  FLOAT32 Dist[MAXNEIGHBORS];
808  int NumberOfNeighbors;
809  inT32 i;
810  CLUSTER *BestNeighbor;
811 
812  // find the 2 nearest neighbors of the cluster
814  &NumberOfNeighbors, (void **)Neighbor, Dist);
815 
816  // search for the nearest neighbor that is not the cluster itself
817  *Distance = MAXDISTANCE;
818  BestNeighbor = NULL;
819  for (i = 0; i < NumberOfNeighbors; i++) {
820  if ((Dist[i] < *Distance) && (Neighbor[i] != Cluster)) {
821  *Distance = Dist[i];
822  BestNeighbor = Neighbor[i];
823  }
824  }
825  return BestNeighbor;
826 } // FindNearestNeighbor
827 
828 
841 CLUSTER *MakeNewCluster(CLUSTERER *Clusterer, TEMPCLUSTER *TempCluster) {
842  CLUSTER *Cluster;
843 
844  // allocate the new cluster and initialize it
845  Cluster = (CLUSTER *) Emalloc(
846  sizeof(CLUSTER) + (Clusterer->SampleSize - 1) * sizeof(FLOAT32));
847  Cluster->Clustered = FALSE;
848  Cluster->Prototype = FALSE;
849  Cluster->Left = TempCluster->Cluster;
850  Cluster->Right = TempCluster->Neighbor;
851  Cluster->CharID = -1;
852 
853  // mark the old clusters as "clustered" and delete them from the kd-tree
854  Cluster->Left->Clustered = TRUE;
855  Cluster->Right->Clustered = TRUE;
856  KDDelete(Clusterer->KDTree, Cluster->Left->Mean, Cluster->Left);
857  KDDelete(Clusterer->KDTree, Cluster->Right->Mean, Cluster->Right);
858 
859  // compute the mean and sample count for the new cluster
860  Cluster->SampleCount =
861  MergeClusters(Clusterer->SampleSize, Clusterer->ParamDesc,
862  Cluster->Left->SampleCount, Cluster->Right->SampleCount,
863  Cluster->Mean, Cluster->Left->Mean, Cluster->Right->Mean);
864 
865  // add the new cluster to the KD tree
866  KDStore(Clusterer->KDTree, Cluster->Mean, Cluster);
867  return Cluster;
868 } // MakeNewCluster
869 
870 
887  PARAM_DESC ParamDesc[],
888  inT32 n1,
889  inT32 n2,
890  FLOAT32 m[],
891  FLOAT32 m1[], FLOAT32 m2[]) {
892  inT32 i, n;
893 
894  n = n1 + n2;
895  for (i = N; i > 0; i--, ParamDesc++, m++, m1++, m2++) {
896  if (ParamDesc->Circular) {
897  // if distance between means is greater than allowed
898  // reduce upper point by one "rotation" to compute mean
899  // then normalize the mean back into the accepted range
900  if ((*m2 - *m1) > ParamDesc->HalfRange) {
901  *m = (n1 * *m1 + n2 * (*m2 - ParamDesc->Range)) / n;
902  if (*m < ParamDesc->Min)
903  *m += ParamDesc->Range;
904  }
905  else if ((*m1 - *m2) > ParamDesc->HalfRange) {
906  *m = (n1 * (*m1 - ParamDesc->Range) + n2 * *m2) / n;
907  if (*m < ParamDesc->Min)
908  *m += ParamDesc->Range;
909  }
910  else
911  *m = (n1 * *m1 + n2 * *m2) / n;
912  }
913  else
914  *m = (n1 * *m1 + n2 * *m2) / n;
915  }
916  return n;
917 } // MergeClusters
918 
919 
932  LIST ClusterStack = NIL_LIST;
933  CLUSTER *Cluster;
934  PROTOTYPE *Prototype;
935 
936  // use a stack to keep track of clusters waiting to be processed
937  // initially the only cluster on the stack is the root cluster
938  if (Clusterer->Root != NULL)
939  ClusterStack = push (NIL_LIST, Clusterer->Root);
940 
941  // loop until we have analyzed all clusters which are potential prototypes
942  while (ClusterStack != NIL_LIST) {
943  // remove the next cluster to be analyzed from the stack
944  // try to make a prototype from the cluster
945  // if successful, put it on the proto list, else split the cluster
946  Cluster = (CLUSTER *) first_node (ClusterStack);
947  ClusterStack = pop (ClusterStack);
948  Prototype = MakePrototype(Clusterer, Config, Cluster);
949  if (Prototype != NULL) {
950  Clusterer->ProtoList = push (Clusterer->ProtoList, Prototype);
951  }
952  else {
953  ClusterStack = push (ClusterStack, Cluster->Right);
954  ClusterStack = push (ClusterStack, Cluster->Left);
955  }
956  }
957 } // ComputePrototypes
958 
959 
980  CLUSTER *Cluster) {
981  STATISTICS *Statistics;
982  PROTOTYPE *Proto;
983  BUCKETS *Buckets;
984 
985  // filter out clusters which contain samples from the same character
986  if (MultipleCharSamples (Clusterer, Cluster, Config->MaxIllegal))
987  return NULL;
988 
989  // compute the covariance matrix and ranges for the cluster
990  Statistics =
991  ComputeStatistics(Clusterer->SampleSize, Clusterer->ParamDesc, Cluster);
992 
993  // check for degenerate clusters which need not be analyzed further
994  // note that the MinSamples test assumes that all clusters with multiple
995  // character samples have been removed (as above)
996  Proto = MakeDegenerateProto(
997  Clusterer->SampleSize, Cluster, Statistics, Config->ProtoStyle,
998  (inT32) (Config->MinSamples * Clusterer->NumChar));
999  if (Proto != NULL) {
1000  FreeStatistics(Statistics);
1001  return Proto;
1002  }
1003  // check to ensure that all dimensions are independent
1004  if (!Independent(Clusterer->ParamDesc, Clusterer->SampleSize,
1005  Statistics->CoVariance, Config->Independence)) {
1006  FreeStatistics(Statistics);
1007  return NULL;
1008  }
1009 
1010  if (HOTELLING && Config->ProtoStyle == elliptical) {
1011  Proto = TestEllipticalProto(Clusterer, Config, Cluster, Statistics);
1012  if (Proto != NULL) {
1013  FreeStatistics(Statistics);
1014  return Proto;
1015  }
1016  }
1017 
1018  // create a histogram data structure used to evaluate distributions
1019  Buckets = GetBuckets(Clusterer, normal, Cluster->SampleCount,
1020  Config->Confidence);
1021 
1022  // create a prototype based on the statistics and test it
1023  switch (Config->ProtoStyle) {
1024  case spherical:
1025  Proto = MakeSphericalProto(Clusterer, Cluster, Statistics, Buckets);
1026  break;
1027  case elliptical:
1028  Proto = MakeEllipticalProto(Clusterer, Cluster, Statistics, Buckets);
1029  break;
1030  case mixed:
1031  Proto = MakeMixedProto(Clusterer, Cluster, Statistics, Buckets,
1032  Config->Confidence);
1033  break;
1034  case automatic:
1035  Proto = MakeSphericalProto(Clusterer, Cluster, Statistics, Buckets);
1036  if (Proto != NULL)
1037  break;
1038  Proto = MakeEllipticalProto(Clusterer, Cluster, Statistics, Buckets);
1039  if (Proto != NULL)
1040  break;
1041  Proto = MakeMixedProto(Clusterer, Cluster, Statistics, Buckets,
1042  Config->Confidence);
1043  break;
1044  }
1045  FreeStatistics(Statistics);
1046  return Proto;
1047 } // MakePrototype
1048 
1049 
1071 PROTOTYPE *MakeDegenerateProto( //this was MinSample
1072  uinT16 N,
1073  CLUSTER *Cluster,
1074  STATISTICS *Statistics,
1075  PROTOSTYLE Style,
1076  inT32 MinSamples) {
1077  PROTOTYPE *Proto = NULL;
1078 
1079  if (MinSamples < MINSAMPLESNEEDED)
1080  MinSamples = MINSAMPLESNEEDED;
1081 
1082  if (Cluster->SampleCount < MinSamples) {
1083  switch (Style) {
1084  case spherical:
1085  Proto = NewSphericalProto (N, Cluster, Statistics);
1086  break;
1087  case elliptical:
1088  case automatic:
1089  Proto = NewEllipticalProto (N, Cluster, Statistics);
1090  break;
1091  case mixed:
1092  Proto = NewMixedProto (N, Cluster, Statistics);
1093  break;
1094  }
1095  Proto->Significant = FALSE;
1096  }
1097  return (Proto);
1098 } // MakeDegenerateProto
1099 
1115  CLUSTER *Cluster,
1116  STATISTICS *Statistics) {
1117  // Fraction of the number of samples used as a range around 1 within
1118  // which a cluster has the magic size that allows a boost to the
1119  // FTable by kFTableBoostMargin, thus allowing clusters near the
1120  // magic size (equal to the number of sample characters) to be more
1121  // likely to stay together.
1122  const double kMagicSampleMargin = 0.0625;
1123  const double kFTableBoostMargin = 2.0;
1124 
1125  int N = Clusterer->SampleSize;
1126  CLUSTER* Left = Cluster->Left;
1127  CLUSTER* Right = Cluster->Right;
1128  if (Left == NULL || Right == NULL)
1129  return NULL;
1130  int TotalDims = Left->SampleCount + Right->SampleCount;
1131  if (TotalDims < N + 1 || TotalDims < 2)
1132  return NULL;
1133  const int kMatrixSize = N * N * sizeof(FLOAT32);
1134  FLOAT32* Covariance = reinterpret_cast<FLOAT32 *>(Emalloc(kMatrixSize));
1135  FLOAT32* Inverse = reinterpret_cast<FLOAT32 *>(Emalloc(kMatrixSize));
1136  FLOAT32* Delta = reinterpret_cast<FLOAT32*>(Emalloc(N * sizeof(FLOAT32)));
1137  // Compute a new covariance matrix that only uses essential features.
1138  for (int i = 0; i < N; ++i) {
1139  int row_offset = i * N;
1140  if (!Clusterer->ParamDesc[i].NonEssential) {
1141  for (int j = 0; j < N; ++j) {
1142  if (!Clusterer->ParamDesc[j].NonEssential)
1143  Covariance[j + row_offset] = Statistics->CoVariance[j + row_offset];
1144  else
1145  Covariance[j + row_offset] = 0.0f;
1146  }
1147  } else {
1148  for (int j = 0; j < N; ++j) {
1149  if (i == j)
1150  Covariance[j + row_offset] = 1.0f;
1151  else
1152  Covariance[j + row_offset] = 0.0f;
1153  }
1154  }
1155  }
1156  double err = InvertMatrix(Covariance, N, Inverse);
1157  if (err > 1) {
1158  tprintf("Clustering error: Matrix inverse failed with error %g\n", err);
1159  }
1160  int EssentialN = 0;
1161  for (int dim = 0; dim < N; ++dim) {
1162  if (!Clusterer->ParamDesc[dim].NonEssential) {
1163  Delta[dim] = Left->Mean[dim] - Right->Mean[dim];
1164  ++EssentialN;
1165  } else {
1166  Delta[dim] = 0.0f;
1167  }
1168  }
1169  // Compute Hotelling's T-squared.
1170  double Tsq = 0.0;
1171  for (int x = 0; x < N; ++x) {
1172  double temp = 0.0;
1173  for (int y = 0; y < N; ++y) {
1174  temp += Inverse[y + N*x] * Delta[y];
1175  }
1176  Tsq += Delta[x] * temp;
1177  }
1178  memfree(Covariance);
1179  memfree(Inverse);
1180  memfree(Delta);
1181  // Changed this function to match the formula in
1182  // Statistical Methods in Medical Research p 473
1183  // By Peter Armitage, Geoffrey Berry, J. N. S. Matthews.
1184  // Tsq *= Left->SampleCount * Right->SampleCount / TotalDims;
1185  double F = Tsq * (TotalDims - EssentialN - 1) / ((TotalDims - 2)*EssentialN);
1186  int Fx = EssentialN;
1187  if (Fx > FTABLE_X)
1188  Fx = FTABLE_X;
1189  --Fx;
1190  int Fy = TotalDims - EssentialN - 1;
1191  if (Fy > FTABLE_Y)
1192  Fy = FTABLE_Y;
1193  --Fy;
1194  double FTarget = FTable[Fy][Fx];
1195  if (Config->MagicSamples > 0 &&
1196  TotalDims >= Config->MagicSamples * (1.0 - kMagicSampleMargin) &&
1197  TotalDims <= Config->MagicSamples * (1.0 + kMagicSampleMargin)) {
1198  // Give magic-sized clusters a magic FTable boost.
1199  FTarget += kFTableBoostMargin;
1200  }
1201  if (F < FTarget) {
1202  return NewEllipticalProto (Clusterer->SampleSize, Cluster, Statistics);
1203  }
1204  return NULL;
1205 }
1206 
1207 /* MakeSphericalProto *******************************************************
1208 Parameters: Clusterer data struct containing samples being clustered
1209  Cluster cluster to be made into a spherical prototype
1210  Statistics statistical info about cluster
1211  Buckets histogram struct used to analyze distribution
1212 Operation: This routine tests the specified cluster to see if it can
1213  be approximated by a spherical normal distribution. If it
1214  can be, then a new prototype is formed and returned to the
1215  caller. If it can't be, then NULL is returned to the caller.
1216 Return: Pointer to new spherical prototype or NULL.
1217 Exceptions: None
1218 History: 6/1/89, DSJ, Created.
1219 ******************************************************************************/
1221  CLUSTER *Cluster,
1222  STATISTICS *Statistics,
1223  BUCKETS *Buckets) {
1224  PROTOTYPE *Proto = NULL;
1225  int i;
1226 
1227  // check that each dimension is a normal distribution
1228  for (i = 0; i < Clusterer->SampleSize; i++) {
1229  if (Clusterer->ParamDesc[i].NonEssential)
1230  continue;
1231 
1232  FillBuckets (Buckets, Cluster, i, &(Clusterer->ParamDesc[i]),
1233  Cluster->Mean[i],
1234  sqrt ((FLOAT64) (Statistics->AvgVariance)));
1235  if (!DistributionOK (Buckets))
1236  break;
1237  }
1238  // if all dimensions matched a normal distribution, make a proto
1239  if (i >= Clusterer->SampleSize)
1240  Proto = NewSphericalProto (Clusterer->SampleSize, Cluster, Statistics);
1241  return (Proto);
1242 } // MakeSphericalProto
1243 
1244 
1259  CLUSTER *Cluster,
1260  STATISTICS *Statistics,
1261  BUCKETS *Buckets) {
1262  PROTOTYPE *Proto = NULL;
1263  int i;
1264 
1265  // check that each dimension is a normal distribution
1266  for (i = 0; i < Clusterer->SampleSize; i++) {
1267  if (Clusterer->ParamDesc[i].NonEssential)
1268  continue;
1269 
1270  FillBuckets (Buckets, Cluster, i, &(Clusterer->ParamDesc[i]),
1271  Cluster->Mean[i],
1272  sqrt ((FLOAT64) Statistics->
1273  CoVariance[i * (Clusterer->SampleSize + 1)]));
1274  if (!DistributionOK (Buckets))
1275  break;
1276  }
1277  // if all dimensions matched a normal distribution, make a proto
1278  if (i >= Clusterer->SampleSize)
1279  Proto = NewEllipticalProto (Clusterer->SampleSize, Cluster, Statistics);
1280  return (Proto);
1281 } // MakeEllipticalProto
1282 
1283 
1303  CLUSTER *Cluster,
1304  STATISTICS *Statistics,
1305  BUCKETS *NormalBuckets,
1306  FLOAT64 Confidence) {
1307  PROTOTYPE *Proto;
1308  int i;
1309  BUCKETS *UniformBuckets = NULL;
1310  BUCKETS *RandomBuckets = NULL;
1311 
1312  // create a mixed proto to work on - initially assume all dimensions normal*/
1313  Proto = NewMixedProto (Clusterer->SampleSize, Cluster, Statistics);
1314 
1315  // find the proper distribution for each dimension
1316  for (i = 0; i < Clusterer->SampleSize; i++) {
1317  if (Clusterer->ParamDesc[i].NonEssential)
1318  continue;
1319 
1320  FillBuckets (NormalBuckets, Cluster, i, &(Clusterer->ParamDesc[i]),
1321  Proto->Mean[i],
1322  sqrt ((FLOAT64) Proto->Variance.Elliptical[i]));
1323  if (DistributionOK (NormalBuckets))
1324  continue;
1325 
1326  if (RandomBuckets == NULL)
1327  RandomBuckets =
1328  GetBuckets(Clusterer, D_random, Cluster->SampleCount, Confidence);
1329  MakeDimRandom (i, Proto, &(Clusterer->ParamDesc[i]));
1330  FillBuckets (RandomBuckets, Cluster, i, &(Clusterer->ParamDesc[i]),
1331  Proto->Mean[i], Proto->Variance.Elliptical[i]);
1332  if (DistributionOK (RandomBuckets))
1333  continue;
1334 
1335  if (UniformBuckets == NULL)
1336  UniformBuckets =
1337  GetBuckets(Clusterer, uniform, Cluster->SampleCount, Confidence);
1338  MakeDimUniform(i, Proto, Statistics);
1339  FillBuckets (UniformBuckets, Cluster, i, &(Clusterer->ParamDesc[i]),
1340  Proto->Mean[i], Proto->Variance.Elliptical[i]);
1341  if (DistributionOK (UniformBuckets))
1342  continue;
1343  break;
1344  }
1345  // if any dimension failed to match a distribution, discard the proto
1346  if (i < Clusterer->SampleSize) {
1347  FreePrototype(Proto);
1348  Proto = NULL;
1349  }
1350  return (Proto);
1351 } // MakeMixedProto
1352 
1353 
1354 /* MakeDimRandom *************************************************************
1355 Parameters: i index of dimension to be changed
1356  Proto prototype whose dimension is to be altered
1357  ParamDesc description of specified dimension
1358 Operation: This routine alters the ith dimension of the specified
1359  mixed prototype to be D_random.
1360 Return: None
1361 Exceptions: None
1362 History: 6/20/89, DSJ, Created.
1363 ******************************************************************************/
1364 void MakeDimRandom(uinT16 i, PROTOTYPE *Proto, PARAM_DESC *ParamDesc) {
1365  Proto->Distrib[i] = D_random;
1366  Proto->Mean[i] = ParamDesc->MidRange;
1367  Proto->Variance.Elliptical[i] = ParamDesc->HalfRange;
1368 
1369  // subtract out the previous magnitude of this dimension from the total
1370  Proto->TotalMagnitude /= Proto->Magnitude.Elliptical[i];
1371  Proto->Magnitude.Elliptical[i] = 1.0 / ParamDesc->Range;
1372  Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
1373  Proto->LogMagnitude = log ((double) Proto->TotalMagnitude);
1374 
1375  // note that the proto Weight is irrelevant for D_random protos
1376 } // MakeDimRandom
1377 
1378 
1389 void MakeDimUniform(uinT16 i, PROTOTYPE *Proto, STATISTICS *Statistics) {
1390  Proto->Distrib[i] = uniform;
1391  Proto->Mean[i] = Proto->Cluster->Mean[i] +
1392  (Statistics->Min[i] + Statistics->Max[i]) / 2;
1393  Proto->Variance.Elliptical[i] =
1394  (Statistics->Max[i] - Statistics->Min[i]) / 2;
1395  if (Proto->Variance.Elliptical[i] < MINVARIANCE)
1396  Proto->Variance.Elliptical[i] = MINVARIANCE;
1397 
1398  // subtract out the previous magnitude of this dimension from the total
1399  Proto->TotalMagnitude /= Proto->Magnitude.Elliptical[i];
1400  Proto->Magnitude.Elliptical[i] =
1401  1.0 / (2.0 * Proto->Variance.Elliptical[i]);
1402  Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
1403  Proto->LogMagnitude = log ((double) Proto->TotalMagnitude);
1404 
1405  // note that the proto Weight is irrelevant for uniform protos
1406 } // MakeDimUniform
1407 
1408 
/* ComputeStatistics *********************************************************
Parameters: N number of dimensions
  ParamDesc array of dimension descriptions
  Cluster cluster whose statistics are to be computed
Operation: This routine visits every sample in the specified cluster,
  measures each sample's distance from the cluster mean in
  every dimension (wrapping circular dimensions the short way
  around), and accumulates the full NxN covariance matrix of
  those distances along with the min/max distance seen per
  dimension. The variances are normalized to an unbiased
  estimate and AvgVariance is set to the geometric mean of
  the diagonal variances.
Return: Pointer to a newly allocated STATISTICS structure; the
  caller is responsible for releasing it (see FreeStatistics).
Exceptions: None
******************************************************************************/
STATISTICS *
ComputeStatistics (inT16 N, PARAM_DESC ParamDesc[], CLUSTER * Cluster) {
  STATISTICS *Statistics;
  int i, j;
  FLOAT32 *CoVariance;       // walking pointer over the NxN matrix
  FLOAT32 *Distance;         // per-dimension sample-to-mean distance
  LIST SearchState;
  SAMPLE *Sample;
  uinT32 SampleCountAdjustedForBias;

  // allocate memory to hold the statistics results
  Statistics = (STATISTICS *) Emalloc (sizeof (STATISTICS));
  Statistics->CoVariance = (FLOAT32 *) Emalloc (N * N * sizeof (FLOAT32));
  Statistics->Min = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
  Statistics->Max = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));

  // allocate temporary memory to hold the sample to mean distances
  Distance = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));

  // initialize the statistics
  Statistics->AvgVariance = 1.0;
  CoVariance = Statistics->CoVariance;
  for (i = 0; i < N; i++) {
    Statistics->Min[i] = 0.0;
    Statistics->Max[i] = 0.0;
    for (j = 0; j < N; j++, CoVariance++)
      *CoVariance = 0;
  }
  // find each sample in the cluster and merge it into the statistics
  InitSampleSearch(SearchState, Cluster);
  while ((Sample = NextSample (&SearchState)) != NULL) {
    for (i = 0; i < N; i++) {
      Distance[i] = Sample->Mean[i] - Cluster->Mean[i];
      // circular dimensions wrap: fold the distance into
      // [-HalfRange, HalfRange] so it is measured the short way around
      if (ParamDesc[i].Circular) {
        if (Distance[i] > ParamDesc[i].HalfRange)
          Distance[i] -= ParamDesc[i].Range;
        if (Distance[i] < -ParamDesc[i].HalfRange)
          Distance[i] += ParamDesc[i].Range;
      }
      if (Distance[i] < Statistics->Min[i])
        Statistics->Min[i] = Distance[i];
      if (Distance[i] > Statistics->Max[i])
        Statistics->Max[i] = Distance[i];
    }
    // accumulate the outer product Distance * Distance^T into CoVariance
    CoVariance = Statistics->CoVariance;
    for (i = 0; i < N; i++)
      for (j = 0; j < N; j++, CoVariance++)
        *CoVariance += Distance[i] * Distance[j];
  }
  // normalize the variances by the total number of samples
  // use SampleCount-1 instead of SampleCount to get an unbiased estimate
  // also compute the geometric mean of the diagonal variances
  // ensure that clusters with only 1 sample are handled correctly
  if (Cluster->SampleCount > 1)
    SampleCountAdjustedForBias = Cluster->SampleCount - 1;
  else
    SampleCountAdjustedForBias = 1;
  CoVariance = Statistics->CoVariance;
  for (i = 0; i < N; i++)
  for (j = 0; j < N; j++, CoVariance++) {
    *CoVariance /= SampleCountAdjustedForBias;
    if (j == i) {
      // clip each diagonal variance to MINVARIANCE so later code
      // dividing by (or taking sqrt of) a variance never sees zero
      if (*CoVariance < MINVARIANCE)
        *CoVariance = MINVARIANCE;
      Statistics->AvgVariance *= *CoVariance;
    }
  }
  // AvgVariance currently holds the product of the N diagonal variances;
  // take the Nth root to turn it into their geometric mean
  Statistics->AvgVariance = (float)pow((double)Statistics->AvgVariance,
                                       1.0 / N);

  // release temporary memory and return
  memfree(Distance);
  return (Statistics);
} // ComputeStatistics
1499 
1500 
1515  CLUSTER *Cluster,
1516  STATISTICS *Statistics) {
1517  PROTOTYPE *Proto;
1518 
1519  Proto = NewSimpleProto (N, Cluster);
1520 
1521  Proto->Variance.Spherical = Statistics->AvgVariance;
1522  if (Proto->Variance.Spherical < MINVARIANCE)
1523  Proto->Variance.Spherical = MINVARIANCE;
1524 
1525  Proto->Magnitude.Spherical =
1526  1.0 / sqrt ((double) (2.0 * PI * Proto->Variance.Spherical));
1527  Proto->TotalMagnitude = (float)pow((double)Proto->Magnitude.Spherical,
1528  (double) N);
1529  Proto->Weight.Spherical = 1.0 / Proto->Variance.Spherical;
1530  Proto->LogMagnitude = log ((double) Proto->TotalMagnitude);
1531 
1532  return (Proto);
1533 } // NewSphericalProto
1534 
1535 
1549  CLUSTER *Cluster,
1550  STATISTICS *Statistics) {
1551  PROTOTYPE *Proto;
1552  FLOAT32 *CoVariance;
1553  int i;
1554 
1555  Proto = NewSimpleProto (N, Cluster);
1556  Proto->Variance.Elliptical = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
1557  Proto->Magnitude.Elliptical = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
1558  Proto->Weight.Elliptical = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
1559 
1560  CoVariance = Statistics->CoVariance;
1561  Proto->TotalMagnitude = 1.0;
1562  for (i = 0; i < N; i++, CoVariance += N + 1) {
1563  Proto->Variance.Elliptical[i] = *CoVariance;
1564  if (Proto->Variance.Elliptical[i] < MINVARIANCE)
1565  Proto->Variance.Elliptical[i] = MINVARIANCE;
1566 
1567  Proto->Magnitude.Elliptical[i] =
1568  1.0 / sqrt ((double) (2.0 * PI * Proto->Variance.Elliptical[i]));
1569  Proto->Weight.Elliptical[i] = 1.0 / Proto->Variance.Elliptical[i];
1570  Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
1571  }
1572  Proto->LogMagnitude = log ((double) Proto->TotalMagnitude);
1573  Proto->Style = elliptical;
1574  return (Proto);
1575 } // NewEllipticalProto
1576 
1577 
1593 PROTOTYPE *NewMixedProto(inT16 N, CLUSTER *Cluster, STATISTICS *Statistics) {
1594  PROTOTYPE *Proto;
1595  int i;
1596 
1597  Proto = NewEllipticalProto (N, Cluster, Statistics);
1598  Proto->Distrib = (DISTRIBUTION *) Emalloc (N * sizeof (DISTRIBUTION));
1599 
1600  for (i = 0; i < N; i++) {
1601  Proto->Distrib[i] = normal;
1602  }
1603  Proto->Style = mixed;
1604  return (Proto);
1605 } // NewMixedProto
1606 
1607 
1619  PROTOTYPE *Proto;
1620  int i;
1621 
1622  Proto = (PROTOTYPE *) Emalloc (sizeof (PROTOTYPE));
1623  Proto->Mean = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
1624 
1625  for (i = 0; i < N; i++)
1626  Proto->Mean[i] = Cluster->Mean[i];
1627  Proto->Distrib = NULL;
1628 
1629  Proto->Significant = TRUE;
1630  Proto->Merged = FALSE;
1631  Proto->Style = spherical;
1632  Proto->NumSamples = Cluster->SampleCount;
1633  Proto->Cluster = Cluster;
1634  Proto->Cluster->Prototype = TRUE;
1635  return (Proto);
1636 } // NewSimpleProto
1637 
1638 
1659 BOOL8
1661 inT16 N, FLOAT32 * CoVariance, FLOAT32 Independence) {
1662  int i, j;
1663  FLOAT32 *VARii; // points to ith on-diagonal element
1664  FLOAT32 *VARjj; // points to jth on-diagonal element
1665  FLOAT32 CorrelationCoeff;
1666 
1667  VARii = CoVariance;
1668  for (i = 0; i < N; i++, VARii += N + 1) {
1669  if (ParamDesc[i].NonEssential)
1670  continue;
1671 
1672  VARjj = VARii + N + 1;
1673  CoVariance = VARii + 1;
1674  for (j = i + 1; j < N; j++, CoVariance++, VARjj += N + 1) {
1675  if (ParamDesc[j].NonEssential)
1676  continue;
1677 
1678  if ((*VARii == 0.0) || (*VARjj == 0.0))
1679  CorrelationCoeff = 0.0;
1680  else
1681  CorrelationCoeff =
1682  sqrt (sqrt (*CoVariance * *CoVariance / (*VARii * *VARjj)));
1683  if (CorrelationCoeff > Independence)
1684  return (FALSE);
1685  }
1686  }
1687  return (TRUE);
1688 } // Independent
1689 
1690 
1710  DISTRIBUTION Distribution,
1711  uinT32 SampleCount,
1712  FLOAT64 Confidence) {
1713  // Get an old bucket structure with the same number of buckets.
1714  uinT16 NumberOfBuckets = OptimumNumberOfBuckets(SampleCount);
1715  BUCKETS *Buckets =
1716  clusterer->bucket_cache[Distribution][NumberOfBuckets - MINBUCKETS];
1717 
1718  // If a matching bucket structure is not found, make one and save it.
1719  if (Buckets == NULL) {
1720  Buckets = MakeBuckets(Distribution, SampleCount, Confidence);
1721  clusterer->bucket_cache[Distribution][NumberOfBuckets - MINBUCKETS] =
1722  Buckets;
1723  } else {
1724  // Just adjust the existing buckets.
1725  if (SampleCount != Buckets->SampleCount)
1726  AdjustBuckets(Buckets, SampleCount);
1727  if (Confidence != Buckets->Confidence) {
1728  Buckets->Confidence = Confidence;
1729  Buckets->ChiSquared = ComputeChiSquared(
1730  DegreesOfFreedom(Distribution, Buckets->NumberOfBuckets),
1731  Confidence);
1732  }
1733  InitBuckets(Buckets);
1734  }
1735  return Buckets;
1736 } // GetBuckets
1737 
1738 
1760  uinT32 SampleCount,
1761  FLOAT64 Confidence) {
1762  const DENSITYFUNC DensityFunction[] =
1763  { NormalDensity, UniformDensity, UniformDensity };
1764  int i, j;
1765  BUCKETS *Buckets;
1766  FLOAT64 BucketProbability;
1767  FLOAT64 NextBucketBoundary;
1768  FLOAT64 Probability;
1769  FLOAT64 ProbabilityDelta;
1770  FLOAT64 LastProbDensity;
1771  FLOAT64 ProbDensity;
1772  uinT16 CurrentBucket;
1773  BOOL8 Symmetrical;
1774 
1775  // allocate memory needed for data structure
1776  Buckets = reinterpret_cast<BUCKETS*>(Emalloc(sizeof(BUCKETS)));
1777  Buckets->NumberOfBuckets = OptimumNumberOfBuckets(SampleCount);
1778  Buckets->SampleCount = SampleCount;
1779  Buckets->Confidence = Confidence;
1780  Buckets->Count = reinterpret_cast<uinT32*>(
1781  Emalloc(Buckets->NumberOfBuckets * sizeof(uinT32)));
1782  Buckets->ExpectedCount = reinterpret_cast<FLOAT32*>(
1783  Emalloc(Buckets->NumberOfBuckets * sizeof(FLOAT32)));
1784 
1785  // initialize simple fields
1786  Buckets->Distribution = Distribution;
1787  for (i = 0; i < Buckets->NumberOfBuckets; i++) {
1788  Buckets->Count[i] = 0;
1789  Buckets->ExpectedCount[i] = 0.0;
1790  }
1791 
1792  // all currently defined distributions are symmetrical
1793  Symmetrical = TRUE;
1794  Buckets->ChiSquared = ComputeChiSquared(
1795  DegreesOfFreedom(Distribution, Buckets->NumberOfBuckets), Confidence);
1796 
1797  if (Symmetrical) {
1798  // allocate buckets so that all have approx. equal probability
1799  BucketProbability = 1.0 / (FLOAT64) (Buckets->NumberOfBuckets);
1800 
1801  // distribution is symmetric so fill in upper half then copy
1802  CurrentBucket = Buckets->NumberOfBuckets / 2;
1803  if (Odd (Buckets->NumberOfBuckets))
1804  NextBucketBoundary = BucketProbability / 2;
1805  else
1806  NextBucketBoundary = BucketProbability;
1807 
1808  Probability = 0.0;
1809  LastProbDensity =
1810  (*DensityFunction[(int) Distribution]) (BUCKETTABLESIZE / 2);
1811  for (i = BUCKETTABLESIZE / 2; i < BUCKETTABLESIZE; i++) {
1812  ProbDensity = (*DensityFunction[(int) Distribution]) (i + 1);
1813  ProbabilityDelta = Integral (LastProbDensity, ProbDensity, 1.0);
1814  Probability += ProbabilityDelta;
1815  if (Probability > NextBucketBoundary) {
1816  if (CurrentBucket < Buckets->NumberOfBuckets - 1)
1817  CurrentBucket++;
1818  NextBucketBoundary += BucketProbability;
1819  }
1820  Buckets->Bucket[i] = CurrentBucket;
1821  Buckets->ExpectedCount[CurrentBucket] +=
1822  (FLOAT32) (ProbabilityDelta * SampleCount);
1823  LastProbDensity = ProbDensity;
1824  }
1825  // place any leftover probability into the last bucket
1826  Buckets->ExpectedCount[CurrentBucket] +=
1827  (FLOAT32) ((0.5 - Probability) * SampleCount);
1828 
1829  // copy upper half of distribution to lower half
1830  for (i = 0, j = BUCKETTABLESIZE - 1; i < j; i++, j--)
1831  Buckets->Bucket[i] =
1832  Mirror(Buckets->Bucket[j], Buckets->NumberOfBuckets);
1833 
1834  // copy upper half of expected counts to lower half
1835  for (i = 0, j = Buckets->NumberOfBuckets - 1; i <= j; i++, j--)
1836  Buckets->ExpectedCount[i] += Buckets->ExpectedCount[j];
1837  }
1838  return Buckets;
1839 } // MakeBuckets
1840 
1841 
1842 //---------------------------------------------------------------------------
1844 /*
1845  ** Parameters:
1846  ** SampleCount number of samples to be tested
1847  ** Operation:
1848  ** This routine computes the optimum number of histogram
1849  ** buckets that should be used in a chi-squared goodness of
1850  ** fit test for the specified number of samples. The optimum
1851  ** number is computed based on Table 4.1 on pg. 147 of
1852  ** "Measurement and Analysis of Random Data" by Bendat & Piersol.
1853  ** Linear interpolation is used to interpolate between table
1854  ** values. The table is intended for a 0.05 level of
1855  ** significance (alpha). This routine assumes that it is
1856  ** equally valid for other alpha's, which may not be true.
1857  ** Return:
1858  ** Optimum number of histogram buckets
1859  ** Exceptions:
1860  ** None
1861  ** History:
1862  ** 6/5/89, DSJ, Created.
1863  */
1864  uinT8 Last, Next;
1865  FLOAT32 Slope;
1866 
1867  if (SampleCount < kCountTable[0])
1868  return kBucketsTable[0];
1869 
1870  for (Last = 0, Next = 1; Next < LOOKUPTABLESIZE; Last++, Next++) {
1871  if (SampleCount <= kCountTable[Next]) {
1872  Slope = (FLOAT32) (kBucketsTable[Next] - kBucketsTable[Last]) /
1873  (FLOAT32) (kCountTable[Next] - kCountTable[Last]);
1874  return ((uinT16) (kBucketsTable[Last] +
1875  Slope * (SampleCount - kCountTable[Last])));
1876  }
1877  }
1878  return kBucketsTable[Last];
1879 } // OptimumNumberOfBuckets
1880 
1881 
1882 //---------------------------------------------------------------------------
1883 FLOAT64
1885 /*
1886  ** Parameters:
1887  ** DegreesOfFreedom determines shape of distribution
1888  ** Alpha probability of right tail
1889  ** Operation:
1890  ** This routine computes the chi-squared value which will
1891  ** leave a cumulative probability of Alpha in the right tail
1892  ** of a chi-squared distribution with the specified number of
1893  ** degrees of freedom. Alpha must be between 0 and 1.
1894  ** DegreesOfFreedom must be even. The routine maintains an
1895  ** array of lists. Each list corresponds to a different
1896  ** number of degrees of freedom. Each entry in the list
1897  ** corresponds to a different alpha value and its corresponding
1898  ** chi-squared value. Therefore, once a particular chi-squared
1899  ** value is computed, it is stored in the list and never
1900  ** needs to be computed again.
1901  ** Return: Desired chi-squared value
1902  ** Exceptions: none
1903  ** History: 6/5/89, DSJ, Created.
1904  */
1905 #define CHIACCURACY 0.01
1906 #define MINALPHA (1e-200)
1907 {
1908  static LIST ChiWith[MAXDEGREESOFFREEDOM + 1];
1909 
1910  CHISTRUCT *OldChiSquared;
1911  CHISTRUCT SearchKey;
1912 
1913  // limit the minimum alpha that can be used - if alpha is too small
1914  // it may not be possible to compute chi-squared.
1915  Alpha = ClipToRange(Alpha, MINALPHA, 1.0);
1916  if (Odd (DegreesOfFreedom))
1917  DegreesOfFreedom++;
1918 
1919  /* find the list of chi-squared values which have already been computed
1920  for the specified number of degrees of freedom. Search the list for
1921  the desired chi-squared. */
1922  SearchKey.Alpha = Alpha;
1923  OldChiSquared = (CHISTRUCT *) first_node (search (ChiWith[DegreesOfFreedom],
1924  &SearchKey, AlphaMatch));
1925 
1926  if (OldChiSquared == NULL) {
1927  OldChiSquared = NewChiStruct (DegreesOfFreedom, Alpha);
1928  OldChiSquared->ChiSquared = Solve (ChiArea, OldChiSquared,
1929  (FLOAT64) DegreesOfFreedom,
1930  (FLOAT64) CHIACCURACY);
1931  ChiWith[DegreesOfFreedom] = push (ChiWith[DegreesOfFreedom],
1932  OldChiSquared);
1933  }
1934  else {
1935  // further optimization might move OldChiSquared to front of list
1936  }
1937 
1938  return (OldChiSquared->ChiSquared);
1939 
1940 } // ComputeChiSquared
1941 
1942 
1943 //---------------------------------------------------------------------------
1945 /*
1946  ** Parameters:
1947  ** x number to compute the normal probability density for
1948  ** Globals:
1949  ** kNormalMean mean of a discrete normal distribution
1950  ** kNormalVariance variance of a discrete normal distribution
1951  ** kNormalMagnitude magnitude of a discrete normal distribution
1952  ** Operation:
1953  ** This routine computes the probability density function
1954  ** of a discrete normal distribution defined by the global
1955  ** variables kNormalMean, kNormalVariance, and kNormalMagnitude.
1956  ** Normal magnitude could, of course, be computed in terms of
1957  ** the normal variance but it is precomputed for efficiency.
1958  ** Return:
1959  ** The value of the normal distribution at x.
1960  ** Exceptions:
1961  ** None
1962  ** History:
1963  ** 6/4/89, DSJ, Created.
1964  */
1965  FLOAT64 Distance;
1966 
1967  Distance = x - kNormalMean;
1968  return kNormalMagnitude * exp(-0.5 * Distance * Distance / kNormalVariance);
1969 } // NormalDensity
1970 
1971 
1972 //---------------------------------------------------------------------------
1974 /*
1975  ** Parameters:
1976  ** x number to compute the uniform probability density for
1977  ** Operation:
1978  ** This routine computes the probability density function
1979  ** of a uniform distribution at the specified point. The
1980  ** range of the distribution is from 0 to BUCKETTABLESIZE.
1981  ** Return:
1982  ** The value of the uniform distribution at x.
1983  ** Exceptions:
1984  ** None
1985  ** History:
1986  ** 6/5/89, DSJ, Created.
1987  */
1988  static FLOAT64 UniformDistributionDensity = (FLOAT64) 1.0 / BUCKETTABLESIZE;
1989 
1990  if ((x >= 0.0) && (x <= BUCKETTABLESIZE))
1991  return UniformDistributionDensity;
1992  else
1993  return (FLOAT64) 0.0;
1994 } // UniformDensity
1995 
1996 
1997 //---------------------------------------------------------------------------
1999 /*
2000  ** Parameters:
2001  ** f1 value of function at x1
2002  ** f2 value of function at x2
2003  ** Dx x2 - x1 (should always be positive)
2004  ** Operation:
2005  ** This routine computes a trapezoidal approximation to the
2006  ** integral of a function over a small delta in x.
2007  ** Return:
2008  ** Approximation of the integral of the function from x1 to x2.
2009  ** Exceptions:
2010  ** None
2011  ** History:
2012  ** 6/5/89, DSJ, Created.
2013  */
2014  return (f1 + f2) * Dx / 2.0;
2015 } // Integral
2016 
2017 
2018 //---------------------------------------------------------------------------
2019 void FillBuckets(BUCKETS *Buckets,
2020  CLUSTER *Cluster,
2021  uinT16 Dim,
2022  PARAM_DESC *ParamDesc,
2023  FLOAT32 Mean,
2024  FLOAT32 StdDev) {
2025 /*
2026  ** Parameters:
2027  ** Buckets histogram buckets to count samples
2028  ** Cluster cluster whose samples are being analyzed
2029  ** Dim dimension of samples which is being analyzed
2030  ** ParamDesc description of the dimension
2031  ** Mean "mean" of the distribution
2032  ** StdDev "standard deviation" of the distribution
2033  ** Operation:
2034  ** This routine counts the number of cluster samples which
2035  ** fall within the various histogram buckets in Buckets. Only
2036  ** one dimension of each sample is examined. The exact meaning
2037  ** of the Mean and StdDev parameters depends on the
2038  ** distribution which is being analyzed (this info is in the
2039  ** Buckets data structure). For normal distributions, Mean
2040  ** and StdDev have the expected meanings. For uniform and
2041  ** random distributions the Mean is the center point of the
2042  ** range and the StdDev is 1/2 the range. A dimension with
2043  ** zero standard deviation cannot be statistically analyzed.
2044  ** In this case, a pseudo-analysis is used.
2045  ** Return:
2046  ** None (the Buckets data structure is filled in)
2047  ** Exceptions:
2048  ** None
2049  ** History:
2050  ** 6/5/89, DSJ, Created.
2051  */
2052  uinT16 BucketID;
2053  int i;
2054  LIST SearchState;
2055  SAMPLE *Sample;
2056 
2057  // initialize the histogram bucket counts to 0
2058  for (i = 0; i < Buckets->NumberOfBuckets; i++)
2059  Buckets->Count[i] = 0;
2060 
2061  if (StdDev == 0.0) {
2062  /* if the standard deviation is zero, then we can't statistically
2063  analyze the cluster. Use a pseudo-analysis: samples exactly on
2064  the mean are distributed evenly across all buckets. Samples greater
2065  than the mean are placed in the last bucket; samples less than the
2066  mean are placed in the first bucket. */
2067 
2068  InitSampleSearch(SearchState, Cluster);
2069  i = 0;
2070  while ((Sample = NextSample (&SearchState)) != NULL) {
2071  if (Sample->Mean[Dim] > Mean)
2072  BucketID = Buckets->NumberOfBuckets - 1;
2073  else if (Sample->Mean[Dim] < Mean)
2074  BucketID = 0;
2075  else
2076  BucketID = i;
2077  Buckets->Count[BucketID] += 1;
2078  i++;
2079  if (i >= Buckets->NumberOfBuckets)
2080  i = 0;
2081  }
2082  }
2083  else {
2084  // search for all samples in the cluster and add to histogram buckets
2085  InitSampleSearch(SearchState, Cluster);
2086  while ((Sample = NextSample (&SearchState)) != NULL) {
2087  switch (Buckets->Distribution) {
2088  case normal:
2089  BucketID = NormalBucket (ParamDesc, Sample->Mean[Dim],
2090  Mean, StdDev);
2091  break;
2092  case D_random:
2093  case uniform:
2094  BucketID = UniformBucket (ParamDesc, Sample->Mean[Dim],
2095  Mean, StdDev);
2096  break;
2097  default:
2098  BucketID = 0;
2099  }
2100  Buckets->Count[Buckets->Bucket[BucketID]] += 1;
2101  }
2102  }
2103 } // FillBuckets
2104 
2105 
//---------------------------------------------------------------------------
2108  FLOAT32 x,
2109  FLOAT32 Mean,
2110  FLOAT32 StdDev) {
2111 /*
2112  ** Parameters:
2113  ** ParamDesc used to identify circular dimensions
2114  ** x value to be normalized
2115  ** Mean mean of normal distribution
2116  ** StdDev standard deviation of normal distribution
2117  ** Operation:
2118  ** This routine determines which bucket x falls into in the
2119  ** discrete normal distribution defined by kNormalMean
2120  ** and kNormalStdDev. x values which exceed the range of
2121  ** the discrete distribution are clipped.
2122  ** Return:
2123  ** Bucket number into which x falls
2124  ** Exceptions:
2125  ** None
2126  ** History:
2127  ** 6/5/89, DSJ, Created.
2128  */
2129  FLOAT32 X;
2130 
2131  // wraparound circular parameters if necessary
2132  if (ParamDesc->Circular) {
2133  if (x - Mean > ParamDesc->HalfRange)
2134  x -= ParamDesc->Range;
2135  else if (x - Mean < -ParamDesc->HalfRange)
2136  x += ParamDesc->Range;
2137  }
2138 
2139  X = ((x - Mean) / StdDev) * kNormalStdDev + kNormalMean;
2140  if (X < 0)
2141  return 0;
2142  if (X > BUCKETTABLESIZE - 1)
2143  return ((uinT16) (BUCKETTABLESIZE - 1));
2144  return (uinT16) floor((FLOAT64) X);
2145 } // NormalBucket
2146 
2147 
2148 //---------------------------------------------------------------------------
2150  FLOAT32 x,
2151  FLOAT32 Mean,
2152  FLOAT32 StdDev) {
2153 /*
2154  ** Parameters:
2155  ** ParamDesc used to identify circular dimensions
2156  ** x value to be normalized
2157  ** Mean center of range of uniform distribution
2158  ** StdDev 1/2 the range of the uniform distribution
2159  ** Operation:
2160  ** This routine determines which bucket x falls into in the
2161  ** discrete uniform distribution defined by
2162  ** BUCKETTABLESIZE. x values which exceed the range of
2163  ** the discrete distribution are clipped.
2164  ** Return:
2165  ** Bucket number into which x falls
2166  ** Exceptions:
2167  ** None
2168  ** History:
2169  ** 6/5/89, DSJ, Created.
2170  */
2171  FLOAT32 X;
2172 
2173  // wraparound circular parameters if necessary
2174  if (ParamDesc->Circular) {
2175  if (x - Mean > ParamDesc->HalfRange)
2176  x -= ParamDesc->Range;
2177  else if (x - Mean < -ParamDesc->HalfRange)
2178  x += ParamDesc->Range;
2179  }
2180 
2181  X = ((x - Mean) / (2 * StdDev) * BUCKETTABLESIZE + BUCKETTABLESIZE / 2.0);
2182  if (X < 0)
2183  return 0;
2184  if (X > BUCKETTABLESIZE - 1)
2185  return (uinT16) (BUCKETTABLESIZE - 1);
2186  return (uinT16) floor((FLOAT64) X);
2187 } // UniformBucket
2188 
2189 
2190 //---------------------------------------------------------------------------
2192 /*
2193  ** Parameters:
2194  ** Buckets histogram data to perform chi-square test on
2195  ** Operation:
2196  ** This routine performs a chi-square goodness of fit test
2197  ** on the histogram data in the Buckets data structure. TRUE
2198  ** is returned if the histogram matches the probability
2199  ** distribution which was specified when the Buckets
2200  ** structure was originally created. Otherwise FALSE is
2201  ** returned.
2202  ** Return:
2203  ** TRUE if samples match distribution, FALSE otherwise
2204  ** Exceptions:
2205  ** None
2206  ** History:
2207  ** 6/5/89, DSJ, Created.
2208  */
2209  FLOAT32 FrequencyDifference;
2210  FLOAT32 TotalDifference;
2211  int i;
2212 
2213  // compute how well the histogram matches the expected histogram
2214  TotalDifference = 0.0;
2215  for (i = 0; i < Buckets->NumberOfBuckets; i++) {
2216  FrequencyDifference = Buckets->Count[i] - Buckets->ExpectedCount[i];
2217  TotalDifference += (FrequencyDifference * FrequencyDifference) /
2218  Buckets->ExpectedCount[i];
2219  }
2220 
2221  // test to see if the difference is more than expected
2222  if (TotalDifference > Buckets->ChiSquared)
2223  return FALSE;
2224  else
2225  return TRUE;
2226 } // DistributionOK
2227 
2228 
2229 //---------------------------------------------------------------------------
2230 void FreeStatistics(STATISTICS *Statistics) {
2231 /*
2232  ** Parameters:
2233  ** Statistics pointer to data structure to be freed
2234  ** Operation:
2235  ** This routine frees the memory used by the statistics
2236  ** data structure.
2237  ** Return:
2238  ** None
2239  ** Exceptions:
2240  ** None
2241  ** History:
2242  ** 6/5/89, DSJ, Created.
2243  */
2244  memfree (Statistics->CoVariance);
2245  memfree (Statistics->Min);
2246  memfree (Statistics->Max);
2247  memfree(Statistics);
2248 } // FreeStatistics
2249 
2250 
2251 //---------------------------------------------------------------------------
2252 void FreeBuckets(BUCKETS *buckets) {
2253 /*
2254  ** Parameters:
2255  ** buckets pointer to data structure to be freed
2256  ** Operation:
2257  ** This routine properly frees the memory used by a BUCKETS.
2258  */
2259  Efree(buckets->Count);
2260  Efree(buckets->ExpectedCount);
2261  Efree(buckets);
2262 } // FreeBuckets
2263 
2264 
2265 //---------------------------------------------------------------------------
2266 void FreeCluster(CLUSTER *Cluster) {
2267 /*
2268  ** Parameters:
2269  ** Cluster pointer to cluster to be freed
2270  ** Operation:
2271  ** This routine frees the memory consumed by the specified
2272  ** cluster and all of its subclusters. This is done by
2273  ** recursive calls to FreeCluster().
2274  ** Return:
2275  ** None
2276  ** Exceptions:
2277  ** None
2278  ** History:
2279  ** 6/6/89, DSJ, Created.
2280  */
2281  if (Cluster != NULL) {
2282  FreeCluster (Cluster->Left);
2283  FreeCluster (Cluster->Right);
2284  memfree(Cluster);
2285  }
2286 } // FreeCluster
2287 
2288 
2289 //---------------------------------------------------------------------------
2290 uinT16 DegreesOfFreedom(DISTRIBUTION Distribution, uinT16 HistogramBuckets) {
2291 /*
2292  ** Parameters:
2293  ** Distribution distribution being tested for
2294  ** HistogramBuckets number of buckets in chi-square test
2295  ** Operation:
2296  ** This routine computes the degrees of freedom that should
2297  ** be used in a chi-squared test with the specified number of
2298  ** histogram buckets. The result is always rounded up to
2299  ** the next even number so that the value of chi-squared can be
2300  ** computed more easily. This will cause the value of
2301  ** chi-squared to be higher than the optimum value, resulting
2302  ** in the chi-square test being more lenient than optimum.
2303  ** Return: The number of degrees of freedom for a chi-square test
2304  ** Exceptions: none
2305  ** History: Thu Aug 3 14:04:18 1989, DSJ, Created.
2306  */
2307  static uinT8 DegreeOffsets[] = { 3, 3, 1 };
2308 
2309  uinT16 AdjustedNumBuckets;
2310 
2311  AdjustedNumBuckets = HistogramBuckets - DegreeOffsets[(int) Distribution];
2312  if (Odd (AdjustedNumBuckets))
2313  AdjustedNumBuckets++;
2314  return (AdjustedNumBuckets);
2315 
2316 } // DegreesOfFreedom
2317 
2318 
2319 //---------------------------------------------------------------------------
2320 int NumBucketsMatch(void *arg1, // BUCKETS *Histogram,
2321  void *arg2) { // uinT16 *DesiredNumberOfBuckets)
2322 /*
2323  ** Parameters:
2324  ** Histogram current histogram being tested for a match
2325  ** DesiredNumberOfBuckets match key
2326  ** Operation:
2327  ** This routine is used to search a list of histogram data
2328  ** structures to find one with the specified number of
2329  ** buckets. It is called by the list search routines.
2330  ** Return: TRUE if Histogram matches DesiredNumberOfBuckets
2331  ** Exceptions: none
2332  ** History: Thu Aug 3 14:17:33 1989, DSJ, Created.
2333  */
2334  BUCKETS *Histogram = (BUCKETS *) arg1;
2335  uinT16 *DesiredNumberOfBuckets = (uinT16 *) arg2;
2336 
2337  return (*DesiredNumberOfBuckets == Histogram->NumberOfBuckets);
2338 
2339 } // NumBucketsMatch
2340 
2341 
2342 //---------------------------------------------------------------------------
int ListEntryMatch(void *arg1, //ListNode
                   void *arg2) { //Key
/*
 ** Parameters: none
 ** Operation:
 ** List-search predicate used by the list delete_d routine:
 ** a node matches when it is the very same pointer as the key.
 ** Return: TRUE if ListNode matches Key
 */
  return arg2 == arg1;
} // ListEntryMatch
2358 
2359 
2360 //---------------------------------------------------------------------------
2361 void AdjustBuckets(BUCKETS *Buckets, uinT32 NewSampleCount) {
2362 /*
2363  ** Parameters:
2364  ** Buckets histogram data structure to adjust
2365  ** NewSampleCount new sample count to adjust to
2366  ** Operation:
2367  ** This routine multiplies each ExpectedCount histogram entry
2368  ** by NewSampleCount/OldSampleCount so that the histogram
2369  ** is now adjusted to the new sample count.
2370  ** Return: none
2371  ** Exceptions: none
2372  ** History: Thu Aug 3 14:31:14 1989, DSJ, Created.
2373  */
2374  int i;
2375  FLOAT64 AdjustFactor;
2376 
2377  AdjustFactor = (((FLOAT64) NewSampleCount) /
2378  ((FLOAT64) Buckets->SampleCount));
2379 
2380  for (i = 0; i < Buckets->NumberOfBuckets; i++) {
2381  Buckets->ExpectedCount[i] *= AdjustFactor;
2382  }
2383 
2384  Buckets->SampleCount = NewSampleCount;
2385 
2386 } // AdjustBuckets
2387 
2388 
2389 //---------------------------------------------------------------------------
2390 void InitBuckets(BUCKETS *Buckets) {
2391 /*
2392  ** Parameters:
2393  ** Buckets histogram data structure to init
2394  ** Operation:
2395  ** This routine sets the bucket counts in the specified histogram
2396  ** to zero.
2397  ** Return: none
2398  ** Exceptions: none
2399  ** History: Thu Aug 3 14:31:14 1989, DSJ, Created.
2400  */
2401  int i;
2402 
2403  for (i = 0; i < Buckets->NumberOfBuckets; i++) {
2404  Buckets->Count[i] = 0;
2405  }
2406 
2407 } // InitBuckets
2408 
2409 
2410 //---------------------------------------------------------------------------
2411 int AlphaMatch(void *arg1, //CHISTRUCT *ChiStruct,
2412  void *arg2) { //CHISTRUCT *SearchKey)
2413 /*
2414  ** Parameters:
2415  ** ChiStruct chi-squared struct being tested for a match
2416  ** SearchKey chi-squared struct that is the search key
2417  ** Operation:
2418  ** This routine is used to search a list of structures which
2419  ** hold pre-computed chi-squared values for a chi-squared
2420  ** value whose corresponding alpha field matches the alpha
2421  ** field of SearchKey.
2422  ** It is called by the list search routines.
2423  ** Return: TRUE if ChiStruct's Alpha matches SearchKey's Alpha
2424  ** Exceptions: none
2425  ** History: Thu Aug 3 14:17:33 1989, DSJ, Created.
2426  */
2427  CHISTRUCT *ChiStruct = (CHISTRUCT *) arg1;
2428  CHISTRUCT *SearchKey = (CHISTRUCT *) arg2;
2429 
2430  return (ChiStruct->Alpha == SearchKey->Alpha);
2431 
2432 } // AlphaMatch
2433 
2434 
2435 //---------------------------------------------------------------------------
2436 CHISTRUCT *NewChiStruct(uinT16 DegreesOfFreedom, FLOAT64 Alpha) {
2437 /*
2438  ** Parameters:
2439  ** DegreesOfFreedom degrees of freedom for new chi value
2440  ** Alpha confidence level for new chi value
2441  ** Operation:
2442  ** This routine allocates a new data structure which is used
2443  ** to hold a chi-squared value along with its associated
2444  ** number of degrees of freedom and alpha value.
2445  ** Return: none
2446  ** Exceptions: none
2447  ** History: Fri Aug 4 11:04:59 1989, DSJ, Created.
2448  */
2450 
2451  NewChiStruct = (CHISTRUCT *) Emalloc (sizeof (CHISTRUCT));
2452  NewChiStruct->DegreesOfFreedom = DegreesOfFreedom;
2453  NewChiStruct->Alpha = Alpha;
2454  return (NewChiStruct);
2455 
2456 } // NewChiStruct
2457 
2458 
2459 //---------------------------------------------------------------------------
2460 FLOAT64
2461 Solve (SOLVEFUNC Function,
2462 void *FunctionParams, FLOAT64 InitialGuess, FLOAT64 Accuracy)
2463 /*
2464  ** Parameters:
2465  ** Function function whose zero is to be found
2466  ** FunctionParams arbitrary data to pass to function
2467  ** InitialGuess point to start solution search at
2468  ** Accuracy maximum allowed error
2469  ** Operation:
2470  ** This routine attempts to find an x value at which Function
2471  ** goes to zero (i.e. a root of the function ). It will only
2472  ** work correctly if a solution actually exists and there
2473  ** are no extrema between the solution and the InitialGuess.
2474  ** The algorithms used are extremely primitive.
2475  ** Return: Solution of function ( x for which f(x) = 0 ).
2476  ** Exceptions: none
2477  ** History: Fri Aug 4 11:08:59 1989, DSJ, Created.
2478  */
2479 #define INITIALDELTA 0.1
2480 #define DELTARATIO 0.1
2481 {
2482  FLOAT64 x;
2483  FLOAT64 f;
2484  FLOAT64 Slope;
2485  FLOAT64 Delta;
2486  FLOAT64 NewDelta;
2487  FLOAT64 xDelta;
2488  FLOAT64 LastPosX, LastNegX;
2489 
2490  x = InitialGuess;
2491  Delta = INITIALDELTA;
2492  LastPosX = MAX_FLOAT32;
2493  LastNegX = -MAX_FLOAT32;
2494  f = (*Function) ((CHISTRUCT *) FunctionParams, x);
2495  while (Abs (LastPosX - LastNegX) > Accuracy) {
2496  // keep track of outer bounds of current estimate
2497  if (f < 0)
2498  LastNegX = x;
2499  else
2500  LastPosX = x;
2501 
2502  // compute the approx. slope of f(x) at the current point
2503  Slope =
2504  ((*Function) ((CHISTRUCT *) FunctionParams, x + Delta) - f) / Delta;
2505 
2506  // compute the next solution guess */
2507  xDelta = f / Slope;
2508  x -= xDelta;
2509 
2510  // reduce the delta used for computing slope to be a fraction of
2511  //the amount moved to get to the new guess
2512  NewDelta = Abs (xDelta) * DELTARATIO;
2513  if (NewDelta < Delta)
2514  Delta = NewDelta;
2515 
2516  // compute the value of the function at the new guess
2517  f = (*Function) ((CHISTRUCT *) FunctionParams, x);
2518  }
2519  return (x);
2520 
2521 } // Solve
2522 
2523 
2524 //---------------------------------------------------------------------------
2526 /*
2527  ** Parameters:
2528  ** ChiParams contains degrees of freedom and alpha
2529  ** x value of chi-squared to evaluate
2530  ** Operation:
2531  ** This routine computes the area under a chi density curve
2532  ** from 0 to x, minus the desired area under the curve. The
2533  ** number of degrees of freedom of the chi curve is specified
2534  ** in the ChiParams structure. The desired area is also
2535  ** specified in the ChiParams structure as Alpha ( or 1 minus
2536  ** the desired area ). This routine is intended to be passed
2537  ** to the Solve() function to find the value of chi-squared
2538  ** which will yield a desired area under the right tail of
2539  ** the chi density curve. The function will only work for
2540  ** even degrees of freedom. The equations are based on
2541  ** integrating the chi density curve in parts to obtain
2542  ** a series that can be used to compute the area under the
2543  ** curve.
2544  ** Return: Error between actual and desired area under the chi curve.
2545  ** Exceptions: none
2546  ** History: Fri Aug 4 12:48:41 1989, DSJ, Created.
2547  */
2548  int i, N;
2549  FLOAT64 SeriesTotal;
2550  FLOAT64 Denominator;
2551  FLOAT64 PowerOfx;
2552 
2553  N = ChiParams->DegreesOfFreedom / 2 - 1;
2554  SeriesTotal = 1;
2555  Denominator = 1;
2556  PowerOfx = 1;
2557  for (i = 1; i <= N; i++) {
2558  Denominator *= 2 * i;
2559  PowerOfx *= x;
2560  SeriesTotal += PowerOfx / Denominator;
2561  }
2562  return ((SeriesTotal * exp (-0.5 * x)) - ChiParams->Alpha);
2563 
2564 } // ChiArea
2565 
2566 
2567 //---------------------------------------------------------------------------
2568 BOOL8
2570 CLUSTER * Cluster, FLOAT32 MaxIllegal)
2571 /*
2572  ** Parameters:
2573  ** Clusterer data structure holding cluster tree
2574  ** Cluster cluster containing samples to be tested
2575  ** MaxIllegal max percentage of samples allowed to have
2576  ** more than 1 feature in the cluster
2577  ** Operation:
2578  ** This routine looks at all samples in the specified cluster.
2579  ** It computes a running estimate of the percentage of the
2580  ** charaters which have more than 1 sample in the cluster.
2581  ** When this percentage exceeds MaxIllegal, TRUE is returned.
2582  ** Otherwise FALSE is returned. The CharID
2583  ** fields must contain integers which identify the training
2584  ** characters which were used to generate the sample. One
2585  ** integer is used for each sample. The NumChar field in
2586  ** the Clusterer must contain the number of characters in the
2587  ** training set. All CharID fields must be between 0 and
2588  ** NumChar-1. The main function of this routine is to help
2589  ** identify clusters which need to be split further, i.e. if
2590  ** numerous training characters have 2 or more features which are
2591  ** contained in the same cluster, then the cluster should be
2592  ** split.
2593  ** Return: TRUE if the cluster should be split, FALSE otherwise.
2594  ** Exceptions: none
2595  ** History: Wed Aug 30 11:13:05 1989, DSJ, Created.
2596  ** 2/22/90, DSJ, Added MaxIllegal control rather than always
2597  ** splitting illegal clusters.
2598  */
2599 #define ILLEGAL_CHAR 2
2600 {
2601  static BOOL8 *CharFlags = NULL;
2602  static inT32 NumFlags = 0;
2603  int i;
2604  LIST SearchState;
2605  SAMPLE *Sample;
2606  inT32 CharID;
2607  inT32 NumCharInCluster;
2608  inT32 NumIllegalInCluster;
2609  FLOAT32 PercentIllegal;
2610 
2611  // initial estimate assumes that no illegal chars exist in the cluster
2612  NumCharInCluster = Cluster->SampleCount;
2613  NumIllegalInCluster = 0;
2614 
2615  if (Clusterer->NumChar > NumFlags) {
2616  if (CharFlags != NULL)
2617  memfree(CharFlags);
2618  NumFlags = Clusterer->NumChar;
2619  CharFlags = (BOOL8 *) Emalloc (NumFlags * sizeof (BOOL8));
2620  }
2621 
2622  for (i = 0; i < NumFlags; i++)
2623  CharFlags[i] = FALSE;
2624 
2625  // find each sample in the cluster and check if we have seen it before
2626  InitSampleSearch(SearchState, Cluster);
2627  while ((Sample = NextSample (&SearchState)) != NULL) {
2628  CharID = Sample->CharID;
2629  if (CharFlags[CharID] == FALSE) {
2630  CharFlags[CharID] = TRUE;
2631  }
2632  else {
2633  if (CharFlags[CharID] == TRUE) {
2634  NumIllegalInCluster++;
2635  CharFlags[CharID] = ILLEGAL_CHAR;
2636  }
2637  NumCharInCluster--;
2638  PercentIllegal = (FLOAT32) NumIllegalInCluster / NumCharInCluster;
2639  if (PercentIllegal > MaxIllegal) {
2640  destroy(SearchState);
2641  return (TRUE);
2642  }
2643  }
2644  }
2645  return (FALSE);
2646 
2647 } // MultipleCharSamples
2648 
2649 // Compute the inverse of a matrix using LU decomposition with partial pivoting.
2650 // The return value is the sum of norms of the off-diagonal terms of the
2651 // product of a and inv. (A measure of the error.)
2652 double InvertMatrix(const float* input, int size, float* inv) {
2653  // Allocate memory for the 2D arrays.
2654  GENERIC_2D_ARRAY<double> U(size, size, 0.0);
2655  GENERIC_2D_ARRAY<double> U_inv(size, size, 0.0);
2656  GENERIC_2D_ARRAY<double> L(size, size, 0.0);
2657 
2658  // Initialize the working matrices. U starts as input, L as I and U_inv as O.
2659  int row;
2660  int col;
2661  for (row = 0; row < size; row++) {
2662  for (col = 0; col < size; col++) {
2663  U[row][col] = input[row*size + col];
2664  L[row][col] = row == col ? 1.0 : 0.0;
2665  U_inv[row][col] = 0.0;
2666  }
2667  }
2668 
2669  // Compute forward matrix by inversion by LU decomposition of input.
2670  for (col = 0; col < size; ++col) {
2671  // Find best pivot
2672  int best_row = 0;
2673  double best_pivot = -1.0;
2674  for (row = col; row < size; ++row) {
2675  if (Abs(U[row][col]) > best_pivot) {
2676  best_pivot = Abs(U[row][col]);
2677  best_row = row;
2678  }
2679  }
2680  // Exchange pivot rows.
2681  if (best_row != col) {
2682  for (int k = 0; k < size; ++k) {
2683  double tmp = U[best_row][k];
2684  U[best_row][k] = U[col][k];
2685  U[col][k] = tmp;
2686  tmp = L[best_row][k];
2687  L[best_row][k] = L[col][k];
2688  L[col][k] = tmp;
2689  }
2690  }
2691  // Now do the pivot itself.
2692  for (row = col + 1; row < size; ++row) {
2693  double ratio = -U[row][col] / U[col][col];
2694  for (int j = col; j < size; ++j) {
2695  U[row][j] += U[col][j] * ratio;
2696  }
2697  for (int k = 0; k < size; ++k) {
2698  L[row][k] += L[col][k] * ratio;
2699  }
2700  }
2701  }
2702  // Next invert U.
2703  for (col = 0; col < size; ++col) {
2704  U_inv[col][col] = 1.0 / U[col][col];
2705  for (row = col - 1; row >= 0; --row) {
2706  double total = 0.0;
2707  for (int k = col; k > row; --k) {
2708  total += U[row][k] * U_inv[k][col];
2709  }
2710  U_inv[row][col] = -total / U[row][row];
2711  }
2712  }
2713  // Now the answer is U_inv.L.
2714  for (row = 0; row < size; row++) {
2715  for (col = 0; col < size; col++) {
2716  double sum = 0.0;
2717  for (int k = row; k < size; ++k) {
2718  sum += U_inv[row][k] * L[k][col];
2719  }
2720  inv[row*size + col] = sum;
2721  }
2722  }
2723  // Check matrix product.
2724  double error_sum = 0.0;
2725  for (row = 0; row < size; row++) {
2726  for (col = 0; col < size; col++) {
2727  double sum = 0.0;
2728  for (int k = 0; k < size; ++k) {
2729  sum += input[row*size + k] * inv[k *size + col];
2730  }
2731  if (row != col) {
2732  error_sum += Abs(sum);
2733  }
2734  }
2735  }
2736  return error_sum;
2737 }
#define MINVARIANCE
Definition: cluster.cpp:142
#define Odd(N)
Definition: cluster.cpp:206
#define PI
Definition: const.h:19
FLOAT32 * Max
Definition: cluster.cpp:175
bool Pop(Pair *entry)
Definition: genericheap.h:116
FLOAT32 * Min
Definition: cluster.cpp:174
FLOAT32 Min
Definition: ocrfeatures.h:49
uinT16 UniformBucket(PARAM_DESC *ParamDesc, FLOAT32 x, FLOAT32 Mean, FLOAT32 StdDev)
Definition: cluster.cpp:2149
PROTOTYPE * MakeEllipticalProto(CLUSTERER *Clusterer, CLUSTER *Cluster, STATISTICS *Statistics, BUCKETS *Buckets)
Definition: cluster.cpp:1258
#define INITIALDELTA
#define FTABLE_Y
Definition: cluster.cpp:32
struct sample * Left
Definition: cluster.h:36
#define first_node(l)
Definition: oldlist.h:139
FLOAT32 AvgVariance
Definition: cluster.cpp:172
CHISTRUCT * NewChiStruct(uinT16 DegreesOfFreedom, FLOAT64 Alpha)
Definition: cluster.cpp:2436
float FLOAT32
Definition: host.h:111
#define ILLEGAL_CHAR
Definition: kdtree.h:49
LIST destroy(LIST list)
Definition: oldlist.cpp:187
PROTOTYPE * NewMixedProto(inT16 N, CLUSTER *Cluster, STATISTICS *Statistics)
Definition: cluster.cpp:1593
void FreeBuckets(BUCKETS *Buckets)
Definition: cluster.cpp:2252
BOOL8 Independent(PARAM_DESC ParamDesc[], inT16 N, FLOAT32 *CoVariance, FLOAT32 Independence)
Definition: cluster.cpp:1660
unsigned Clustered
Definition: cluster.h:33
#define MINBUCKETS
Definition: cluster.h:26
void FreeCluster(CLUSTER *Cluster)
Definition: cluster.cpp:2266
void MakeDimRandom(uinT16 i, PROTOTYPE *Proto, PARAM_DESC *ParamDesc)
Definition: cluster.cpp:1364
DISTRIBUTION * Distrib
Definition: cluster.h:77
PROTOTYPE * TestEllipticalProto(CLUSTERER *Clusterer, CLUSTERCONFIG *Config, CLUSTER *Cluster, STATISTICS *Statistics)
Definition: cluster.cpp:1113
BOOL8 MultipleCharSamples(CLUSTERER *Clusterer, CLUSTER *Cluster, FLOAT32 MaxIllegal)
Definition: cluster.cpp:2569
#define FTABLE_X
Definition: cluster.cpp:31
void AdjustBuckets(BUCKETS *Buckets, uinT32 NewSampleCount)
Definition: cluster.cpp:2361
unsigned SampleCount
Definition: cluster.h:35
#define tprintf(...)
Definition: tprintf.h:31
#define BUCKETTABLESIZE
Definition: cluster.cpp:160
#define LOOKUPTABLESIZE
Definition: cluster.cpp:228
uinT16 Bucket[BUCKETTABLESIZE]
Definition: cluster.cpp:184
BUCKETS * bucket_cache[DISTRIBUTION_COUNT][MAXBUCKETS+1-MINBUCKETS]
Definition: cluster.h:95
double InvertMatrix(const float *input, int size, float *inv)
Definition: cluster.cpp:2652
struct sample * Right
Definition: cluster.h:37
void KDNearestNeighborSearch(KDTREE *Tree, FLOAT32 Query[], int QuerySize, FLOAT32 MaxDistance, int *NumberOfResults, void **NBuffer, FLOAT32 DBuffer[])
Definition: kdtree.cpp:307
CLUSTERER * MakeClusterer(inT16 SampleSize, const PARAM_DESC ParamDesc[])
Definition: cluster.cpp:399
unsigned char BOOL8
Definition: host.h:113
FLOAT64 ComputeChiSquared(uinT16 DegreesOfFreedom, FLOAT64 Alpha)
Definition: cluster.cpp:1884
#define SqrtOf2Pi
Definition: cluster.cpp:218
SAMPLE * MakeSample(CLUSTERER *Clusterer, const FLOAT32 *Feature, inT32 CharID)
Definition: cluster.cpp:454
#define Abs(N)
Definition: cluster.cpp:208
FLOAT32 Spherical
Definition: cluster.h:63
PROTOTYPE * MakeDegenerateProto(uinT16 N, CLUSTER *Cluster, STATISTICS *Statistics, PROTOSTYLE Style, inT32 MinSamples)
Definition: cluster.cpp:1071
uinT16 OptimumNumberOfBuckets(uinT32 SampleCount)
Definition: cluster.cpp:1843
DISTRIBUTION
Definition: cluster.h:58
CLUSTERCONFIG Config
FLOAT32 LogMagnitude
Definition: cluster.h:80
FLOATUNION Variance
Definition: cluster.h:81
FLOAT32 * Mean
Definition: cluster.h:78
#define MAXNEIGHBORS
Definition: cluster.h:59
KDTREE * KDTree
Definition: cluster.h:90
FLOAT32 HalfRange
Definition: ocrfeatures.h:52
#define NORMALEXTENT
Definition: cluster.cpp:161
FLOAT64 Confidence
Definition: cluster.cpp:181
T ClipToRange(const T &x, const T &lower_bound, const T &upper_bound)
Definition: helpers.h:115
#define ASSERT_HOST(x)
Definition: errcode.h:84
void FreeKDTree(KDTREE *Tree)
Definition: kdtree.cpp:346
PROTOTYPE * NewEllipticalProto(inT16 N, CLUSTER *Cluster, STATISTICS *Statistics)
Definition: cluster.cpp:1548
inT32 NumberOfSamples
Definition: cluster.h:89
void DoError(int Error, const char *Message)
Definition: danerror.cpp:32
unsigned Significant
Definition: cluster.h:68
FLOATUNION Weight
Definition: cluster.h:83
#define MINSAMPLESNEEDED
Definition: cluster.cpp:152
FLOAT32 Independence
Definition: cluster.h:53
#define MAXDISTANCE
FLOAT32 MaxIllegal
Definition: cluster.h:51
FLOAT32 TotalMagnitude
Definition: cluster.h:79
int ListEntryMatch(void *arg1, void *arg2)
Definition: cluster.cpp:2343
int MagicSamples
Definition: cluster.h:55
#define MINSAMPLES
Definition: cluster.cpp:151
FLOAT64 ChiSquared
Definition: cluster.cpp:192
FLOAT64 UniformDensity(inT32 x)
Definition: cluster.cpp:1973
unsigned int uinT32
Definition: host.h:103
void MakePotentialClusters(ClusteringContext *context, CLUSTER *Cluster, inT32 Level)
Definition: cluster.cpp:768
BUCKETS * MakeBuckets(DISTRIBUTION Distribution, uinT32 SampleCount, FLOAT64 Confidence)
Definition: cluster.cpp:1759
int AlphaMatch(void *arg1, void *arg2)
Definition: cluster.cpp:2411
CLUSTER * Neighbor
Definition: cluster.cpp:165
#define HOTELLING
Definition: cluster.cpp:30
LIST ClusterSamples(CLUSTERER *Clusterer, CLUSTERCONFIG *Config)
Definition: cluster.cpp:508
FLOAT32 MidRange
Definition: ocrfeatures.h:53
unsigned NumSamples
Definition: cluster.h:75
LIST search(LIST list, void *key, int_compare is_equal)
Definition: oldlist.cpp:413
void FreeProtoList(LIST *ProtoList)
Definition: cluster.cpp:564
void destroy_nodes(LIST list, void_dest destructor)
Definition: oldlist.cpp:204
FLOAT32 Mean[1]
Definition: cluster.h:39
FLOAT64 ChiArea(CHISTRUCT *ChiParams, FLOAT64 x)
Definition: cluster.cpp:2525
CLUSTER * Root
Definition: cluster.h:91
FLOAT64 Solve(SOLVEFUNC Function, void *FunctionParams, FLOAT64 InitialGuess, FLOAT64 Accuracy)
Definition: cluster.cpp:2461
#define CHIACCURACY
uinT32 SampleCount
Definition: cluster.cpp:180
#define RootOf(T)
Definition: kdtree.h:58
#define Mirror(N, R)
Definition: cluster.cpp:207
uinT16 NumberOfBuckets
Definition: cluster.cpp:183
void InitBuckets(BUCKETS *Buckets)
Definition: cluster.cpp:2390
FLOAT64 Confidence
Definition: cluster.h:54
ClusterHeap * heap
Definition: cluster.cpp:197
void Push(Pair *entry)
Definition: genericheap.h:95
#define ALREADYCLUSTERED
Definition: cluster.h:133
FLOATUNION Magnitude
Definition: cluster.h:82
FLOAT32 * Elliptical
Definition: cluster.h:64
CLUSTER * Cluster
Definition: cluster.h:76
CLUSTER * FindNearestNeighbor(KDTREE *Tree, CLUSTER *Cluster, FLOAT32 *Distance)
Definition: cluster.cpp:802
FLOAT32 MinSamples
Definition: cluster.h:50
const double FTable[FTABLE_Y][FTABLE_X]
Definition: cluster.cpp:35
int NumBucketsMatch(void *arg1, void *arg2)
Definition: cluster.cpp:2320
PROTOSTYLE ProtoStyle
Definition: cluster.h:49
PROTOTYPE * MakePrototype(CLUSTERER *Clusterer, CLUSTERCONFIG *Config, CLUSTER *Cluster)
Definition: cluster.cpp:978
inT8 NonEssential
Definition: ocrfeatures.h:48
BOOL8 DistributionOK(BUCKETS *Buckets)
Definition: cluster.cpp:2191
inT32 NumChar
Definition: cluster.h:93
inT8 Circular
Definition: ocrfeatures.h:47
uinT16 NormalBucket(PARAM_DESC *ParamDesc, FLOAT32 x, FLOAT32 Mean, FLOAT32 StdDev)
Definition: cluster.cpp:2107
void FillBuckets(BUCKETS *Buckets, CLUSTER *Cluster, uinT16 Dim, PARAM_DESC *ParamDesc, FLOAT32 Mean, FLOAT32 StdDev)
Definition: cluster.cpp:2019
LIST ProtoList
Definition: cluster.h:92
void KDStore(KDTREE *Tree, FLOAT32 *Key, void *Data)
Definition: kdtree.cpp:209
void ComputePrototypes(CLUSTERER *Clusterer, CLUSTERCONFIG *Config)
Definition: cluster.cpp:931
void * Emalloc(int Size)
Definition: emalloc.cpp:35
Definition: cluster.h:45
#define NIL_LIST
Definition: oldlist.h:126
unsigned Prototype
Definition: cluster.h:34
CLUSTER * MakeNewCluster(CLUSTERER *Clusterer, TEMPCLUSTER *TempCluster)
Definition: cluster.cpp:841
FLOAT32 Mean(PROTOTYPE *Proto, uinT16 Dimension)
Definition: cluster.cpp:643
#define FALSE
Definition: capi.h:29
PARAM_DESC * ParamDesc
Definition: cluster.h:88
FLOAT32 Range
Definition: ocrfeatures.h:51
inT32 MergeClusters(inT16 N, register PARAM_DESC ParamDesc[], register inT32 n1, register inT32 n2, register FLOAT32 m[], register FLOAT32 m1[], register FLOAT32 m2[])
void(* void_proc)(...)
Definition: cutil.h:66
STATISTICS * ComputeStatistics(inT16 N, PARAM_DESC ParamDesc[], CLUSTER *Cluster)
Definition: cluster.cpp:1426
Definition: cluster.h:32
PROTOTYPE * MakeSphericalProto(CLUSTERER *Clusterer, CLUSTER *Cluster, STATISTICS *Statistics, BUCKETS *Buckets)
Definition: cluster.cpp:1220
CLUSTER * NextSample(LIST *SearchState)
Definition: cluster.cpp:618
uinT16 DegreesOfFreedom(DISTRIBUTION Distribution, uinT16 HistogramBuckets)
Definition: cluster.cpp:2290
void Efree(void *ptr)
Definition: emalloc.cpp:85
#define InitSampleSearch(S, C)
Definition: cluster.h:105
#define TRUE
Definition: capi.h:28
PROTOTYPE * NewSphericalProto(uinT16 N, CLUSTER *Cluster, STATISTICS *Statistics)
Definition: cluster.cpp:1514
unsigned Style
Definition: cluster.h:74
#define MAXBUCKETS
Definition: cluster.h:27
tesseract::KDPairInc< float, TEMPCLUSTER * > ClusterPair
Definition: cluster.cpp:168
void FreeClusterer(CLUSTERER *Clusterer)
Definition: cluster.cpp:536
#define MAX_FLOAT32
Definition: host.h:124
LIST push(LIST list, void *element)
Definition: oldlist.cpp:323
uinT32 * Count
Definition: cluster.cpp:185
void KDWalk(KDTREE *Tree, void_proc action, void *context)
Definition: kdtree.cpp:339
FLOAT64(* SOLVEFUNC)(CHISTRUCT *, double)
Definition: cluster.cpp:204
void CreateClusterTree(CLUSTERER *Clusterer)
Definition: cluster.cpp:698
CLUSTER * Cluster
Definition: cluster.cpp:164
#define DELTARATIO
inT32 CharID
Definition: cluster.h:38
void memfree(void *element)
Definition: freelist.cpp:30
void FreePrototype(void *arg)
Definition: cluster.cpp:579
BUCKETS * GetBuckets(CLUSTERER *clusterer, DISTRIBUTION Distribution, uinT32 SampleCount, FLOAT64 Confidence)
Definition: cluster.cpp:1709
FLOAT64 NormalDensity(inT32 x)
Definition: cluster.cpp:1944
#define NULL
Definition: host.h:144
FLOAT64(* DENSITYFUNC)(inT32)
Definition: cluster.cpp:203
void KDDelete(KDTREE *Tree, FLOAT32 Key[], void *Data)
Definition: kdtree.cpp:269
PROTOTYPE * MakeMixedProto(CLUSTERER *Clusterer, CLUSTER *Cluster, STATISTICS *Statistics, BUCKETS *NormalBuckets, FLOAT64 Confidence)
Definition: cluster.cpp:1302
DISTRIBUTION Distribution
Definition: cluster.cpp:179
#define MAXDEGREESOFFREEDOM
Definition: cluster.cpp:229
LIST pop(LIST list)
Definition: oldlist.cpp:305
unsigned Merged
Definition: cluster.h:69
uinT16 DegreesOfFreedom
Definition: cluster.cpp:190
void MakeDimUniform(uinT16 i, PROTOTYPE *Proto, STATISTICS *Statistics)
Definition: cluster.cpp:1389
KDTREE * MakeKDTree(inT16 KeySize, const PARAM_DESC KeyDesc[])
Definition: kdtree.cpp:184
TEMPCLUSTER * candidates
Definition: cluster.cpp:198
FLOAT32 StandardDeviation(PROTOTYPE *Proto, uinT16 Dimension)
Definition: cluster.cpp:657
inT16 SampleSize
Definition: cluster.h:87
void FreeStatistics(STATISTICS *Statistics)
Definition: cluster.cpp:2230
PROTOSTYLE
Definition: cluster.h:44
FLOAT64 Alpha
Definition: cluster.cpp:191
FLOAT32 * CoVariance
Definition: cluster.cpp:173
PROTOTYPE * NewSimpleProto(inT16 N, CLUSTER *Cluster)
Definition: cluster.cpp:1618
FLOAT64 Integral(FLOAT64 f1, FLOAT64 f2, FLOAT64 Dx)
Definition: cluster.cpp:1998
FLOAT32 Max
Definition: ocrfeatures.h:50
FLOAT32 * ExpectedCount
Definition: cluster.cpp:186
unsigned short uinT16
Definition: host.h:101
double FLOAT64
Definition: host.h:112
tesseract::GenericHeap< ClusterPair > ClusterHeap
Definition: cluster.cpp:169
FLOAT64 ChiSquared
Definition: cluster.cpp:182
#define MINALPHA
short inT16
Definition: host.h:100
int inT32
Definition: host.h:102
unsigned char uinT8
Definition: host.h:99