提交 6b5e0c40 编写于 作者: T theraysmith

Made some major classifier and clustering improvements

git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@130 d0cd1f9f-072b-0410-8dd7-cf729c803f20
上级 166c867d
......@@ -62,7 +62,7 @@ float compare_tess_blobs(TBLOB *blob1,
SetBaseLineMatch();
IntegerMatcher (ClassForClassId (ad_templates->Templates, CMP_CLASS),
AllProtosOn, AllConfigsOn, fcount, fcount,
int_features, 0, 0, &int_result, testedit_match_debug);
int_features, 0, &int_result, testedit_match_debug);
FreeFeatureSet(float_features);
if (int_result.Rating < 0)
int_result.Rating = MAX_FLOAT32;
......
此差异已折叠。
......@@ -19,6 +19,7 @@
#include "const.h"
#include "cluster.h"
#include "emalloc.h"
#include "tprintf.h"
#include "danerror.h"
#include "freelist.h"
#include <math.h>
......@@ -281,6 +282,7 @@ PROTOTYPE *MakeDegenerateProto(UINT16 N,
INT32 MinSamples);
PROTOTYPE *TestEllipticalProto(CLUSTERER *Clusterer,
CLUSTERCONFIG *Config,
CLUSTER *Cluster,
STATISTICS *Statistics);
......@@ -1037,7 +1039,7 @@ PROTOTYPE *MakePrototype(CLUSTERER *Clusterer,
}
if (HOTELLING && Config->ProtoStyle == elliptical) {
Proto = TestEllipticalProto(Clusterer, Cluster, Statistics);
Proto = TestEllipticalProto(Clusterer, Config, Cluster, Statistics);
if (Proto != NULL) {
FreeStatistics(Statistics);
return Proto;
......@@ -1129,6 +1131,7 @@ PROTOTYPE *MakeDegenerateProto( //this was MinSample
/** TestEllipticalProto ****************************************************
Parameters: Clusterer data struct containing samples being clustered
Config provides the magic number of samples that make a good cluster
Cluster cluster to be made into an elliptical prototype
Statistics statistical info about cluster
Globals: None
......@@ -1141,24 +1144,60 @@ Operation: This routine tests the specified cluster to see if **
Return: Pointer to new elliptical prototype or NULL.
****************************************************************************/
PROTOTYPE *TestEllipticalProto(CLUSTERER *Clusterer,
CLUSTERCONFIG *Config,
CLUSTER *Cluster,
STATISTICS *Statistics) {
// Fraction of the number of samples used as a range around 1 within
// which a cluster has the magic size that allows a boost to the
// FTable by kFTableBoostMargin, thus allowing clusters near the
// magic size (equal to the number of sample characters) to be more
// likely to stay together.
const double kMagicSampleMargin = 0.0625;
const double kFTableBoostMargin = 2.0;
int N = Clusterer->SampleSize;
CLUSTER* Left = Cluster->Left;
CLUSTER* Right = Cluster->Right;
if (Left == NULL || Right == NULL)
return NULL;
int TotalDims = Left->SampleCount + Right->SampleCount;
if (TotalDims < N + 1)
if (TotalDims < N + 1 || TotalDims < 2)
return NULL;
FLOAT32* Inverse = (FLOAT32 *) Emalloc(N * N * sizeof(FLOAT32));
FLOAT32* Delta = (FLOAT32*) Emalloc(N * sizeof(FLOAT32));
double err = InvertMatrix(Statistics->CoVariance, N, Inverse);
const int kMatrixSize = N * N * sizeof(FLOAT32);
FLOAT32* Covariance = reinterpret_cast<FLOAT32 *>(Emalloc(kMatrixSize));
FLOAT32* Inverse = reinterpret_cast<FLOAT32 *>(Emalloc(kMatrixSize));
FLOAT32* Delta = reinterpret_cast<FLOAT32*>(Emalloc(N * sizeof(FLOAT32)));
// Compute a new covariance matrix that only uses essential features.
for (int i = 0; i < N; ++i) {
int row_offset = i * N;
if (!Clusterer->ParamDesc[i].NonEssential) {
for (int j = 0; j < N; ++j) {
if (!Clusterer->ParamDesc[j].NonEssential)
Covariance[j + row_offset] = Statistics->CoVariance[j + row_offset];
else
Covariance[j + row_offset] = 0.0f;
}
} else {
for (int j = 0; j < N; ++j) {
if (i == j)
Covariance[j + row_offset] = 1.0f;
else
Covariance[j + row_offset] = 0.0f;
}
}
}
double err = InvertMatrix(Covariance, N, Inverse);
if (err > 1) {
cprintf("Clustering error: Matrix inverse failed with error %g\n", err);
tprintf("Clustering error: Matrix inverse failed with error %g\n", err);
}
int EssentialN = 0;
for (int dim = 0; dim < N; ++dim) {
Delta[dim] = Left->Mean[dim] - Right->Mean[dim];
if (!Clusterer->ParamDesc[dim].NonEssential) {
Delta[dim] = Left->Mean[dim] - Right->Mean[dim];
++EssentialN;
} else {
Delta[dim] = 0.0f;
}
}
// Compute Hotelling's T-squared.
double Tsq = 0.0;
......@@ -1169,19 +1208,30 @@ PROTOTYPE *TestEllipticalProto(CLUSTERER *Clusterer,
}
Tsq += Delta[x] * temp;
}
memfree(Covariance);
memfree(Inverse);
memfree(Delta);
Tsq *= Left->SampleCount * Right->SampleCount / TotalDims;
double F = Tsq * (TotalDims - N - 1) / ((TotalDims - N) * 2);
int Fx = N;
// Changed this function to match the formula in
// Statistical Methods in Medical Research p 473
// By Peter Armitage, Geoffrey Berry, J. N. S. Matthews.
// Tsq *= Left->SampleCount * Right->SampleCount / TotalDims;
double F = Tsq * (TotalDims - EssentialN - 1) / ((TotalDims - 2)*EssentialN);
int Fx = EssentialN;
if (Fx > FTABLE_X)
Fx = FTABLE_X;
--Fx;
int Fy = TotalDims - N - 1;
int Fy = TotalDims - EssentialN - 1;
if (Fy > FTABLE_Y)
Fy = FTABLE_Y;
--Fy;
if (F < FTable[Fy][Fx]) {
double FTarget = FTable[Fy][Fx];
if (Config->MagicSamples > 0 &&
TotalDims >= Config->MagicSamples * (1.0 - kMagicSampleMargin) &&
TotalDims <= Config->MagicSamples * (1.0 + kMagicSampleMargin)) {
// Give magic-sized clusters a magic FTable boost.
FTarget += kFTableBoostMargin;
}
if (F < FTarget) {
return NewEllipticalProto (Clusterer->SampleSize, Cluster, Statistics);
}
return NULL;
......
......@@ -55,6 +55,7 @@ typedef struct // parameters to control clustering
// more than 1 feature in that cluster
FLOAT32 Independence; // desired independence between dimensions
FLOAT64 Confidence; // desired confidence in prototypes created
int MagicSamples; // Ideal number of samples in a cluster.
}
......@@ -80,8 +81,13 @@ FLOATUNION;
typedef struct proto
{
unsigned Significant:1; // TRUE if prototype is significant
unsigned Merged:1; // Merged after clustering so do not output
// but kept for display purposes. If it has no
// samples then it was actually merged.
// Otherwise it matched an already significant
// cluster.
unsigned Style:2; // spherical, elliptical, or mixed
unsigned NumSamples:29; // number of samples in the cluster
unsigned NumSamples:28; // number of samples in the cluster
CLUSTER *Cluster; // ptr to cluster which made prototype
DISTRIBUTION *Distrib; // different distribution for each dimension
FLOAT32 *Mean; // prototype mean
......@@ -129,19 +135,22 @@ CLUSTERER *MakeClusterer (INT16 SampleSize, PARAM_DESC ParamDesc[]);
SAMPLE *MakeSample (CLUSTERER * Clusterer, FLOAT32 Feature[], INT32 CharID);
LIST ClusterSamples(CLUSTERER *Clusterer, CLUSTERCONFIG *Config);
LIST ClusterSamples(CLUSTERER *Clusterer, CLUSTERCONFIG *Config);
void FreeClusterer(CLUSTERER *Clusterer);
void FreeClusterer(CLUSTERER *Clusterer);
void FreeProtoList(LIST *ProtoList);
void FreeProtoList(LIST *ProtoList);
void FreePrototype(void *arg); //PROTOTYPE *Prototype);
CLUSTER *NextSample(LIST *SearchState);
CLUSTER *NextSample(LIST *SearchState);
FLOAT32 Mean(PROTOTYPE *Proto, UINT16 Dimension);
FLOAT32 Mean(PROTOTYPE *Proto, UINT16 Dimension);
FLOAT32 StandardDeviation(PROTOTYPE *Proto, UINT16 Dimension);
FLOAT32 StandardDeviation(PROTOTYPE *Proto, UINT16 Dimension);
INT32 MergeClusters(INT16 N, PARAM_DESC ParamDesc[], INT32 n1, INT32 n2,
FLOAT32 m[], FLOAT32 m1[], FLOAT32 m2[]);
//--------------Global Data Definitions and Declarations---------------------------
// define errors that can be trapped
......
......@@ -41,7 +41,7 @@
StartParamDesc (MicroFeatureParams)
DefineParam (0, 0, -0.5, 0.5)
DefineParam (0, 0, -0.25, 0.75)
DefineParam (0, 0, 0.0, 1.0)
DefineParam (0, 1, 0.0, 1.0)
DefineParam (1, 0, 0.0, 1.0)
DefineParam (0, 1, -0.5, 0.5)
DefineParam (0, 1, -0.5, 0.5)
......@@ -65,9 +65,9 @@ DefineFeature (PicoFeatDesc, 2, 1, 1, MAX_UINT8, "Pico", "pf", PicoFeatParams)
/* define all of the parameters for the NormFeat type*/
StartParamDesc (CharNormParams)
DefineParam (0, 0, -0.25, 0.75)
DefineParam (0, 0, 0.0, 1.0)
DefineParam (0, 0, 0.0, 1.0)
DefineParam (0, 0, 0.0, 1.0)
DefineParam (0, 1, 0.0, 1.0)
DefineParam (0, 1, 0.0, 1.0)
DefineParam (0, 1, 0.0, 1.0)
EndParamDesc
/* now define the feature type itself (see features.h for info about each
parameter).*/
......
此差异已折叠。
......@@ -30,6 +30,7 @@ typedef struct
FLOAT32 Rating;
UINT8 Config;
UINT8 Config2;
UINT16 FeatureMisses;
}
......@@ -38,8 +39,7 @@ INT_RESULT_STRUCT, *INT_RESULT;
typedef struct
{
FLOAT32 Rating;
FLOAT32 Rating2;
UINT32 config_mask;
INT_RESULT_STRUCT IMResult;
CLASS_ID Class;
}
......@@ -68,42 +68,12 @@ int ClassPruner(INT_TEMPLATES IntTemplates,
CLASS_PRUNER_RESULTS Results,
int Debug);
int feature_pruner(INT_TEMPLATES IntTemplates,
INT16 NumFeatures,
INT_FEATURE_ARRAY Features,
INT32 NumClasses,
CLASS_PRUNER_RESULTS Results);
int prune_configs(INT_TEMPLATES IntTemplates,
INT32 min_misses,
INT16 NumFeatures,
INT_FEATURE_ARRAY Features,
CLASS_NORMALIZATION_ARRAY NormalizationFactors,
INT32 class_count,
UINT16 BlobLength,
CLASS_PRUNER_RESULTS Results,
int Debug);
void PruningMatcher(INT_CLASS ClassTemplate,
UINT16 BlobLength,
INT16 NumFeatures,
INT_FEATURE_ARRAY Features,
INT32 min_misses,
UINT8 NormalizationFactor,
INT_RESULT Result,
int Debug);
void config_mask_to_proto_mask(INT_CLASS ClassTemplate,
BIT_VECTOR config_mask,
BIT_VECTOR proto_mask);
void IntegerMatcher(INT_CLASS ClassTemplate,
BIT_VECTOR ProtoMask,
BIT_VECTOR ConfigMask,
UINT16 BlobLength,
INT16 NumFeatures,
INT_FEATURE_ARRAY Features,
INT32 min_misses,
UINT8 NormalizationFactor,
INT_RESULT Result,
int Debug);
......@@ -126,19 +96,19 @@ int FindBadFeatures(INT_CLASS ClassTemplate,
FEATURE_ID *FeatureArray,
int Debug);
void InitIntegerMatcher();
void InitIntegerMatcher();
void InitIntegerMatcherVars();
void InitIntegerMatcherVars();
void PrintIntMatcherStats(FILE *f);
void PrintIntMatcherStats(FILE *f);
void SetProtoThresh(FLOAT32 Threshold);
void SetProtoThresh(FLOAT32 Threshold);
void SetFeatureThresh(FLOAT32 Threshold);
void SetFeatureThresh(FLOAT32 Threshold);
void SetBaseLineMatch();
void SetBaseLineMatch();
void SetCharNormMatch();
void SetCharNormMatch();
/**----------------------------------------------------------------------------
Private Function Prototypes
......@@ -160,14 +130,7 @@ void IMDebugConfigurationSum(INT_FEATURE FeatureNum,
UINT8 *FeatureEvidence,
INT32 ConfigCount);
void PMUpdateTablesForFeature (INT_CLASS ClassTemplate,
int FeatureNum,
INT_FEATURE Feature,
UINT8 FeatureEvidence[MAX_NUM_CONFIGS],
int SumOfFeatureEvidence[MAX_NUM_CONFIGS],
int Debug);
void IMUpdateTablesForFeature (INT_CLASS ClassTemplate,
int IMUpdateTablesForFeature (INT_CLASS ClassTemplate,
BIT_VECTOR ProtoMask,
BIT_VECTOR ConfigMask,
int FeatureNum,
......@@ -209,10 +172,6 @@ UINT8
ProtoEvidence[MAX_NUM_PROTOS]
[MAX_PROTO_INDEX], INT16 NumFeatures);
void PMNormalizeSumOfEvidences (INT_CLASS ClassTemplate,
int SumOfFeatureEvidence[MAX_NUM_CONFIGS],
INT16 NumFeatures, INT32 used_features);
void IMNormalizeSumOfEvidences (INT_CLASS ClassTemplate,
int SumOfFeatureEvidence[MAX_NUM_CONFIGS],
INT16 NumFeatures, INT32 used_features);
......@@ -229,7 +188,7 @@ void IMDebugBestMatch(int BestMatch,
UINT8 NormalizationFactor);
#endif
void HeapSort (int n, register INT16 ra[], register UINT8 rb[]);
void HeapSort (int n, register int ra[], register int rb[]);
/**----------------------------------------------------------------------------
Global Data Definitions and Declarations
......
......@@ -61,6 +61,26 @@ static jmp_buf QuickExit;
static void_proc WalkAction;
// Helper function to find the next essential dimension in a cycle.
static int NextLevel(int level) {
do {
++level;
if (level >= N)
level = 0;
} while (KeyDesc[level].NonEssential);
return level;
}
// Helper function to find the previous essential dimension in a cycle.
static int PrevLevel(int level) {
do {
--level;
if (level < 0)
level = N - 1;
} while (KeyDesc[level].NonEssential);
return level;
}
/**----------------------------------------------------------------------------
Public Code
----------------------------------------------------------------------------**/
......@@ -136,7 +156,7 @@ MakeKDTree (INT16 KeySize, PARAM_DESC KeyDesc[]) {
/*---------------------------------------------------------------------------*/
void KDStore(KDTREE *Tree, FLOAT32 *Key, void *Data) {
void KDStore(KDTREE *Tree, FLOAT32 *Key, void *Data) {
/*
** Parameters:
** Tree K-D tree in which data is to be stored
......@@ -164,7 +184,7 @@ void KDStore(KDTREE *Tree, FLOAT32 *Key, void *Data) {
KeyDesc = &(Tree->KeyDesc[0]);
PtrToNode = &(Tree->Root.Left);
Node = *PtrToNode;
Level = 0;
Level = NextLevel(-1);
while (Node != NULL) {
if (Key[Level] < Node->BranchPoint) {
PtrToNode = &(Node->Left);
......@@ -176,9 +196,7 @@ void KDStore(KDTREE *Tree, FLOAT32 *Key, void *Data) {
if (Key[Level] < Node->RightBranch)
Node->RightBranch = Key[Level];
}
Level++;
if (Level >= N)
Level = 0;
Level = NextLevel(Level);
Node = *PtrToNode;
}
......@@ -239,7 +257,7 @@ KDDelete (KDTREE * Tree, FLOAT32 Key[], void *Data) {
KeyDesc = &(Tree->KeyDesc[0]);
Father = &(Tree->Root);
Current = Father->Left;
Level = 0;
Level = NextLevel(-1);
/* search tree for node to be deleted */
while ((Current != NULL) && (!NodeFound (Current, Key, Data))) {
......@@ -249,9 +267,7 @@ KDDelete (KDTREE * Tree, FLOAT32 Key[], void *Data) {
else
Current = Current->Right;
Level++;
if (Level >= N)
Level = 0;
Level = NextLevel(Level);
}
if (Current != NULL) { /* if node to be deleted was found */
......@@ -271,15 +287,11 @@ KDDelete (KDTREE * Tree, FLOAT32 Key[], void *Data) {
else
break;
Level++;
if (Level >= N)
Level = 0;
Level = NextLevel(Level);
}
/* compute level of replacement node's father */
Level--;
if (Level < 0)
Level = N - 1;
Level = PrevLevel(Level);
/* disconnect replacement node from it's father */
if (FatherReplacement->Left == Replacement) {
......@@ -304,7 +316,7 @@ KDDelete (KDTREE * Tree, FLOAT32 Key[], void *Data) {
else
Father->Right = Replacement;
}
FreeKDNode(Current);
FreeKDNode(Current);
}
} /* KDDelete */
......@@ -381,7 +393,7 @@ void *NBuffer, FLOAT32 DBuffer[]) {
/*---------------------------------------------------------------------------*/
void KDWalk(KDTREE *Tree, void_proc Action) {
void KDWalk(KDTREE *Tree, void_proc Action) {
/*
** Parameters:
** Tree ptr to K-D tree to be walked
......@@ -401,12 +413,12 @@ void KDWalk(KDTREE *Tree, void_proc Action) {
*/
WalkAction = Action;
if (Tree->Root.Left != NULL)
Walk (Tree->Root.Left, 0);
Walk (Tree->Root.Left, NextLevel(-1));
} /* KDWalk */
/*---------------------------------------------------------------------------*/
void FreeKDTree(KDTREE *Tree) {
void FreeKDTree(KDTREE *Tree) {
/*
** Parameters:
** Tree tree data structure to be released
......@@ -424,7 +436,7 @@ void FreeKDTree(KDTREE *Tree) {
** 5/26/89, DSJ, Created.
*/
FreeSubTree (Tree->Root.Left);
memfree(Tree);
memfree(Tree);
} /* FreeKDTree */
......@@ -496,7 +508,7 @@ MakeKDNode (FLOAT32 Key[], char *Data, int Index) {
/*---------------------------------------------------------------------------*/
void FreeKDNode(KDNODE *Node) {
void FreeKDNode(KDNODE *Node) {
/*
** Parameters:
** Node ptr to node data structure to be freed
......@@ -516,7 +528,7 @@ void FreeKDNode(KDNODE *Node) {
/*---------------------------------------------------------------------------*/
void Search(int Level, KDNODE *SubTree) {
void Search(int Level, KDNODE *SubTree) {
/*
** Parameters:
** Level level in tree of sub-tree to be searched
......@@ -561,12 +573,12 @@ void Search(int Level, KDNODE *SubTree) {
Distance[NumberOfNeighbors] = d;
NumberOfNeighbors++;
if (NumberOfNeighbors == MaxNeighbors)
FindMaxDistance();
FindMaxDistance();
}
else {
Neighbor[Furthest] = SubTree->Data;
Distance[Furthest] = d;
FindMaxDistance();
FindMaxDistance();
}
}
if (QueryPoint[Level] < SubTree->BranchPoint) {
......@@ -575,7 +587,7 @@ void Search(int Level, KDNODE *SubTree) {
OldLBoxEdge = LBMax[Level];
LBMax[Level] = SubTree->RightBranch;
if (SubTree->Left != NULL)
Search (Level + 1, SubTree->Left);
Search (NextLevel(Level), SubTree->Left);
SBMax[Level] = OldSBoxEdge;
LBMax[Level] = OldLBoxEdge;
OldSBoxEdge = SBMin[Level];
......@@ -583,7 +595,7 @@ void Search(int Level, KDNODE *SubTree) {
OldLBoxEdge = LBMin[Level];
LBMin[Level] = SubTree->LeftBranch;
if ((SubTree->Right != NULL) && QueryIntersectsSearch ())
Search (Level + 1, SubTree->Right);
Search (NextLevel(Level), SubTree->Right);
SBMin[Level] = OldSBoxEdge;
LBMin[Level] = OldLBoxEdge;
}
......@@ -593,7 +605,7 @@ void Search(int Level, KDNODE *SubTree) {
OldLBoxEdge = LBMin[Level];
LBMin[Level] = SubTree->LeftBranch;
if (SubTree->Right != NULL)
Search (Level + 1, SubTree->Right);
Search (NextLevel(Level), SubTree->Right);
SBMin[Level] = OldSBoxEdge;
LBMin[Level] = OldLBoxEdge;
OldSBoxEdge = SBMax[Level];
......@@ -601,7 +613,7 @@ void Search(int Level, KDNODE *SubTree) {
OldLBoxEdge = LBMax[Level];
LBMax[Level] = SubTree->RightBranch;
if ((SubTree->Left != NULL) && QueryIntersectsSearch ())
Search (Level + 1, SubTree->Left);
Search (NextLevel(Level), SubTree->Left);
SBMax[Level] = OldSBoxEdge;
LBMax[Level] = OldLBoxEdge;
}
......@@ -657,7 +669,7 @@ register FLOAT32 p1[], register FLOAT32 p2[]) {
/*---------------------------------------------------------------------------*/
void FindMaxDistance() {
void FindMaxDistance() {
/*
** Parameters:
** None
......@@ -690,7 +702,7 @@ void FindMaxDistance() {
/*---------------------------------------------------------------------------*/
int QueryIntersectsSearch() {
int QueryIntersectsSearch() {
/*
** Parameters:
** None
......@@ -765,7 +777,7 @@ int QueryIntersectsSearch() {
/*---------------------------------------------------------------------------*/
int QueryInSearch() {
int QueryInSearch() {
/*
** Parameters:
** None
......@@ -813,7 +825,7 @@ int QueryInSearch() {
/*---------------------------------------------------------------------------*/
void Walk(KDNODE *SubTree, INT32 Level) {
void Walk(KDNODE *SubTree, INT32 Level) {
/*
** Parameters:
** SubTree ptr to root of subtree to be walked
......@@ -842,17 +854,17 @@ void Walk(KDNODE *SubTree, INT32 Level) {
else {
(*WalkAction) (SubTree->Data, preorder, Level);
if (SubTree->Left != NULL)
Walk (SubTree->Left, Level + 1);
Walk (SubTree->Left, NextLevel(Level));
(*WalkAction) (SubTree->Data, postorder, Level);
if (SubTree->Right != NULL)
Walk (SubTree->Right, Level + 1);
Walk (SubTree->Right, NextLevel(Level));
(*WalkAction) (SubTree->Data, endorder, Level);
}
} /* Walk */
/*---------------------------------------------------------------------------*/
void FreeSubTree(KDNODE *SubTree) {
void FreeSubTree(KDNODE *SubTree) {
/*
** Parameters:
** SubTree ptr to root node of sub-tree to be freed
......@@ -867,6 +879,6 @@ void FreeSubTree(KDNODE *SubTree) {
if (SubTree != NULL) {
FreeSubTree (SubTree->Left);
FreeSubTree (SubTree->Right);
memfree(SubTree);
memfree(SubTree);
}
} /* FreeSubTree */
......@@ -49,6 +49,7 @@ int row_number; /* cjn: fixes link problem */
typedef struct
{
char *Label;
int SampleCount;
LIST List;
}
LABELEDLISTNODE, *LABELEDLIST;
......@@ -143,7 +144,7 @@ static BOOL8 ShowInsignificantProtos = FALSE;
//-M 0.025 -B 0.05 -I 0.8 -C 1e-3
static CLUSTERCONFIG Config =
{
elliptical, 0.025, 0.05, 0.8, 1e-3
elliptical, 0.025, 0.05, 0.8, 1e-3, 0
};
static FLOAT32 RoundingAccuracy = 0.0;
......@@ -235,6 +236,7 @@ int main (
//printf ("\nClustering %s ...", CharSample->Label);
Clusterer = SetUpForClustering(CharSample);
float SavedMinSamples = Config.MinSamples;
Config.MagicSamples = CharSample->SampleCount;
while (Config.MinSamples > 0.001) {
ProtoList = ClusterSamples(Clusterer, &Config);
if (NumberOfProtos(ProtoList, 1, 0) > 0)
......@@ -451,6 +453,7 @@ void ReadTrainingSamples (
f->Params[dim] += UniformRandomNumber(-MINSD, MINSD);
}
CharSample->List = push (CharSample->List, FeatureSamples);
CharSample->SampleCount++;
for (i = 0; i < NumFeatureSetsIn (CharDesc); i++)
if (Type != i)
FreeFeatureSet (FeaturesOfType (CharDesc, i));
......@@ -513,6 +516,7 @@ LABELEDLIST NewLabeledList (
LabeledList->Label = (char*)Emalloc (strlen (Label)+1);
strcpy (LabeledList->Label, Label);
LabeledList->List = NIL;
LabeledList->SampleCount = 0;
return (LabeledList);
} /* NewLabeledList */
......
......@@ -32,12 +32,14 @@
#include "featdefs.h"
#include "tessopt.h"
#include "ocrfeatures.h"
#include "mf.h"
#include "general.h"
#include "clusttool.h"
#include "cluster.h"
#include "protos.h"
#include "minmax.h"
#include "debug.h"
#include "tprintf.h"
#include "const.h"
#include "mergenf.h"
#include "name2char.h"
......@@ -50,18 +52,21 @@
#include <string.h>
#include <stdio.h>
#define _USE_MATH_DEFINES
#include <math.h>
#define MAXNAMESIZE 80
#define MAX_NUM_SAMPLES 10000
#define PROGRAM_FEATURE_TYPE "mf"
#define MINSD (1.0f / 128.0f)
#define MINSD_ANGLE (1.0f / 64.0f)
int row_number; /* cjn: fixes link problem */
typedef struct
{
char *Label;
int SampleCount;
LIST List;
}
LABELEDLISTNODE, *LABELEDLIST;
......@@ -151,6 +156,9 @@ PARAMDESC *ConvertToPARAMDESC(
PARAM_DESC* Param_Desc,
int N);
*/
void MergeInsignificantProtos(LIST ProtoList, const char* label,
CLUSTERER *Clusterer, CLUSTERCONFIG *Config);
LIST RemoveInsignificantProtos(
LIST ProtoList,
BOOL8 KeepSigProtos,
......@@ -184,21 +192,51 @@ static BOOL8 ShowInsignificantProtos = FALSE;
// global variable to hold configuration parameters to control clustering
// -M 0.40 -B 0.05 -I 1.0 -C 1e-6.
static CLUSTERCONFIG Config =
{ elliptical, 0.40, 0.05, 1.0, 1e-6 };
{ elliptical, 0.625, 0.05, 1.0, 1e-6, 0 };
static FLOAT32 RoundingAccuracy = 0.0;
static FLOAT32 RoundingAccuracy = 0.0f;
// The unicharset used during mftraining
static UNICHARSET unicharset_mftraining;
const char* test_ch = "";
/*----------------------------------------------------------------------------
Public Code
-----------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/
int main (
int argc,
char **argv)
void DisplayProtoList(const char* ch, LIST protolist) {
void* window = c_create_window("Char samples", 50, 200,
520, 520, -130.0, 130.0, -130.0, 130.0);
LIST proto = protolist;
iterate(proto) {
PROTOTYPE* prototype = reinterpret_cast<PROTOTYPE *>(first_node(proto));
if (prototype->Significant)
c_line_color_index(window, Green);
else if (prototype->NumSamples == 0)
c_line_color_index(window, Blue);
else if (prototype->Merged)
c_line_color_index(window, Magenta);
else
c_line_color_index(window, Red);
float x = CenterX(prototype->Mean);
float y = CenterY(prototype->Mean);
double angle = OrientationOf(prototype->Mean) * 2 * M_PI;
float dx = static_cast<float>(LengthOf(prototype->Mean) * cos(angle) / 2);
float dy = static_cast<float>(LengthOf(prototype->Mean) * sin(angle) / 2);
c_move(window, (x - dx) * 256, (y - dy) * 256);
c_draw(window, (x + dx) * 256, (y + dy) * 256);
if (prototype->Significant)
tprintf("Green proto at (%g,%g)+(%g,%g) %d samples\n",
x, y, dx, dy, prototype->NumSamples);
else if (prototype->NumSamples > 0 && !prototype->Merged)
tprintf("Red proto at (%g,%g)+(%g,%g) %d samples\n",
x, y, dx, dy, prototype->NumSamples);
}
c_make_current(window);
}
/*---------------------------------------------------------------------------*/
int main (int argc, char **argv) {
/*
** Parameters:
** argc number of command line arguments
......@@ -231,123 +269,119 @@ int main (
** History: Fri Aug 18 08:56:17 1989, DSJ, Created.
** Mon May 18 1998, Christy Russson, Revistion started.
*/
{
char *PageName;
FILE *TrainingPage;
FILE *OutFile;
LIST CharList;
CLUSTERER *Clusterer = NULL;
LIST ProtoList = NIL;
LABELEDLIST CharSample;
PROTOTYPE *Prototype;
LIST ClassList = NIL;
int Cid, Pid;
PROTO Proto;
PROTO_STRUCT DummyProto;
BIT_VECTOR Config2;
MERGE_CLASS MergeClass;
INT_TEMPLATES IntTemplates;
LIST pCharList, pProtoList;
char Filename[MAXNAMESIZE];
// Clean the unichar set
unicharset_mftraining.clear();
// Space character needed to represent NIL classification
unicharset_mftraining.unichar_insert(" ");
ParseArguments (argc, argv);
InitFastTrainerVars ();
InitSubfeatureVars ();
while ((PageName = GetNextFilename()) != NULL)
{
printf ("Reading %s ...\n", PageName);
TrainingPage = Efopen (PageName, "r");
CharList = ReadTrainingSamples (TrainingPage);
fclose (TrainingPage);
//WriteTrainingSamples (Directory, CharList);
pCharList = CharList;
iterate(pCharList)
{
//Cluster
CharSample = (LABELEDLIST) first_node (pCharList);
// printf ("\nClustering %s ...", CharSample->Label);
Clusterer = SetUpForClustering(CharSample);
ProtoList = ClusterSamples(Clusterer, &Config);
//WriteClusteredTrainingSamples (Directory, ProtoList, Clusterer, CharSample);
CleanUpUnusedData(ProtoList);
//Merge
ProtoList = RemoveInsignificantProtos(ProtoList, ShowSignificantProtos,
ShowInsignificantProtos, Clusterer->SampleSize);
FreeClusterer(Clusterer);
MergeClass = FindClass (ClassList, CharSample->Label);
if (MergeClass == NULL)
{
MergeClass = NewLabeledClass (CharSample->Label);
ClassList = push (ClassList, MergeClass);
}
Cid = AddConfigToClass(MergeClass->Class);
pProtoList = ProtoList;
iterate (pProtoList)
{
Prototype = (PROTOTYPE *) first_node (pProtoList);
// see if proto can be approximated by existing proto
Pid = FindClosestExistingProto (MergeClass->Class, MergeClass->NumMerged, Prototype);
if (Pid == NO_PROTO)
{
Pid = AddProtoToClass (MergeClass->Class);
Proto = ProtoIn (MergeClass->Class, Pid);
MakeNewFromOld (Proto, Prototype);
MergeClass->NumMerged[Pid] = 1;
}
else
{
MakeNewFromOld (&DummyProto, Prototype);
ComputeMergedProto (ProtoIn (MergeClass->Class, Pid), &DummyProto,
(FLOAT32) MergeClass->NumMerged[Pid], 1.0,
ProtoIn (MergeClass->Class, Pid));
MergeClass->NumMerged[Pid] ++;
}
Config2 = ConfigIn (MergeClass->Class, Cid);
AddProtoToConfig (Pid, Config2);
}
FreeProtoList (&ProtoList);
}
FreeTrainingSamples (CharList);
}
//WriteMergedTrainingSamples(Directory,ClassList);
WriteMicrofeat(Directory, ClassList);
InitIntProtoVars ();
InitPrototypes ();
SetUpForFloat2Int(ClassList);
IntTemplates = CreateIntTemplates(TrainingData, unicharset_mftraining);
strcpy (Filename, "");
if (Directory != NULL)
{
strcat (Filename, Directory);
strcat (Filename, "/");
}
strcat (Filename, "inttemp");
char *PageName;
FILE *TrainingPage;
FILE *OutFile;
LIST CharList;
CLUSTERER *Clusterer = NULL;
LIST ProtoList = NIL;
LABELEDLIST CharSample;
PROTOTYPE *Prototype;
LIST ClassList = NIL;
int Cid, Pid;
PROTO Proto;
PROTO_STRUCT DummyProto;
BIT_VECTOR Config2;
MERGE_CLASS MergeClass;
INT_TEMPLATES IntTemplates;
LIST pCharList, pProtoList;
char Filename[MAXNAMESIZE];
// Clean the unichar set
unicharset_mftraining.clear();
// Space character needed to represent NIL classification
unicharset_mftraining.unichar_insert(" ");
ParseArguments (argc, argv);
InitFastTrainerVars ();
InitSubfeatureVars ();
while ((PageName = GetNextFilename()) != NULL) {
printf ("Reading %s ...\n", PageName);
TrainingPage = Efopen (PageName, "r");
CharList = ReadTrainingSamples (TrainingPage);
fclose (TrainingPage);
//WriteTrainingSamples (Directory, CharList);
pCharList = CharList;
iterate(pCharList) {
//Cluster
CharSample = (LABELEDLIST) first_node (pCharList);
// printf ("\nClustering %s ...", CharSample->Label);
Clusterer = SetUpForClustering(CharSample);
Config.MagicSamples = CharSample->SampleCount;
ProtoList = ClusterSamples(Clusterer, &Config);
CleanUpUnusedData(ProtoList);
//Merge
MergeInsignificantProtos(ProtoList, CharSample->Label,
Clusterer, &Config);
if (strcmp(test_ch, CharSample->Label) == 0)
DisplayProtoList(test_ch, ProtoList);
ProtoList = RemoveInsignificantProtos(ProtoList, ShowSignificantProtos,
ShowInsignificantProtos,
Clusterer->SampleSize);
FreeClusterer(Clusterer);
MergeClass = FindClass (ClassList, CharSample->Label);
if (MergeClass == NULL) {
MergeClass = NewLabeledClass (CharSample->Label);
ClassList = push (ClassList, MergeClass);
}
Cid = AddConfigToClass(MergeClass->Class);
pProtoList = ProtoList;
iterate (pProtoList) {
Prototype = (PROTOTYPE *) first_node (pProtoList);
// see if proto can be approximated by existing proto
Pid = FindClosestExistingProto(MergeClass->Class,
MergeClass->NumMerged, Prototype);
if (Pid == NO_PROTO) {
Pid = AddProtoToClass (MergeClass->Class);
Proto = ProtoIn (MergeClass->Class, Pid);
MakeNewFromOld (Proto, Prototype);
MergeClass->NumMerged[Pid] = 1;
}
else {
MakeNewFromOld (&DummyProto, Prototype);
ComputeMergedProto (ProtoIn (MergeClass->Class, Pid), &DummyProto,
(FLOAT32) MergeClass->NumMerged[Pid], 1.0,
ProtoIn (MergeClass->Class, Pid));
MergeClass->NumMerged[Pid] ++;
}
Config2 = ConfigIn (MergeClass->Class, Cid);
AddProtoToConfig (Pid, Config2);
}
FreeProtoList (&ProtoList);
}
FreeTrainingSamples (CharList);
}
//WriteMergedTrainingSamples(Directory,ClassList);
WriteMicrofeat(Directory, ClassList);
InitIntProtoVars ();
InitPrototypes ();
SetUpForFloat2Int(ClassList);
IntTemplates = CreateIntTemplates(TrainingData, unicharset_mftraining);
strcpy (Filename, "");
if (Directory != NULL) {
strcat (Filename, Directory);
strcat (Filename, "/");
}
strcat (Filename, "inttemp");
#ifdef __UNIX__
OutFile = Efopen (Filename, "w");
OutFile = Efopen (Filename, "w");
#else
OutFile = Efopen (Filename, "wb");
OutFile = Efopen (Filename, "wb");
#endif
WriteIntTemplates(OutFile, IntTemplates, unicharset_mftraining);
fclose (OutFile);
strcpy (Filename, "");
if (Directory != NULL)
{
strcat (Filename, Directory);
strcat (Filename, "/");
}
strcat (Filename, "pffmtable");
// Now create pffmtable.
WritePFFMTable(IntTemplates, Filename);
printf ("Done!\n"); /**/
FreeLabeledClassList (ClassList);
WriteIntTemplates(OutFile, IntTemplates, unicharset_mftraining);
fclose (OutFile);
strcpy (Filename, "");
if (Directory != NULL) {
strcat (Filename, Directory);
strcat (Filename, "/");
}
strcat (Filename, "pffmtable");
// Now create pffmtable.
WritePFFMTable(IntTemplates, Filename);
printf ("Done!\n"); /**/
FreeLabeledClassList (ClassList);
return 0;
} /* main */
......@@ -438,8 +472,8 @@ char **argv)
case 'R':
ParametersRead = sscanf( tessoptarg, "%f", &RoundingAccuracy );
if ( ParametersRead != 1 ) Error = TRUE;
else if ( RoundingAccuracy > 0.01 ) RoundingAccuracy = 0.01;
else if ( RoundingAccuracy < 0.0 ) RoundingAccuracy = 0.0;
else if ( RoundingAccuracy > 0.01f ) RoundingAccuracy = 0.01f;
else if ( RoundingAccuracy < 0.0f ) RoundingAccuracy = 0.0f;
break;
case 'S':
switch ( tessoptarg[0] )
......@@ -547,9 +581,12 @@ LIST ReadTrainingSamples (
for (int feature = 0; feature < FeatureSamples->NumFeatures; ++feature) {
FEATURE f = FeatureSamples->Features[feature];
for (int dim =0; dim < f->Type->NumParams; ++dim)
f->Params[dim] += UniformRandomNumber(-MINSD, MINSD);
f->Params[dim] += dim == MFDirection ?
UniformRandomNumber(-MINSD_ANGLE, MINSD_ANGLE) :
UniformRandomNumber(-MINSD, MINSD);
}
CharSample->List = push (CharSample->List, FeatureSamples);
CharSample->SampleCount++;
for (i = 0; i < NumFeatureSetsIn (CharDesc); i++)
if (Type != i)
FreeFeatureSet (FeaturesOfType (CharDesc, i));
......@@ -631,6 +668,7 @@ LABELEDLIST NewLabeledList (
LabeledList->Label = (char*)Emalloc (strlen (Label)+1);
strcpy (LabeledList->Label, Label);
LabeledList->List = NIL;
LabeledList->SampleCount = 0;
return (LabeledList);
} /* NewLabeledList */
......@@ -1030,7 +1068,7 @@ CLUSTERER *SetUpForClustering(
if (Sample == NULL)
Sample = (FLOAT32 *)Emalloc(N * sizeof(FLOAT32));
for (j=0; j < N; j++)
if (RoundingAccuracy != 0.0)
if (RoundingAccuracy != 0.0f)
Sample[j] = round(FeatureSet->Features[i]->Params[j], RoundingAccuracy);
else
Sample[j] = FeatureSet->Features[i]->Params[j];
......@@ -1043,6 +1081,71 @@ CLUSTERER *SetUpForClustering(
} /* SetUpForClustering */
/*------------------------------------------------------------------------*/
// Two-pass cleanup of insignificant prototypes after clustering.
// Pass 1: each insignificant, not-yet-merged proto is matched against its
// nearest "alive" (un-merged) neighbour within a fixed radius.  If that
// neighbour is also insignificant, the two are merged (weighted mean) and
// this proto is marked Merged with zero samples; if the neighbour is
// significant, this proto is just marked Merged (absorbed, samples kept).
// Pass 2: any surviving insignificant proto that has accumulated enough
// samples is promoted to Significant.
// Note the order dependence: setting Merged in one iteration removes that
// proto from later nearest-neighbour searches.
// Parameters:
//   ProtoList  list of PROTOTYPEs from clustering; modified in place
//   label      character label for this cluster set; debug tracing is on
//              when it equals the global test_ch
//   Clusterer  supplies SampleSize, ParamDesc and NumChar
//   Config     supplies MinSamples, the significance threshold fraction
void MergeInsignificantProtos(LIST ProtoList, const char* label,
CLUSTERER *Clusterer, CLUSTERCONFIG *Config) {
PROTOTYPE *Prototype;
bool debug = strcmp(test_ch, label) == 0;
LIST pProtoList = ProtoList;
iterate(pProtoList) {
Prototype = (PROTOTYPE *) first_node (pProtoList);
if (Prototype->Significant || Prototype->Merged)
continue;
// 0.125 is both the initial best distance and the maximum distance at
// which any merge is permitted.
FLOAT32 best_dist = 0.125;
PROTOTYPE* best_match = NULL;
// Find the nearest alive prototype.
LIST list_it = ProtoList;
iterate(list_it) {
PROTOTYPE* test_p = (PROTOTYPE *) first_node (list_it);
if (test_p != Prototype && !test_p->Merged) {
FLOAT32 dist = ComputeDistance(Clusterer->SampleSize,
Clusterer->ParamDesc,
Prototype->Mean, test_p->Mean);
if (dist < best_dist) {
best_match = test_p;
best_dist = dist;
}
}
}
if (best_match != NULL && !best_match->Significant) {
// Both insignificant: fold this proto's samples into the neighbour's
// weighted mean, then retire this proto (Merged, zero samples).
if (debug)
tprintf("Merging red clusters (%d+%d) at %g,%g and %g,%g\n",
best_match->NumSamples, Prototype->NumSamples,
best_match->Mean[0], best_match->Mean[1],
Prototype->Mean[0], Prototype->Mean[1]);
best_match->NumSamples = MergeClusters(Clusterer->SampleSize,
Clusterer->ParamDesc,
best_match->NumSamples,
Prototype->NumSamples,
best_match->Mean,
best_match->Mean, Prototype->Mean);
Prototype->NumSamples = 0;
Prototype->Merged = 1;
} else if (best_match != NULL) {
// Nearest neighbour is significant: absorb this proto into it by
// marking it Merged, but keep its sample count for display purposes.
if (debug)
tprintf("Red proto at %g,%g matched a green one at %g,%g\n",
Prototype->Mean[0], Prototype->Mean[1],
best_match->Mean[0], best_match->Mean[1]);
Prototype->Merged = 1;
}
}
// Mark significant those that now have enough samples.
// NOTE(review): NumSamples is an unsigned bitfield compared against a
// signed int — assumes min_samples is non-negative; confirm MinSamples
// and NumChar are never negative.
int min_samples = (INT32) (Config->MinSamples * Clusterer->NumChar);
pProtoList = ProtoList;
iterate(pProtoList) {
Prototype = (PROTOTYPE *) first_node (pProtoList);
// Process insignificant protos that do not match a green one
if (!Prototype->Significant && Prototype->NumSamples >= min_samples &&
!Prototype->Merged) {
if (debug)
tprintf("Red proto at %g,%g becoming green\n",
Prototype->Mean[0], Prototype->Mean[1]);
Prototype->Significant = true;
}
}
} /* MergeInsignificantProtos */
/*------------------------------------------------------------------------*/
LIST RemoveInsignificantProtos(
LIST ProtoList,
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册