Move training to src.

104fe793 · Egor Pugin · ca5c15e6 · 104fe793 · 104fe793 · 104fe793
65 changed files
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -308,7 +308,7 @@ if (BUILD_TESTS AND EXISTS ${PROJECT_SOURCE_DIR}/googletest/CMakeLists.txt)
 endif()

 if (BUILD_TRAINING_TOOLS)
-add_subdirectory(training)
+add_subdirectory(src/training)
 endif()

 get_target_property(tesseract_NAME libtesseract NAME)

--- a/configure.ac
+++ b/configure.ac
@@ -502,7 +502,7 @@ AC_CONFIG_FILES([java/com/google/scrollview/Makefile])
 AC_CONFIG_FILES([java/com/google/scrollview/events/Makefile])
 AC_CONFIG_FILES([java/com/google/scrollview/ui/Makefile])
 AC_CONFIG_FILES([doc/Makefile])
-AM_COND_IF([ENABLE_TRAINING], [AC_CONFIG_FILES(training/Makefile)])
+AM_COND_IF([ENABLE_TRAINING], [AC_CONFIG_FILES(src/training/Makefile)])
 AC_OUTPUT

 # Final message

--- a/cppan.yml
+++ b/cppan.yml
@@ -172,7 +172,7 @@ projects:
    tessopt:
        type: lib
        static_only: true
-        files: training/tessopt.*
+        files: src/training/tessopt.*
-        include_directories: training
+        include_directories: src/training
        dependencies: libtesseract

@@ -180,104 +180,104 @@ projects:
        type: lib
        static_only: true
        files:
-            - training/commandlineflags.cpp
-            - training/commandlineflags.h
-            - training/commontraining.cpp
-            - training/commontraining.h
+            - src/training/commandlineflags.cpp
+            - src/training/commandlineflags.h
+            - src/training/commontraining.cpp
+            - src/training/commontraining.h
-        include_directories: training
+        include_directories: src/training
        dependencies:
            - tessopt

    ambiguous_words:
-        files: training/ambiguous_words.cpp
+        files: src/training/ambiguous_words.cpp
        dependencies:
            - libtesseract

    classifier_tester:
-        files: training/classifier_tester.cpp
+        files: src/training/classifier_tester.cpp
        dependencies: common_training

    combine_lang_model:
-        files: training/combine_lang_model.cpp
+        files: src/training/combine_lang_model.cpp
        dependencies: unicharset_training

    combine_tessdata:
-        files: training/combine_tessdata.cpp
+        files: src/training/combine_tessdata.cpp
        dependencies: libtesseract

    cntraining:
-        files: training/cntraining.cpp
+        files: src/training/cntraining.cpp
        dependencies: common_training

    dawg2wordlist:
-        files: training/dawg2wordlist.cpp
+        files: src/training/dawg2wordlist.cpp
        dependencies: libtesseract

    mftraining:
        files:
-            - training/mftraining.cpp
-            - training/mergenf.*
+            - src/training/mftraining.cpp
+            - src/training/mergenf.*
        dependencies: common_training

    shapeclustering:
-        files: training/shapeclustering.cpp
+        files: src/training/shapeclustering.cpp
        dependencies: common_training

    unicharset_extractor:
-        files: training/unicharset_extractor.cpp
+        files: src/training/unicharset_extractor.cpp
        dependencies: unicharset_training

    wordlist2dawg:
-        files: training/wordlist2dawg.cpp
+        files: src/training/wordlist2dawg.cpp
        dependencies: libtesseract

    unicharset_training:
        type: lib
        static_only: true
        files:
-            - training/fileio.*
-            - training/icuerrorcode.h
-            - training/lang_model_helpers.*
-            - training/lstmtester.*
-            - training/normstrngs.*
-            - training/unicharset_training_utils.*
-            - training/validat.*
+            - src/training/fileio.*
+            - src/training/icuerrorcode.h
+            - src/training/lang_model_helpers.*
+            - src/training/lstmtester.*
+            - src/training/normstrngs.*
+            - src/training/unicharset_training_utils.*
+            - src/training/validat.*
-        include_directories: training
+        include_directories: src/training
        dependencies:
            - common_training
            - pvt.cppan.demo.unicode.icu.i18n

    lstmeval:
-        files: training/lstmeval.cpp
+        files: src/training/lstmeval.cpp
        dependencies: unicharset_training

    lstmtraining:
-        files: training/lstmtraining.cpp
+        files: src/training/lstmtraining.cpp
        dependencies: unicharset_training

    set_unicharset_properties:
-        files: training/set_unicharset_properties.cpp
+        files: src/training/set_unicharset_properties.cpp
        dependencies: unicharset_training

    text2image:
        files:
-            - training/text2image.cpp
-            - training/boxchar.cpp
-            - training/boxchar.h
-            - training/degradeimage.cpp
-            - training/degradeimage.h
-            - training/ligature_table.cpp
-            - training/ligature_table.h
-            - training/normstrngs.cpp
-            - training/normstrngs.h
-            - training/pango_font_info.cpp
-            - training/pango_font_info.h
-            - training/stringrenderer.cpp
-            - training/stringrenderer.h
-            - training/tlog.cpp
-            - training/tlog.h
-            - training/util.h
-            - training/icuerrorcode.h
+            - src/training/text2image.cpp
+            - src/training/boxchar.cpp
+            - src/training/boxchar.h
+            - src/training/degradeimage.cpp
+            - src/training/degradeimage.h
+            - src/training/ligature_table.cpp
+            - src/training/ligature_table.h
+            - src/training/normstrngs.cpp
+            - src/training/normstrngs.h
+            - src/training/pango_font_info.cpp
+            - src/training/pango_font_info.h
+            - src/training/stringrenderer.cpp
+            - src/training/stringrenderer.h
+            - src/training/tlog.cpp
+            - src/training/tlog.h
+            - src/training/util.h
+            - src/training/icuerrorcode.h

        dependencies:
            - unicharset_training

--- a/training/CMakeLists.txt
+++ b/training/CMakeLists.txt
--- a/training/Makefile.am
+++ b/training/Makefile.am
--- a/training/ambiguous_words.cpp
+++ b/training/ambiguous_words.cpp
--- a/training/boxchar.cpp
+++ b/training/boxchar.cpp
--- a/training/boxchar.h
+++ b/training/boxchar.h
--- a/training/classifier_tester.cpp
+++ b/training/classifier_tester.cpp
--- a/training/cntraining.cpp
+++ b/training/cntraining.cpp
--- a/training/combine_lang_model.cpp
+++ b/training/combine_lang_model.cpp
--- a/training/combine_tessdata.cpp
+++ b/training/combine_tessdata.cpp
--- a/training/commandlineflags.cpp
+++ b/training/commandlineflags.cpp
--- a/training/commandlineflags.h
+++ b/training/commandlineflags.h
--- a/training/commontraining.cpp
+++ b/training/commontraining.cpp
--- a/training/commontraining.h
+++ b/training/commontraining.h
--- a/training/dawg2wordlist.cpp
+++ b/training/dawg2wordlist.cpp
--- a/training/degradeimage.cpp
+++ b/training/degradeimage.cpp
--- a/training/degradeimage.h
+++ b/training/degradeimage.h
-/**********************************************************************
- * File:        degradeimage.h
- * Description: Function to degrade an image (usually of text) as if it
- *              has been printed and then scanned.
- * Authors:     Ray Smith
- * Created:     Tue Nov 19 2013
- *
- * (C) Copyright 2013, Google Inc.
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- * http://www.apache.org/licenses/LICENSE-2.0
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- *
- **********************************************************************/
-#ifndef TESSERACT_TRAINING_DEGRADEIMAGE_H_
-#define TESSERACT_TRAINING_DEGRADEIMAGE_H_
-
-#include "allheaders.h"
-#include "genericvector.h"
-#include "helpers.h"  // For TRand.
-#include "rect.h"
-
-namespace tesseract {
-
-// Degrade the pix as if by a print/copy/scan cycle with exposure > 0
-// corresponding to darkening on the copier and <0 lighter and 0 not copied.
-// If rotation is not nullptr, the clockwise rotation in radians is saved there.
-// The input pix must be 8 bit grey. (Binary with values 0 and 255 is OK.)
-// The input image is destroyed and a different image returned.
-struct Pix* DegradeImage(struct Pix* input, int exposure, TRand* randomizer,
-                         float* rotation);
-
-// Creates and returns a Pix distorted by various means according to the bool
-// flags. If boxes is not nullptr, the boxes are resized/positioned according to
-// any spatial distortion and also by the integer reduction factor box_scale
-// so they will match what the network will output.
-// Returns nullptr on error. The returned Pix must be pixDestroyed.
-Pix* PrepareDistortedPix(const Pix* pix, bool perspective, bool invert,
-                         bool white_noise, bool smooth_noise, bool blur,
-                         int box_reduction, TRand* randomizer,
-                         GenericVector<TBOX>* boxes);
-// Distorts anything that has a non-null pointer with the same pseudo-random
-// perspective distortion. Width and height only need to be set if there
-// is no pix. If there is a pix, then they will be taken from there.
-void GeneratePerspectiveDistortion(int width, int height, TRand* randomizer,
-                                   Pix** pix, GenericVector<TBOX>* boxes);
-// Computes the coefficients of a randomized projective transformation.
-// The image transform requires backward transformation coefficient, and the
-// box transform the forward coefficients.
-// Returns the incolor arg to pixProjective.
-int ProjectiveCoeffs(int width, int height, TRand* randomizer,
-                     float** im_coeffs, float** box_coeffs);
-
-}  // namespace tesseract
-
-#endif  // TESSERACT_TRAINING_DEGRADEIMAGE_H_
+/**********************************************************************
+ * File:        degradeimage.h
+ * Description: Function to degrade an image (usually of text) as if it
+ *              has been printed and then scanned.
+ * Authors:     Ray Smith
+ * Created:     Tue Nov 19 2013
+ *
+ * (C) Copyright 2013, Google Inc.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ **********************************************************************/
+#ifndef TESSERACT_TRAINING_DEGRADEIMAGE_H_
+#define TESSERACT_TRAINING_DEGRADEIMAGE_H_
+
+#include "allheaders.h"
+#include "genericvector.h"
+#include "helpers.h"  // For TRand.
+#include "rect.h"
+
+namespace tesseract {
+
+// Degrade the pix as if by a print/copy/scan cycle with exposure > 0
+// corresponding to darkening on the copier and <0 lighter and 0 not copied.
+// If rotation is not nullptr, the clockwise rotation in radians is saved there.
+// The input pix must be 8 bit grey. (Binary with values 0 and 255 is OK.)
+// The input image is destroyed and a different image returned.
+struct Pix* DegradeImage(struct Pix* input, int exposure, TRand* randomizer,
+                         float* rotation);
+
+// Creates and returns a Pix distorted by various means according to the bool
+// flags. If boxes is not nullptr, the boxes are resized/positioned according to
+// any spatial distortion and also by the integer reduction factor box_scale
+// so they will match what the network will output.
+// Returns nullptr on error. The returned Pix must be pixDestroyed.
+Pix* PrepareDistortedPix(const Pix* pix, bool perspective, bool invert,
+                         bool white_noise, bool smooth_noise, bool blur,
+                         int box_reduction, TRand* randomizer,
+                         GenericVector<TBOX>* boxes);
+// Distorts anything that has a non-null pointer with the same pseudo-random
+// perspective distortion. Width and height only need to be set if there
+// is no pix. If there is a pix, then they will be taken from there.
+void GeneratePerspectiveDistortion(int width, int height, TRand* randomizer,
+                                   Pix** pix, GenericVector<TBOX>* boxes);
+// Computes the coefficients of a randomized projective transformation.
+// The image transform requires backward transformation coefficient, and the
+// box transform the forward coefficients.
+// Returns the incolor arg to pixProjective.
+int ProjectiveCoeffs(int width, int height, TRand* randomizer,
+                     float** im_coeffs, float** box_coeffs);
+
+}  // namespace tesseract
+
+#endif  // TESSERACT_TRAINING_DEGRADEIMAGE_H_
--- a/training/fileio.cpp
+++ b/training/fileio.cpp
--- a/training/fileio.h
+++ b/training/fileio.h
--- a/training/icuerrorcode.h
+++ b/training/icuerrorcode.h
-/**********************************************************************
- * File:        icuerrorcode.h
- * Description: Wrapper class for UErrorCode, with conversion operators for
- *              direct use in ICU C and C++ APIs.
- * Author:      Fredrik Roubert
- * Created:     Thu July 4 2013
- *
- * Features:
- * - The constructor initializes the internal UErrorCode to U_ZERO_ERROR,
- *  removing one common source of errors.
- * - Same use in C APIs taking a UErrorCode* (pointer) and C++ taking
- *   UErrorCode& (reference), via conversion operators.
- * - Automatic checking for success when it goes out of scope. On failure,
- *   the destructor will log an error message and exit.
- *
- * Most of ICU will handle errors gracefully and provide sensible fallbacks.
- * Using IcuErrorCode, it is therefore possible to write very compact code
- * that does sensible things on failure and provides logging for debugging.
- *
- * Example:
- * IcuErrorCode icuerrorcode;
- * return collator.compareUTF8(a, b, icuerrorcode) == UCOL_EQUAL;
- *
- * (C) Copyright 2013, Google Inc.
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- * http://www.apache.org/licenses/LICENSE-2.0
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- *
- **********************************************************************/
-#ifndef TESSERACT_CCUTIL_ICUERRORCODE_H_
-#define TESSERACT_CCUTIL_ICUERRORCODE_H_
-
-#include "tprintf.h"
-#include "unicode/errorcode.h"  // From libicu
-
-namespace tesseract {
-
-class IcuErrorCode : public icu::ErrorCode {
- public:
-  IcuErrorCode() {}
-  virtual ~IcuErrorCode() {
-    if (isFailure()) {
-      handleFailure();
-    }
-  }
-
- protected:
-  virtual void handleFailure() const {
-    tprintf("ICU ERROR: %s", errorName());
-    exit(errorCode);
-  }
-
- private:
-  // Disallow implicit copying of object.
-  IcuErrorCode(const IcuErrorCode&);
-  void operator=(const IcuErrorCode&);
-};
-
-}  // namespace tesseract
-#endif  // TESSERACT_CCUTIL_ICUERRORCODE_H_
+/**********************************************************************
+ * File:        icuerrorcode.h
+ * Description: Wrapper class for UErrorCode, with conversion operators for
+ *              direct use in ICU C and C++ APIs.
+ * Author:      Fredrik Roubert
+ * Created:     Thu July 4 2013
+ *
+ * Features:
+ * - The constructor initializes the internal UErrorCode to U_ZERO_ERROR,
+ *  removing one common source of errors.
+ * - Same use in C APIs taking a UErrorCode* (pointer) and C++ taking
+ *   UErrorCode& (reference), via conversion operators.
+ * - Automatic checking for success when it goes out of scope. On failure,
+ *   the destructor will log an error message and exit.
+ *
+ * Most of ICU will handle errors gracefully and provide sensible fallbacks.
+ * Using IcuErrorCode, it is therefore possible to write very compact code
+ * that does sensible things on failure and provides logging for debugging.
+ *
+ * Example:
+ * IcuErrorCode icuerrorcode;
+ * return collator.compareUTF8(a, b, icuerrorcode) == UCOL_EQUAL;
+ *
+ * (C) Copyright 2013, Google Inc.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ **********************************************************************/
+#ifndef TESSERACT_CCUTIL_ICUERRORCODE_H_
+#define TESSERACT_CCUTIL_ICUERRORCODE_H_
+
+#include "tprintf.h"
+#include "unicode/errorcode.h"  // From libicu
+
+namespace tesseract {
+
+class IcuErrorCode : public icu::ErrorCode {
+ public:
+  IcuErrorCode() {}
+  virtual ~IcuErrorCode() {
+    if (isFailure()) {
+      handleFailure();
+    }
+  }
+
+ protected:
+  virtual void handleFailure() const {
+    tprintf("ICU ERROR: %s", errorName());
+    exit(errorCode);
+  }
+
+ private:
+  // Disallow implicit copying of object.
+  IcuErrorCode(const IcuErrorCode&);
+  void operator=(const IcuErrorCode&);
+};
+
+}  // namespace tesseract
+#endif  // TESSERACT_CCUTIL_ICUERRORCODE_H_
--- a/training/lang_model_helpers.cpp
+++ b/training/lang_model_helpers.cpp
--- a/training/lang_model_helpers.h
+++ b/training/lang_model_helpers.h
--- a/training/language-specific.sh
+++ b/training/language-specific.sh
--- a/training/ligature_table.cpp
+++ b/training/ligature_table.cpp
--- a/training/ligature_table.h
+++ b/training/ligature_table.h
--- a/training/lstmeval.cpp
+++ b/training/lstmeval.cpp
--- a/training/lstmtester.cpp
+++ b/training/lstmtester.cpp
--- a/training/lstmtester.h
+++ b/training/lstmtester.h
--- a/training/lstmtraining.cpp
+++ b/training/lstmtraining.cpp
--- a/training/merge_unicharsets.cpp
+++ b/training/merge_unicharsets.cpp
--- a/training/mergenf.cpp
+++ b/training/mergenf.cpp
--- a/training/mergenf.h
+++ b/training/mergenf.h
-/******************************************************************************
-**	Filename:    MergeNF.c
-**	Purpose:     Program for merging similar nano-feature protos
-**	Author:      Dan Johnson
-**	History:     Wed Nov 21 09:55:23 1990, DSJ, Created.
-**
- **	(c) Copyright Hewlett-Packard Company, 1988.
- ** Licensed under the Apache License, Version 2.0 (the "License");
- ** you may not use this file except in compliance with the License.
- ** You may obtain a copy of the License at
- ** http://www.apache.org/licenses/LICENSE-2.0
- ** Unless required by applicable law or agreed to in writing, software
- ** distributed under the License is distributed on an "AS IS" BASIS,
- ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- ** See the License for the specific language governing permissions and
- ** limitations under the License.
-******************************************************************************/
-
-#ifndef TESSERACT_TRAINING_MERGENF_H_
-#define TESSERACT_TRAINING_MERGENF_H_
-
-/**----------------------------------------------------------------------------
-					Include Files and Type Defines
----------------------------------------------------------------------------**/
-#include "protos.h"
-#include "cluster.h"
-#include "ocrfeatures.h"
-#include "callcpp.h"
-#include "picofeat.h"
-
-
-#define WORST_MATCH_ALLOWED	(0.9)
-#define WORST_EVIDENCE (1.0)
-#define MAX_LENGTH_MISMATCH	(2.0 * GetPicoFeatureLength ())
-
-
-#define PROTO_SUFFIX		".mf.p"
-#define CONFIG_SUFFIX		".cl"
-#define NO_PROTO	(-1)
-#define XPOSITION			0
-#define YPOSITION			1
-#define MFLENGTH			2
-#define ORIENTATION			3
-
-typedef struct
-{
-  FLOAT32	MinX, MaxX, MinY, MaxY;
-} FRECT;
-
-/**----------------------------------------------------------------------------
-					Public Macros
----------------------------------------------------------------------------**/
-#define CenterX(M)		( (M)[XPOSITION] )
-#define CenterY(M)		( (M)[YPOSITION] )
-#define LengthOf(M)		( (M)[MFLENGTH] )
-#define OrientationOf(M)	( (M)[ORIENTATION] )
-
-/**----------------------------------------------------------------------------
-					Public Function Prototypes
----------------------------------------------------------------------------**/
-FLOAT32 CompareProtos (
-     PROTO	p1,
-	 PROTO	p2);
-
-void ComputeMergedProto (
-     PROTO	p1,
-	 PROTO	p2,
-     FLOAT32	w1,
-	 FLOAT32	w2,
-     PROTO	MergedProto);
-
-int FindClosestExistingProto (
-     CLASS_TYPE	Class,
-     int       	NumMerged[],
-     PROTOTYPE	*Prototype);
-
-void MakeNewFromOld (
-     PROTO	New,
-     PROTOTYPE	*Old);
-
-FLOAT32 SubfeatureEvidence (
-   FEATURE     Feature,
-   PROTO       Proto);
-
-double EvidenceOf (
-  register double   Similarity);
-
-BOOL8 DummyFastMatch (
-     FEATURE	Feature,
-     PROTO	Proto);
-
-void ComputePaddedBoundingBox (
-     PROTO	Proto,
-     FLOAT32	TangentPad,
-	 FLOAT32	OrthogonalPad,
-     FRECT	*BoundingBox);
-
-BOOL8 PointInside (
-     FRECT	*Rectangle,
-     FLOAT32	X,
-	 FLOAT32	Y);
-
-#endif  // TESSERACT_TRAINING_MERGENF_H_
+/******************************************************************************
+**	Filename:    MergeNF.c
+**	Purpose:     Program for merging similar nano-feature protos
+**	Author:      Dan Johnson
+**	History:     Wed Nov 21 09:55:23 1990, DSJ, Created.
+**
+ **	(c) Copyright Hewlett-Packard Company, 1988.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+******************************************************************************/
+
+#ifndef TESSERACT_TRAINING_MERGENF_H_
+#define TESSERACT_TRAINING_MERGENF_H_
+
+/**----------------------------------------------------------------------------
+					Include Files and Type Defines
+----------------------------------------------------------------------------**/
+#include "protos.h"
+#include "cluster.h"
+#include "ocrfeatures.h"
+#include "callcpp.h"
+#include "picofeat.h"
+
+
+#define WORST_MATCH_ALLOWED	(0.9)
+#define WORST_EVIDENCE (1.0)
+#define MAX_LENGTH_MISMATCH	(2.0 * GetPicoFeatureLength ())
+
+
+#define PROTO_SUFFIX		".mf.p"
+#define CONFIG_SUFFIX		".cl"
+#define NO_PROTO	(-1)
+#define XPOSITION			0
+#define YPOSITION			1
+#define MFLENGTH			2
+#define ORIENTATION			3
+
+typedef struct
+{
+  FLOAT32	MinX, MaxX, MinY, MaxY;
+} FRECT;
+
+/**----------------------------------------------------------------------------
+					Public Macros
+----------------------------------------------------------------------------**/
+#define CenterX(M)		( (M)[XPOSITION] )
+#define CenterY(M)		( (M)[YPOSITION] )
+#define LengthOf(M)		( (M)[MFLENGTH] )
+#define OrientationOf(M)	( (M)[ORIENTATION] )
+
+/**----------------------------------------------------------------------------
+					Public Function Prototypes
+----------------------------------------------------------------------------**/
+FLOAT32 CompareProtos (
+     PROTO	p1,
+	 PROTO	p2);
+
+void ComputeMergedProto (
+     PROTO	p1,
+	 PROTO	p2,
+     FLOAT32	w1,
+	 FLOAT32	w2,
+     PROTO	MergedProto);
+
+int FindClosestExistingProto (
+     CLASS_TYPE	Class,
+     int       	NumMerged[],
+     PROTOTYPE	*Prototype);
+
+void MakeNewFromOld (
+     PROTO	New,
+     PROTOTYPE	*Old);
+
+FLOAT32 SubfeatureEvidence (
+   FEATURE     Feature,
+   PROTO       Proto);
+
+double EvidenceOf (
+  register double   Similarity);
+
+BOOL8 DummyFastMatch (
+     FEATURE	Feature,
+     PROTO	Proto);
+
+void ComputePaddedBoundingBox (
+     PROTO	Proto,
+     FLOAT32	TangentPad,
+	 FLOAT32	OrthogonalPad,
+     FRECT	*BoundingBox);
+
+BOOL8 PointInside (
+     FRECT	*Rectangle,
+     FLOAT32	X,
+	 FLOAT32	Y);
+
+#endif  // TESSERACT_TRAINING_MERGENF_H_
--- a/training/mftraining.cpp
+++ b/training/mftraining.cpp
--- a/training/normstrngs.cpp
+++ b/training/normstrngs.cpp
--- a/training/normstrngs.h
+++ b/training/normstrngs.h
--- a/training/pango_font_info.cpp
+++ b/training/pango_font_info.cpp
--- a/training/pango_font_info.h
+++ b/training/pango_font_info.h
--- a/training/set_unicharset_properties.cpp
+++ b/training/set_unicharset_properties.cpp
--- a/training/shapeclustering.cpp
+++ b/training/shapeclustering.cpp
--- a/training/stringrenderer.cpp
+++ b/training/stringrenderer.cpp
--- a/training/stringrenderer.h
+++ b/training/stringrenderer.h
--- a/training/tessopt.cpp
+++ b/training/tessopt.cpp
--- a/training/tessopt.h
+++ b/training/tessopt.h
--- a/training/tesstrain.sh
+++ b/training/tesstrain.sh
--- a/training/tesstrain_utils.sh
+++ b/training/tesstrain_utils.sh
--- a/training/text2image.cpp
+++ b/training/text2image.cpp
--- a/training/tlog.cpp
+++ b/training/tlog.cpp
-/**********************************************************************
- * File:        tlog.cpp
- * Description: Variant of printf with logging level controllable by a
- *              commandline flag.
- * Author:      Ranjith Unnikrishnan
- * Created:     Wed Nov 20 2013
- *
- * (C) Copyright 2013, Google Inc.
- ** Licensed under the Apache License, Version 2.0 (the "License");
- ** you may not use this file except in compliance with the License.
- ** You may obtain a copy of the License at
- ** http://www.apache.org/licenses/LICENSE-2.0
- ** Unless required by applicable law or agreed to in writing, software
- ** distributed under the License is distributed on an "AS IS" BASIS,
- ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- ** See the License for the specific language governing permissions and
- ** limitations under the License.
- *
- **********************************************************************/
-
-#include "tlog.h"
-
-INT_PARAM_FLAG(tlog_level, 0, "Minimum logging level for tlog() output");
+/**********************************************************************
+ * File:        tlog.cpp
+ * Description: Variant of printf with logging level controllable by a
+ *              commandline flag.
+ * Author:      Ranjith Unnikrishnan
+ * Created:     Wed Nov 20 2013
+ *
+ * (C) Copyright 2013, Google Inc.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+#include "tlog.h"
+
+INT_PARAM_FLAG(tlog_level, 0, "Minimum logging level for tlog() output");
--- a/training/tlog.h
+++ b/training/tlog.h
-/**********************************************************************
- * File:        tlog.h
- * Description: Variant of printf with logging level controllable by a
- *              commandline flag.
- * Author:      Ranjith Unnikrishnan
- * Created:     Wed Nov 20 2013
- *
- * (C) Copyright 2013, Google Inc.
- ** Licensed under the Apache License, Version 2.0 (the "License");
- ** you may not use this file except in compliance with the License.
- ** You may obtain a copy of the License at
- ** http://www.apache.org/licenses/LICENSE-2.0
- ** Unless required by applicable law or agreed to in writing, software
- ** distributed under the License is distributed on an "AS IS" BASIS,
- ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- ** See the License for the specific language governing permissions and
- ** limitations under the License.
- *
- **********************************************************************/
-#ifndef TESSERACT_TRAINING_TLOG_H_
-#define TESSERACT_TRAINING_TLOG_H_
-
-#include "commandlineflags.h"
-#include "errcode.h"
-#include "tprintf.h"
-
-DECLARE_INT_PARAM_FLAG(tlog_level);
-
-// Variant guarded by the numeric logging level parameter FLAGS_tlog_level
-// (default 0).  Code using ParseCommandLineFlags() can control its value using
-// the --tlog_level commandline argument. Otherwise it must be specified in a
-// config file like other params.
-#define tlog(level, ...) {                        \
-  if (FLAGS_tlog_level >= level) {                \
-    tprintf_internal(__VA_ARGS__);                \
-  }                                               \
-}
-
-#define TLOG_IS_ON(level) (FLAGS_tlog_level >= level)
-
-#endif  // TESSERACT_TRAINING_TLOG_H_
+/**********************************************************************
+ * File:        tlog.h
+ * Description: Variant of printf with logging level controllable by a
+ *              commandline flag.
+ * Author:      Ranjith Unnikrishnan
+ * Created:     Wed Nov 20 2013
+ *
+ * (C) Copyright 2013, Google Inc.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+#ifndef TESSERACT_TRAINING_TLOG_H_
+#define TESSERACT_TRAINING_TLOG_H_
+
+#include "commandlineflags.h"
+#include "errcode.h"
+#include "tprintf.h"
+
+DECLARE_INT_PARAM_FLAG(tlog_level);
+
+// Variant guarded by the numeric logging level parameter FLAGS_tlog_level
+// (default 0).  Code using ParseCommandLineFlags() can control its value using
+// the --tlog_level commandline argument. Otherwise it must be specified in a
+// config file like other params.
+#define tlog(level, ...) {                        \
+  if (FLAGS_tlog_level >= level) {                \
+    tprintf_internal(__VA_ARGS__);                \
+  }                                               \
+}
+
+#define TLOG_IS_ON(level) (FLAGS_tlog_level >= level)
+
+#endif  // TESSERACT_TRAINING_TLOG_H_
--- a/training/unicharset_extractor.cpp
+++ b/training/unicharset_extractor.cpp
--- a/training/unicharset_training_utils.cpp
+++ b/training/unicharset_training_utils.cpp
--- a/training/unicharset_training_utils.h
+++ b/training/unicharset_training_utils.h
--- a/training/util.h
+++ b/training/util.h
--- a/training/validate_grapheme.cpp
+++ b/training/validate_grapheme.cpp
--- a/training/validate_grapheme.h
+++ b/training/validate_grapheme.h
-#ifndef TESSERACT_TRAINING_VALIDATE_GRAPHEME_H_
-#define TESSERACT_TRAINING_VALIDATE_GRAPHEME_H_
-
-#include "validator.h"
-
-namespace tesseract {
-
-// Subclass of Validator that validates and segments generic unicode into
-// grapheme clusters, including Latin with diacritics.
-class ValidateGrapheme : public Validator {
- public:
-  ValidateGrapheme(ViramaScript script, bool report_errors)
-      : Validator(script, report_errors) {}
-  ~ValidateGrapheme() {}
-
- protected:
-  // Consumes the next Grapheme in codes_[codes_used_++...] and copies it to
-  // parts_ and output_. Returns true if a valid Grapheme was consumed,
-  // otherwise does not increment codes_used_.
-  bool ConsumeGraphemeIfValid() override;
-  // Returns the CharClass corresponding to the given Unicode ch.
-  CharClass UnicodeToCharClass(char32 ch) const override;
-
- private:
-  // Helper returns true if the sequence prev_ch,ch is invalid.
-  bool IsBadlyFormed(char32 prev_ch, char32 ch);
-  // Helper returns true if the sequence prev_ch,ch is an invalid Indic vowel.
-  static bool IsBadlyFormedIndicVowel(char32 prev_ch, char32 ch);
-  // Helper returns true if the sequence prev_ch,ch is invalid Thai.
-  static bool IsBadlyFormedThai(char32 prev_ch, char32 ch);
-};
-
-}  // namespace tesseract
-
-#endif  // TESSERACT_TRAINING_VALIDATE_GRAPHEME_H_
+#ifndef TESSERACT_TRAINING_VALIDATE_GRAPHEME_H_
+#define TESSERACT_TRAINING_VALIDATE_GRAPHEME_H_
+
+#include "validator.h"
+
+namespace tesseract {
+
+// Validator specialization for generic unicode: validates the text and
+// segments it into grapheme clusters, including Latin with diacritics.
+class ValidateGrapheme : public Validator {
+ public:
+  // Forwards the script and the error-reporting switch to the Validator base.
+  ValidateGrapheme(ViramaScript script, bool report_errors)
+      : Validator(script, report_errors) {}
+  ~ValidateGrapheme() = default;
+
+ protected:
+  // Maps the given Unicode ch to its CharClass.
+  CharClass UnicodeToCharClass(char32 ch) const override;
+  // Takes the next Grapheme from codes_[codes_used_++...], copying it to
+  // parts_ and output_. The result is true if a valid Grapheme was consumed;
+  // if not, codes_used_ is not incremented.
+  bool ConsumeGraphemeIfValid() override;
+
+ private:
+  // Helper: true if the sequence prev_ch,ch is invalid Thai.
+  static bool IsBadlyFormedThai(char32 prev_ch, char32 ch);
+  // Helper: true if the sequence prev_ch,ch is an invalid Indic vowel.
+  static bool IsBadlyFormedIndicVowel(char32 prev_ch, char32 ch);
+  // Helper: true if the sequence prev_ch,ch is invalid.
+  bool IsBadlyFormed(char32 prev_ch, char32 ch);
+};
+
+}  // namespace tesseract
+
+#endif  // TESSERACT_TRAINING_VALIDATE_GRAPHEME_H_
--- a/training/validate_indic.cpp
+++ b/training/validate_indic.cpp
--- a/training/validate_indic.h
+++ b/training/validate_indic.h
-#ifndef TESSERACT_TRAINING_VALIDATE_INDIC_H_
-#define TESSERACT_TRAINING_VALIDATE_INDIC_H_
-
-#include "validator.h"
-
-namespace tesseract {
-
-// Subclass of Validator that validates and segments Indic scripts in the
-// unicode range 0x900-0xdff (Devanagari-Sinhala).
-class ValidateIndic : public Validator {
- public:
-  ValidateIndic(ViramaScript script, bool report_errors)
-      : Validator(script, report_errors) {}
-  ~ValidateIndic() {}
-
- protected:
-  // Returns whether codes matches the pattern for an Indic Grapheme.
-  // Consumes the next Grapheme in codes_[codes_used_++...] and copies it to
-  // parts_ and output_. Returns true if a valid Grapheme was consumed,
-  // otherwise does not increment codes_used_.
-  bool ConsumeGraphemeIfValid() override;
-  // Returns the CharClass corresponding to the given Unicode ch.
-  Validator::CharClass UnicodeToCharClass(char32 ch) const override;
-
- private:
-  // Helper consumes/copies a virama and any associated post-virama joiners.
-  bool ConsumeViramaIfValid(IndicPair joiner, bool post_matra);
-  // Helper consumes/copies a series of consonants separated by viramas while
-  // valid, but not any vowel or other modifiers.
-  bool ConsumeConsonantHeadIfValid();
-  // Helper consumes/copies a tail part of a consonant, comprising optional
-  // matra/piece, vowel modifier, vedic mark, terminating virama.
-  bool ConsumeConsonantTailIfValid();
-  // Helper consumes/copies a vowel and optional modifiers.
-  bool ConsumeVowelIfValid();
-
-  // Some special unicodes used only for Indic processing.
-  static const char32 kYayana = 0xdba;  // Sinhala Ya
-  static const char32 kRayana = 0xdbb;  // Sinhala Ra
-};
-
-}  // namespace tesseract
-
-#endif  // TESSERACT_TRAINING_VALIDATE_INDIC_H_
+#ifndef TESSERACT_TRAINING_VALIDATE_INDIC_H_
+#define TESSERACT_TRAINING_VALIDATE_INDIC_H_
+
+#include "validator.h"
+
+namespace tesseract {
+
+// Validator specialization for the Indic scripts in the unicode range
+// 0x900-0xdff (Devanagari-Sinhala): validates the text and segments it.
+class ValidateIndic : public Validator {
+ public:
+  // Forwards the script and the error-reporting switch to the Validator base.
+  ValidateIndic(ViramaScript script, bool report_errors)
+      : Validator(script, report_errors) {}
+  ~ValidateIndic() = default;
+
+ protected:
+  // Maps the given Unicode ch to its CharClass.
+  CharClass UnicodeToCharClass(char32 ch) const override;
+  // Checks the codes against the pattern for an Indic Grapheme.
+  // Takes the next Grapheme from codes_[codes_used_++...], copying it to
+  // parts_ and output_. The result is true if a valid Grapheme was consumed;
+  // if not, codes_used_ is not incremented.
+  bool ConsumeGraphemeIfValid() override;
+
+ private:
+  // Special unicodes that are needed only by the Indic processing.
+  static const char32 kYayana = 0xdba;  // Sinhala Ya
+  static const char32 kRayana = 0xdbb;  // Sinhala Ra
+
+  // Helper: consumes/copies a vowel together with its optional modifiers.
+  bool ConsumeVowelIfValid();
+  // Helper: consumes/copies the tail part of a consonant, i.e. optional
+  // matra/piece, vowel modifier, vedic mark, terminating virama.
+  bool ConsumeConsonantTailIfValid();
+  // Helper: consumes/copies consonants separated by viramas for as long as
+  // they are valid, leaving any vowel or other modifiers alone.
+  bool ConsumeConsonantHeadIfValid();
+  // Helper: consumes/copies a virama plus any associated post-virama joiners.
+  bool ConsumeViramaIfValid(IndicPair joiner, bool post_matra);
+};
+
+}  // namespace tesseract
+
+#endif  // TESSERACT_TRAINING_VALIDATE_INDIC_H_
--- a/training/validate_khmer.cpp
+++ b/training/validate_khmer.cpp
-#include "validate_khmer.h"
-#include "errcode.h"
-#include "tprintf.h"
-
-namespace tesseract {
-
-// Returns whether codes matches the pattern for a Khmer Grapheme.
-// Taken from unicode standard:
-// http://www.unicode.org/versions/Unicode9.0.0/ch16.pdf.
-// where it gives: B {R | C} {S {R}}* {{Z} V} {O} {S}, using different notation
-// to the ISCII standard http://varamozhi.sourceforge.net/iscii91.pdf.
-// Translated to the codes used by the CharClass enum:
-// C {R | N} {HC {R}}* {{Z|z} M{P}} {D} {HC}
-// Where R is a new symbol (Robat) and N is repurposed as a consonant shifter.
-// Also the Consonant class here includes independent vowels, as they are
-// treated the same anyway.
-// In the split grapheme mode, the only characters that get grouped are the
-// HC and the {Z|z}M The unicode chapter on Khmer only mentions the joiners in
-// the BNF syntax, so who knows what they do.
-bool ValidateKhmer::ConsumeGraphemeIfValid() {
-  int num_codes = codes_.size();
-  if (codes_used_ == num_codes) return false;
-  if (codes_[codes_used_].first == CharClass::kOther) {
-    UseMultiCode(1);
-    return true;
-  }
-  if (codes_[codes_used_].first != CharClass::kConsonant) {
-    if (report_errors_) {
-      tprintf("Invalid start of Khmer syllable:0x%x\n",
-              codes_[codes_used_].second);
-    }
-    return false;
-  }
-  if (UseMultiCode(1)) return true;
-  if (codes_[codes_used_].first == CharClass::kRobat ||
-      codes_[codes_used_].first == CharClass::kNukta) {
-    if (UseMultiCode(1)) return true;
-  }
-  while (codes_used_ + 1 < num_codes &&
-         codes_[codes_used_].first == CharClass::kVirama &&
-         codes_[codes_used_ + 1].first == CharClass::kConsonant) {
-    ASSERT_HOST(!CodeOnlyToOutput());
-    if (UseMultiCode(2)) return true;
-    if (codes_[codes_used_].first == CharClass::kRobat) {
-      if (UseMultiCode(1)) return true;
-    }
-  }
-  int num_matra_parts = 0;
-  if (codes_[codes_used_].second == kZeroWidthJoiner ||
-      codes_[codes_used_].second == kZeroWidthNonJoiner) {
-    if (CodeOnlyToOutput()) {
-      if (report_errors_) {
-        tprintf("Unterminated joiner: 0x%x\n", output_.back());
-      }
-      return false;
-    }
-    ++num_matra_parts;
-  }
-  // Not quite as shown by the BNF, the matra piece is allowed as a matra on its
-  // own or as an addition to other matras.
-  if (codes_[codes_used_].first == CharClass::kMatra ||
-      codes_[codes_used_].first == CharClass::kMatraPiece) {
-    ++num_matra_parts;
-    if (UseMultiCode(num_matra_parts)) return true;
-  } else if (num_matra_parts) {
-    if (report_errors_) {
-      tprintf("Joiner with non-dependent vowel after it!:0x%x 0x%x\n",
-              output_.back(), codes_[codes_used_].second);
-    }
-    return false;
-  }
-  if (codes_[codes_used_].first == CharClass::kMatraPiece &&
-      codes_[codes_used_ - 1].first != CharClass::kMatraPiece) {
-    if (UseMultiCode(1)) return true;
-  }
-  if (codes_[codes_used_].first == CharClass::kVowelModifier) {
-    if (UseMultiCode(1)) return true;
-  }
-  if (codes_used_ + 1 < num_codes &&
-      codes_[codes_used_].first == CharClass::kVirama &&
-      codes_[codes_used_ + 1].first == CharClass::kConsonant) {
-    ASSERT_HOST(!CodeOnlyToOutput());
-    if (UseMultiCode(2)) return true;
-  }
-  return true;
-}
-
-Validator::CharClass ValidateKhmer::UnicodeToCharClass(char32 ch) const {
-  if (IsVedicAccent(ch)) return CharClass::kVedicMark;
-  if (ch == kZeroWidthNonJoiner) return CharClass::kZeroWidthNonJoiner;
-  if (ch == kZeroWidthJoiner) return CharClass::kZeroWidthJoiner;
-  // Offset from the start of the relevant unicode code block aka code page.
-  int off = ch - static_cast<char32>(script_);
-  // Anything in another code block is other.
-  if (off < 0 || off >= kIndicCodePageSize) return CharClass::kOther;
-  if (off <= 0x33) return CharClass::kConsonant;
-  if (off <= 0x45) return CharClass::kMatra;
-  if (off == 0x46) return CharClass::kMatraPiece;
-  if (off == 0x4c) return CharClass::kRobat;
-  if (off == 0x49 || off == 0x4a) return CharClass::kNukta;
-  if (off <= 0x51) return CharClass::kVowelModifier;
-  if (off == 0x52) return CharClass::kVirama;
-  return CharClass::kOther;
-}
-
-}  // namespace tesseract
+#include "validate_khmer.h"
+#include "errcode.h"
+#include "tprintf.h"
+
+namespace tesseract {
+
+// Returns whether codes matches the pattern for a Khmer Grapheme.
+// Taken from unicode standard:
+// http://www.unicode.org/versions/Unicode9.0.0/ch16.pdf.
+// where it gives: B {R | C} {S {R}}* {{Z} V} {O} {S}, using different notation
+// to the ISCII standard http://varamozhi.sourceforge.net/iscii91.pdf.
+// Translated to the codes used by the CharClass enum:
+// C {R | N} {HC {R}}* {{Z|z} M{P}} {D} {HC}
+// Where R is a new symbol (Robat) and N is repurposed as a consonant shifter.
+// Also the Consonant class here includes independent vowels, as they are
+// treated the same anyway.
+// In the split grapheme mode, the only characters that get grouped are the
+// HC and the {Z|z}M. The unicode chapter on Khmer only mentions the joiners in
+// the BNF syntax, so who knows what they do.
+// Returns false if no codes remain, if the sequence starts with a code that is
+// neither kOther nor a consonant, or if a joiner is not followed by a matra
+// or matra piece. The last two cases print a diagnostic when report_errors_
+// is set. Returns true in all other cases.
+bool ValidateKhmer::ConsumeGraphemeIfValid() {
+  int num_codes = codes_.size();
+  // Nothing left to consume.
+  if (codes_used_ == num_codes) return false;
+  // A code classed as kOther is consumed on its own.
+  if (codes_[codes_used_].first == CharClass::kOther) {
+    UseMultiCode(1);
+    return true;
+  }
+  // Anything else must start with a consonant (C in the BNF above).
+  if (codes_[codes_used_].first != CharClass::kConsonant) {
+    if (report_errors_) {
+      tprintf("Invalid start of Khmer syllable:0x%x\n",
+              codes_[codes_used_].second);
+    }
+    return false;
+  }
+  // Consume the base consonant. Here and below a true result from
+  // UseMultiCode() ends the grapheme at once; presumably it signals that the
+  // end of the input was reached -- TODO confirm against validator.h.
+  if (UseMultiCode(1)) return true;
+  // Optional robat or consonant shifter: {R | N}.
+  if (codes_[codes_used_].first == CharClass::kRobat ||
+      codes_[codes_used_].first == CharClass::kNukta) {
+    if (UseMultiCode(1)) return true;
+  }
+  // Any number of virama + consonant pairs, each optionally followed by a
+  // robat: {HC {R}}*.
+  while (codes_used_ + 1 < num_codes &&
+         codes_[codes_used_].first == CharClass::kVirama &&
+         codes_[codes_used_ + 1].first == CharClass::kConsonant) {
+    // Handles the virama first; the call is asserted to return false.
+    ASSERT_HOST(!CodeOnlyToOutput());
+    if (UseMultiCode(2)) return true;
+    if (codes_[codes_used_].first == CharClass::kRobat) {
+      if (UseMultiCode(1)) return true;
+    }
+  }
+  // Number of codes (optional joiner plus matra) passed to UseMultiCode()
+  // for the matra part: {{Z|z} M{P}}.
+  int num_matra_parts = 0;
+  if (codes_[codes_used_].second == kZeroWidthJoiner ||
+      codes_[codes_used_].second == kZeroWidthNonJoiner) {
+    // A true result here is treated as an error: the joiner is unterminated.
+    if (CodeOnlyToOutput()) {
+      if (report_errors_) {
+        tprintf("Unterminated joiner: 0x%x\n", output_.back());
+      }
+      return false;
+    }
+    ++num_matra_parts;
+  }
+  // Not quite as shown by the BNF, the matra piece is allowed as a matra on its
+  // own or as an addition to other matras.
+  if (codes_[codes_used_].first == CharClass::kMatra ||
+      codes_[codes_used_].first == CharClass::kMatraPiece) {
+    ++num_matra_parts;
+    if (UseMultiCode(num_matra_parts)) return true;
+  } else if (num_matra_parts) {
+    // A joiner was consumed above but no matra or matra piece follows it.
+    if (report_errors_) {
+      tprintf("Joiner with non-dependent vowel after it!:0x%x 0x%x\n",
+              output_.back(), codes_[codes_used_].second);
+    }
+    return false;
+  }
+  // An additional matra piece is consumed only when the code just before it
+  // was not itself a matra piece.
+  if (codes_[codes_used_].first == CharClass::kMatraPiece &&
+      codes_[codes_used_ - 1].first != CharClass::kMatraPiece) {
+    if (UseMultiCode(1)) return true;
+  }
+  // Optional vowel modifier: {D}.
+  if (codes_[codes_used_].first == CharClass::kVowelModifier) {
+    if (UseMultiCode(1)) return true;
+  }
+  // Optional trailing virama + consonant pair: {HC}.
+  if (codes_used_ + 1 < num_codes &&
+      codes_[codes_used_].first == CharClass::kVirama &&
+      codes_[codes_used_ + 1].first == CharClass::kConsonant) {
+    ASSERT_HOST(!CodeOnlyToOutput());
+    if (UseMultiCode(2)) return true;
+  }
+  return true;
+}
+
+// Classifies the Unicode ch for Khmer validation. Vedic accents and the two
+// zero-width joiners are recognized first; every other code is classified by
+// its offset from script_, and codes outside the code page are kOther.
+Validator::CharClass ValidateKhmer::UnicodeToCharClass(char32 ch) const {
+  if (IsVedicAccent(ch)) return CharClass::kVedicMark;
+  if (ch == kZeroWidthNonJoiner) return CharClass::kZeroWidthNonJoiner;
+  if (ch == kZeroWidthJoiner) return CharClass::kZeroWidthJoiner;
+  // Position of ch relative to the start of the unicode code block (aka code
+  // page) of the script.
+  const int offset = ch - static_cast<char32>(script_);
+  const bool in_code_page = 0 <= offset && offset < kIndicCodePageSize;
+  // Codes that belong to another code block are other.
+  if (!in_code_page) return CharClass::kOther;
+  if (offset <= 0x33) return CharClass::kConsonant;
+  if (offset <= 0x45) return CharClass::kMatra;
+  // Individually assigned offsets take priority over the vowel modifier range.
+  switch (offset) {
+    case 0x46:
+      return CharClass::kMatraPiece;
+    case 0x49:
+    case 0x4a:
+      return CharClass::kNukta;
+    case 0x4c:
+      return CharClass::kRobat;
+    case 0x52:
+      return CharClass::kVirama;
+    default:
+      break;
+  }
+  // The rest, up to and including 0x51, are vowel modifiers.
+  return offset <= 0x51 ? CharClass::kVowelModifier : CharClass::kOther;
+}
+
+}  // namespace tesseract
--- a/training/validate_khmer.h
+++ b/training/validate_khmer.h
-#ifndef TESSERACT_TRAINING_VALIDATE_KHMER_H_
-#define TESSERACT_TRAINING_VALIDATE_KHMER_H_
-
-#include "validator.h"
-
-namespace tesseract {
-
-// Subclass of Validator that validates and segments Khmer.
-class ValidateKhmer : public Validator {
- public:
-  ValidateKhmer(ViramaScript script, bool report_errors)
-      : Validator(script, report_errors) {}
-  ~ValidateKhmer() {}
-
- protected:
-  // Returns whether codes matches the pattern for an Khmer Grapheme.
-  // Consumes the next Grapheme in codes_[codes_used_++...] and copies it to
-  // parts_ and output_. Returns true if a valid Grapheme was consumed,
-  // otherwise does not increment codes_used_.
-  bool ConsumeGraphemeIfValid() override;
-  // Returns the CharClass corresponding to the given Unicode ch.
-  CharClass UnicodeToCharClass(char32 ch) const override;
-};
-
-}  // namespace tesseract
-
-#endif  // TESSERACT_TRAINING_VALIDATE_KHMER_H_
+#ifndef TESSERACT_TRAINING_VALIDATE_KHMER_H_
+#define TESSERACT_TRAINING_VALIDATE_KHMER_H_
+
+#include "validator.h"
+
+namespace tesseract {
+
+// Validator specialization for Khmer: validates the text and segments it.
+class ValidateKhmer : public Validator {
+ public:
+  // Forwards the script and the error-reporting switch to the Validator base.
+  ValidateKhmer(ViramaScript script, bool report_errors)
+      : Validator(script, report_errors) {}
+  ~ValidateKhmer() = default;
+
+ protected:
+  // Maps the given Unicode ch to its CharClass.
+  CharClass UnicodeToCharClass(char32 ch) const override;
+  // Checks the codes against the pattern for a Khmer Grapheme.
+  // Takes the next Grapheme from codes_[codes_used_++...], copying it to
+  // parts_ and output_. The result is true if a valid Grapheme was consumed;
+  // if not, codes_used_ is not incremented.
+  bool ConsumeGraphemeIfValid() override;
+};
+
+}  // namespace tesseract
+
+#endif  // TESSERACT_TRAINING_VALIDATE_KHMER_H_
--- a/training/validate_myanmar.cpp
+++ b/training/validate_myanmar.cpp
-#include "validate_myanmar.h"
-#include "errcode.h"
-#include "icuerrorcode.h"
-#include "tprintf.h"
-#include "unicode/uchar.h"    // From libicu
-#include "unicode/uscript.h"  // From libicu
-
-namespace tesseract {
-
-// Returns whether codes matches the pattern for a Myanmar Grapheme.
-// Taken directly from the unicode table 16-3.
-// See http://www.unicode.org/versions/Unicode9.0.0/ch16.pdf
-bool ValidateMyanmar::ConsumeGraphemeIfValid() {
-  int num_codes = codes_.size();
-  if (codes_used_ == num_codes) return true;
-  // Other.
-  if (IsMyanmarOther(codes_[codes_used_].second)) {
-    UseMultiCode(1);
-    return true;
-  }
-  // Kinzi.
-  if (codes_used_ + 2 < num_codes && codes_[codes_used_].second == 0x1004 &&
-      codes_[codes_used_ + 1].second == kMyanmarAsat &&
-      codes_[codes_used_ + 2].second == kMyanmarVirama) {
-    ASSERT_HOST(!CodeOnlyToOutput());
-    ASSERT_HOST(!CodeOnlyToOutput());
-    if (UseMultiCode(3)) return true;
-  }
-  // Base consonant/vowel. NOTE that since everything in Myanmar appears to be
-  // optional, except the base, this is the only place where invalid input can
-  // be detected and false returned.
-  if (IsMyanmarLetter(codes_[codes_used_].second)) {
-    if (UseMultiCode(1)) return true;
-  } else {
-    if (report_errors_) {
-      tprintf("Invalid start of Myanmar syllable:0x%x\n",
-              codes_[codes_used_].second);
-    }
-    return false;  // One of these is required.
-  }
-  if (ConsumeSubscriptIfPresent()) return true;
-  ConsumeOptionalSignsIfPresent();
-  // What we have consumed so far is a valid syllable.
-  return true;
-}
-
-// TODO(rays) Doesn't use intermediate coding like the other scripts, as there
-// is little correspondence between the content of table 16-3 and the char
-// classes of the Indic languages. (Experts may disagree and improve!)
-// In unicode table 16-3 there is basically a long list of optional characters,
-// which can be coded quite easily.
-// Unfortunately, table 16-3 doesn't include even half the Myanmar unicodes!!
-// The table also allows sequences that still result in dotted circles!!
-// So with a lot of guesswork the rest have been added in a reasonable place.
-Validator::CharClass ValidateMyanmar::UnicodeToCharClass(char32 ch) const {
-  if (IsMyanmarLetter(ch)) return CharClass::kConsonant;
-  return CharClass::kOther;
-}
-
-// Helper consumes/copies a virama and any subscript consonant.
-// Returns true if the end of input is reached.
-bool ValidateMyanmar::ConsumeSubscriptIfPresent() {
-  // Subscript consonant. It appears there can be only one.
-  int num_codes = codes_.size();
-  if (codes_used_ + 1 < num_codes &&
-      codes_[codes_used_].second == kMyanmarVirama) {
-    if (IsMyanmarLetter(codes_[codes_used_ + 1].second)) {
-      ASSERT_HOST(!CodeOnlyToOutput());
-      if (UseMultiCode(2)) return true;
-    }
-  }
-  return false;
-}
-
-// Helper consumes/copies a series of optional signs.
-// Returns true if the end of input is reached.
-bool ValidateMyanmar::ConsumeOptionalSignsIfPresent() {
-  // The following characters are allowed, all optional, and in sequence.
-  // An exception is kMyanmarMedialYa, which can include kMyanmarAsat.
-  const std::vector<char32> kMedials({kMyanmarAsat, kMyanmarMedialYa, 0x103c,
-                                      0x103d, 0x103e, 0x105e, 0x105f, 0x1060,
-                                      0x1081, 0x1031});
-  for (char32 ch : kMedials) {
-    if (codes_[codes_used_].second == ch) {
-      if (UseMultiCode(1)) return true;
-      if (ch == kMyanmarMedialYa &&
-          codes_[codes_used_].second == kMyanmarAsat) {
-        if (UseMultiCode(1)) return true;
-      }
-    }
-  }
-  // Vowel sign i, ii, ai.
-  char32 ch = codes_[codes_used_].second;
-  if (ch == 0x102d || ch == 0x102e || ch == 0x1032) {
-    if (UseMultiCode(1)) return true;
-  }
-  // Vowel sign u, uu, and extensions.
-  ch = codes_[codes_used_].second;
-  if (ch == 0x102f || ch == 0x1030 || (0x1056 <= ch && ch <= 0x1059) ||
-      ch == 0x1062 || ch == 0x1067 || ch == 0x1068 ||
-      (0x1071 <= ch && ch <= 0x1074) || (0x1083 <= ch && ch <= 0x1086) ||
-      ch == 0x109c || ch == 0x109d) {
-    if (UseMultiCode(1)) return true;
-  }
-  // Tall aa, aa with optional asat.
-  if (codes_[codes_used_].second == 0x102b ||
-      codes_[codes_used_].second == 0x102c) {
-    if (UseMultiCode(1)) return true;
-    if (codes_[codes_used_].second == kMyanmarAsat) {
-      if (UseMultiCode(1)) return true;
-    }
-  }
-  // The following characters are allowed, all optional, and in sequence.
-  const std::vector<char32> kSigns({0x1036, 0x1037});
-  for (char32 ch : kSigns) {
-    if (codes_[codes_used_].second == ch) {
-      if (UseMultiCode(1)) return true;
-    }
-  }
-  // Tone mark extensions.
-  ch = codes_[codes_used_].second;
-  if (ch == 0x1038 || ch == kMyanmarAsat || ch == 0x1063 || ch == 0x1064 ||
-      (0x1069 <= ch && ch <= 0x106d) || (0x1087 <= ch && ch <= 0x108d) ||
-      ch == 0x108f || ch == 0x109a || ch == 0x109b ||
-      (0xaa7b <= ch && ch <= 0xaa7d)) {
-    if (UseMultiCode(1)) return true;
-  }
-  return false;
-}
-
-// Returns true if the unicode is a Myanmar "letter" including consonants
-// and independent vowels. Although table 16-3 distinguishes between some
-// base consonants and vowels, the extensions make no such distinction, so we
-// put them all into a single bucket.
-/* static */
-bool ValidateMyanmar::IsMyanmarLetter(char32 ch) {
-  return (0x1000 <= ch && ch <= 0x102a) || ch == 0x103f ||
-         (0x1050 <= ch && ch <= 0x1055) || (0x105a <= ch && ch <= 0x105d) ||
-         ch == 0x1061 || ch == 0x1065 || ch == 0x1066 ||
-         (0x106e <= ch && ch <= 0x1070) || (0x1075 <= ch && ch <= 0x1080) ||
-         ch == 0x108e || (0xa9e0 <= ch && ch <= 0xa9ef) ||
-         (0xa9fa <= ch && ch <= 0xa9ff) || (0xaa60 <= ch && ch <= 0xaa73) ||
-         ch == 0xaa7a || ch == 0xaa7e || ch == 0xaa7f;
-}
-
-// Returns true if ch is a Myanmar digit or other symbol that does not take
-// part in being a syllable.
-/* static */
-bool ValidateMyanmar::IsMyanmarOther(char32 ch) {
-  IcuErrorCode err;
-  UScriptCode script_code = uscript_getScript(ch, err);
-  if (script_code != USCRIPT_MYANMAR && ch != Validator::kZeroWidthJoiner &&
-      ch != Validator::kZeroWidthNonJoiner)
-    return true;
-  return (0x1040 <= ch && ch <= 0x1049) || (0x1090 <= ch && ch <= 0x1099) ||
-         (0x109c <= ch && ch <= 0x109d) || (0xa9f0 <= ch && ch <= 0xa9f9) ||
-         (0xaa74 <= ch && ch <= 0xaa79);
-}
-
-}  // namespace tesseract
+#include "validate_myanmar.h"
+#include "errcode.h"
+#include "icuerrorcode.h"
+#include "tprintf.h"
+#include "unicode/uchar.h"    // From libicu
+#include "unicode/uscript.h"  // From libicu
+
+namespace tesseract {
+
+// Tries to consume one Myanmar Grapheme starting at codes_[codes_used_].
+// The structure follows unicode table 16-3, see
+// http://www.unicode.org/versions/Unicode9.0.0/ch16.pdf
+// Returns false only when the code expected to be the base of the syllable is
+// not a Myanmar letter; returns true otherwise, including when no codes are
+// left to consume.
+bool ValidateMyanmar::ConsumeGraphemeIfValid() {
+  int total_codes = codes_.size();
+  // Nothing left to consume.
+  if (codes_used_ == total_codes) return true;
+  // Other: consumed on its own.
+  if (IsMyanmarOther(codes_[codes_used_].second)) {
+    UseMultiCode(1);
+    return true;
+  }
+  // Kinzi: 0x1004 followed by asat and virama, handled as a group of three.
+  const bool room_for_kinzi = codes_used_ + 2 < total_codes;
+  if (room_for_kinzi && codes_[codes_used_].second == 0x1004 &&
+      codes_[codes_used_ + 1].second == kMyanmarAsat &&
+      codes_[codes_used_ + 2].second == kMyanmarVirama) {
+    ASSERT_HOST(!CodeOnlyToOutput());
+    ASSERT_HOST(!CodeOnlyToOutput());
+    if (UseMultiCode(3)) return true;
+  }
+  // Base consonant/vowel. Everything else in Myanmar appears to be optional,
+  // so a missing base is the only invalid input that can be detected, and
+  // this is the only place that returns false.
+  if (!IsMyanmarLetter(codes_[codes_used_].second)) {
+    if (report_errors_) {
+      tprintf("Invalid start of Myanmar syllable:0x%x\n",
+              codes_[codes_used_].second);
+    }
+    return false;  // A base letter is required.
+  }
+  if (UseMultiCode(1)) return true;
+  // The optional signs are only looked at if the subscript helper did not
+  // already report the end of the input.
+  if (!ConsumeSubscriptIfPresent()) {
+    ConsumeOptionalSignsIfPresent();
+  }
+  // Whatever has been consumed so far is a valid syllable.
+  return true;
+}
+
+// TODO(rays) Doesn't use intermediate coding like the other scripts, as there
+// is little correspondence between the content of table 16-3 and the char
+// classes of the Indic languages. (Experts may disagree and improve!)
+// In unicode table 16-3 there is basically a long list of optional characters,
+// which can be coded quite easily.
+// Unfortunately, table 16-3 doesn't include even half the Myanmar unicodes!!
+// The table also allows sequences that still result in dotted circles!!
+// So with a lot of guesswork the rest have been added in a reasonable place.
+Validator::CharClass ValidateMyanmar::UnicodeToCharClass(char32 ch) const {
+  // Only two classes are produced for Myanmar: letters are kConsonant and
+  // every other code is kOther.
+  return IsMyanmarLetter(ch) ? CharClass::kConsonant : CharClass::kOther;
+}
+
+// Helper that consumes/copies a virama followed by a subscript consonant, if
+// that pair is next in the input.
+// Returns true if the end of input is reached.
+bool ValidateMyanmar::ConsumeSubscriptIfPresent() {
+  // Subscript consonant. It appears there can be only one.
+  int total_codes = codes_.size();
+  // The pair needs two codes to be left in the input.
+  if (!(codes_used_ + 1 < total_codes)) return false;
+  if (codes_[codes_used_].second != kMyanmarVirama) return false;
+  if (!IsMyanmarLetter(codes_[codes_used_ + 1].second)) return false;
+  ASSERT_HOST(!CodeOnlyToOutput());
+  if (UseMultiCode(2)) return true;
+  return false;
+}
+
+// Helper consumes/copies a series of optional signs: medials, vowel signs,
+// aa with optional asat, the signs 0x1036/0x1037 and tone marks, each group
+// tried once and in that order.
+// Returns true if the end of input is reached.
+bool ValidateMyanmar::ConsumeOptionalSignsIfPresent() {
+  // The following characters are allowed, all optional, and in sequence.
+  // An exception is kMyanmarMedialYa, which can include kMyanmarAsat.
+  // The table is a static array so that it is not rebuilt on the heap by
+  // every call.
+  static const char32 kMedials[] = {kMyanmarAsat, kMyanmarMedialYa, 0x103c,
+                                    0x103d,       0x103e,           0x105e,
+                                    0x105f,       0x1060,           0x1081,
+                                    0x1031};
+  for (char32 medial : kMedials) {
+    if (codes_[codes_used_].second == medial) {
+      if (UseMultiCode(1)) return true;
+      if (medial == kMyanmarMedialYa &&
+          codes_[codes_used_].second == kMyanmarAsat) {
+        if (UseMultiCode(1)) return true;
+      }
+    }
+  }
+  // Vowel sign i, ii, ai.
+  char32 ch = codes_[codes_used_].second;
+  if (ch == 0x102d || ch == 0x102e || ch == 0x1032) {
+    if (UseMultiCode(1)) return true;
+  }
+  // Vowel sign u, uu, and extensions.
+  ch = codes_[codes_used_].second;
+  if (ch == 0x102f || ch == 0x1030 || (0x1056 <= ch && ch <= 0x1059) ||
+      ch == 0x1062 || ch == 0x1067 || ch == 0x1068 ||
+      (0x1071 <= ch && ch <= 0x1074) || (0x1083 <= ch && ch <= 0x1086) ||
+      ch == 0x109c || ch == 0x109d) {
+    if (UseMultiCode(1)) return true;
+  }
+  // Tall aa, aa with optional asat.
+  if (codes_[codes_used_].second == 0x102b ||
+      codes_[codes_used_].second == 0x102c) {
+    if (UseMultiCode(1)) return true;
+    if (codes_[codes_used_].second == kMyanmarAsat) {
+      if (UseMultiCode(1)) return true;
+    }
+  }
+  // The following characters are allowed, all optional, and in sequence.
+  static const char32 kSigns[] = {0x1036, 0x1037};
+  for (char32 sign : kSigns) {
+    if (codes_[codes_used_].second == sign) {
+      if (UseMultiCode(1)) return true;
+    }
+  }
+  // Tone mark extensions.
+  ch = codes_[codes_used_].second;
+  if (ch == 0x1038 || ch == kMyanmarAsat || ch == 0x1063 || ch == 0x1064 ||
+      (0x1069 <= ch && ch <= 0x106d) || (0x1087 <= ch && ch <= 0x108d) ||
+      ch == 0x108f || ch == 0x109a || ch == 0x109b ||
+      (0xaa7b <= ch && ch <= 0xaa7d)) {
+    if (UseMultiCode(1)) return true;
+  }
+  return false;
+}
+
+// Tests whether the unicode is a Myanmar "letter", which covers consonants as
+// well as independent vowels. Table 16-3 separates some base consonants from
+// vowels, but the extensions do not, so a single bucket holds all of them.
+/* static */
+bool ValidateMyanmar::IsMyanmarLetter(char32 ch) {
+  // Inclusive code ranges; a single code has equal bounds.
+  struct LetterRange {
+    char32 first;
+    char32 last;
+  };
+  static const LetterRange kLetterRanges[] = {
+      {0x1000, 0x102a}, {0x103f, 0x103f}, {0x1050, 0x1055}, {0x105a, 0x105d},
+      {0x1061, 0x1061}, {0x1065, 0x1066}, {0x106e, 0x1070}, {0x1075, 0x1080},
+      {0x108e, 0x108e}, {0xa9e0, 0xa9ef}, {0xa9fa, 0xa9ff}, {0xaa60, 0xaa73},
+      {0xaa7a, 0xaa7a}, {0xaa7e, 0xaa7f}};
+  for (const LetterRange& range : kLetterRanges) {
+    if (range.first <= ch && ch <= range.last) return true;
+  }
+  return false;
+}
+
+// Tests whether ch is a Myanmar digit or another symbol that plays no part in
+// forming a syllable. Codes that ICU does not assign to the Myanmar script
+// also count as other, with the exception of the two zero-width joiners.
+/* static */
+bool ValidateMyanmar::IsMyanmarOther(char32 ch) {
+  IcuErrorCode err;
+  const UScriptCode script_code = uscript_getScript(ch, err);
+  const bool is_joiner = ch == Validator::kZeroWidthJoiner ||
+                         ch == Validator::kZeroWidthNonJoiner;
+  if (script_code != USCRIPT_MYANMAR && !is_joiner) return true;
+  // Code ranges that are checked explicitly.
+  if (0x1040 <= ch && ch <= 0x1049) return true;
+  if (0x1090 <= ch && ch <= 0x1099) return true;
+  if (0x109c <= ch && ch <= 0x109d) return true;
+  if (0xa9f0 <= ch && ch <= 0xa9f9) return true;
+  return 0xaa74 <= ch && ch <= 0xaa79;
+}
+
+}  // namespace tesseract
--- a/training/validate_myanmar.h
+++ b/training/validate_myanmar.h
-#ifndef TESSERACT_TRAINING_VALIDATE_MYANMAR_H_
-#define TESSERACT_TRAINING_VALIDATE_MYANMAR_H_
-
-#include "validator.h"
-
-namespace tesseract {
-
-// Subclass of Validator that validates and segments Myanmar.
-class ValidateMyanmar : public Validator {
- public:
-  ValidateMyanmar(ViramaScript script, bool report_errors)
-      : Validator(script, report_errors) {}
-  ~ValidateMyanmar() {}
-
- protected:
-  // Returns whether codes matches the pattern for a Myanmar Grapheme.
-  // Consumes the next Grapheme in codes_[codes_used_++...] and copies it to
-  // parts_ and output_. Returns true if a valid Grapheme was consumed,
-  // otherwise does not increment codes_used_.
-  bool ConsumeGraphemeIfValid() override;
-  // Returns the CharClass corresponding to the given Unicode ch.
-  Validator::CharClass UnicodeToCharClass(char32 ch) const override;
-
- private:
-  // Helper consumes/copies a virama and any subscript consonant.
-  // Returns true if the end of input is reached.
-  bool ConsumeSubscriptIfPresent();
-  // Helper consumes/copies a series of optional signs.
-  // Returns true if the end of input is reached.
-  bool ConsumeOptionalSignsIfPresent();
-  // Returns true if the unicode is a Myanmar "letter" including consonants
-  // and independent vowels. Although table 16-3 distinguishes between some
-  // base consonants and vowels, the extensions make no such distinction, so we
-  // put them all into a single bucket.
-  static bool IsMyanmarLetter(char32 ch);
-  // Returns true if ch is a Myanmar digit or other symbol that does not take
-  // part in being a syllable.
-  static bool IsMyanmarOther(char32 ch);
-
-  // Some special unicodes used only for Myanmar processing.
-  static const char32 kMyanmarAsat = 0x103a;
-  static const char32 kMyanmarMedialYa = 0x103b;
-};
-
-}  // namespace tesseract
-
-#endif  // TESSERACT_TRAINING_VALIDATE_MYANMAR_H_
+#ifndef TESSERACT_TRAINING_VALIDATE_MYANMAR_H_
+#define TESSERACT_TRAINING_VALIDATE_MYANMAR_H_
+
+#include "validator.h"
+
+namespace tesseract {
+
+// Subclass of Validator that validates and segments Myanmar.
+// It forwards its constructor arguments to Validator and overrides the two
+// virtual hooks declared in the protected section below.
+class ValidateMyanmar : public Validator {
+ public:
+  // Passes script and report_errors straight through to the Validator base;
+  // no additional state is initialized here.
+  ValidateMyanmar(ViramaScript script, bool report_errors)
+      : Validator(script, report_errors) {}
+  // Empty: this class declares no non-static data members of its own.
+  ~ValidateMyanmar() {}
+
+ protected:
+  // Returns whether codes_ matches the pattern for a Myanmar Grapheme.
+  // Consumes the next Grapheme in codes_[codes_used_++...] and copies it to
+  // parts_ and output_. Returns true if a valid Grapheme was consumed,
+  // otherwise does not increment codes_used_.
+  bool ConsumeGraphemeIfValid() override;
+  // Returns the CharClass corresponding to the given Unicode ch.
+  Validator::CharClass UnicodeToCharClass(char32 ch) const override;
+
+ private:
+  // Helper consumes/copies a virama and any subscript consonant.
+  // Returns true if the end of input is reached.
+  bool ConsumeSubscriptIfPresent();
+  // Helper consumes/copies a series of optional signs.
+  // Returns true if the end of input is reached.
+  bool ConsumeOptionalSignsIfPresent();
+  // Returns true if the unicode is a Myanmar "letter" including consonants
+  // and independent vowels. Although table 16-3 distinguishes between some
+  // base consonants and vowels, the extensions make no such distinction, so we
+  // put them all into a single bucket.
+  // Static: depends only on ch, not on any validator state.
+  static bool IsMyanmarLetter(char32 ch);
+  // Returns true if ch is a Myanmar digit or other symbol that does not take
+  // part in being a syllable.
+  // Static: depends only on ch, not on any validator state.
+  static bool IsMyanmarOther(char32 ch);
+
+  // Some special unicodes used only for Myanmar processing.
+  // U+103A (named MYANMAR SIGN ASAT in the Unicode Myanmar block).
+  static const char32 kMyanmarAsat = 0x103a;
+  // U+103B (named MYANMAR CONSONANT SIGN MEDIAL YA in the Unicode Myanmar
+  // block).
+  static const char32 kMyanmarMedialYa = 0x103b;
+};
+
+}  // namespace tesseract
+
+#endif  // TESSERACT_TRAINING_VALIDATE_MYANMAR_H_
--- a/training/validator.cpp
+++ b/training/validator.cpp
--- a/training/validator.h
+++ b/training/validator.h
--- a/training/wordlist2dawg.cpp
+++ b/training/wordlist2dawg.cpp