Move training to src.

104fe793 · Egor Pugin · ca5c15e6 · 104fe793 · 104fe793 · 104fe793
65 changed files
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -308,7 +308,7 @@ if (BUILD_TESTS AND EXISTS ${PROJECT_SOURCE_DIR}/googletest/CMakeLists.txt)
 endif()
 if (BUILD_TRAINING_TOOLS)
-add_subdirectory(training)
+add_subdirectory(src/training)
 endif()
 get_target_property(tesseract_NAME libtesseract NAME)

--- a/configure.ac
+++ b/configure.ac
@@ -502,7 +502,7 @@ AC_CONFIG_FILES([java/com/google/scrollview/Makefile])
 AC_CONFIG_FILES([java/com/google/scrollview/events/Makefile])
 AC_CONFIG_FILES([java/com/google/scrollview/ui/Makefile])
 AC_CONFIG_FILES([doc/Makefile])
-AM_COND_IF([ENABLE_TRAINING], [AC_CONFIG_FILES(training/Makefile)])
+AM_COND_IF([ENABLE_TRAINING], [AC_CONFIG_FILES(src/training/Makefile)])
 AC_OUTPUT
 # Final message

--- a/cppan.yml
+++ b/cppan.yml
@@ -172,7 +172,7 @@ projects:
    tessopt:
        type: lib
        static_only: true
-        files: training/tessopt.*
+        files: src/training/tessopt.*
-        include_directories: training
+        include_directories: src/training
        dependencies: libtesseract
@@ -180,104 +180,104 @@ projects:
        type: lib
        static_only: true
        files:
-            - training/commandlineflags.cpp
+            - src/training/commandlineflags.cpp
-            - training/commandlineflags.h
+            - src/training/commandlineflags.h
-            - training/commontraining.cpp
+            - src/training/commontraining.cpp
-            - training/commontraining.h
+            - src/training/commontraining.h
-        include_directories: training
+        include_directories: src/training
        dependencies:
            - tessopt
    ambiguous_words:
-        files: training/ambiguous_words.cpp
+        files: src/training/ambiguous_words.cpp
        dependencies:
            - libtesseract
    classifier_tester:
-        files: training/classifier_tester.cpp
+        files: src/training/classifier_tester.cpp
        dependencies: common_training
    combine_lang_model:
-        files: training/combine_lang_model.cpp
+        files: src/training/combine_lang_model.cpp
        dependencies: unicharset_training
    combine_tessdata:
-        files: training/combine_tessdata.cpp
+        files: src/training/combine_tessdata.cpp
        dependencies: libtesseract
    cntraining:
-        files: training/cntraining.cpp
+        files: src/training/cntraining.cpp
        dependencies: common_training
    dawg2wordlist:
-        files: training/dawg2wordlist.cpp
+        files: src/training/dawg2wordlist.cpp
        dependencies: libtesseract
    mftraining:
        files:
-            - training/mftraining.cpp
+            - src/training/mftraining.cpp
-            - training/mergenf.*
+            - src/training/mergenf.*
        dependencies: common_training
    shapeclustering:
-        files: training/shapeclustering.cpp
+        files: src/training/shapeclustering.cpp
        dependencies: common_training
    unicharset_extractor:
-        files: training/unicharset_extractor.cpp
+        files: src/training/unicharset_extractor.cpp
        dependencies: unicharset_training
    wordlist2dawg:
-        files: training/wordlist2dawg.cpp
+        files: src/training/wordlist2dawg.cpp
        dependencies: libtesseract
    unicharset_training:
        type: lib
        static_only: true
        files:
-            - training/fileio.*
+            - src/training/fileio.*
-            - training/icuerrorcode.h
+            - src/training/icuerrorcode.h
-            - training/lang_model_helpers.*
+            - src/training/lang_model_helpers.*
-            - training/lstmtester.*
+            - src/training/lstmtester.*
-            - training/normstrngs.*
+            - src/training/normstrngs.*
-            - training/unicharset_training_utils.*
+            - src/training/unicharset_training_utils.*
-            - training/validat.*
+            - src/training/validat.*
-        include_directories: training
+        include_directories: src/training
        dependencies:
            - common_training
            - pvt.cppan.demo.unicode.icu.i18n
    lstmeval:
-        files: training/lstmeval.cpp
+        files: src/training/lstmeval.cpp
        dependencies: unicharset_training
    lstmtraining:
-        files: training/lstmtraining.cpp
+        files: src/training/lstmtraining.cpp
        dependencies: unicharset_training
    set_unicharset_properties:
-        files: training/set_unicharset_properties.cpp
+        files: src/training/set_unicharset_properties.cpp
        dependencies: unicharset_training
    text2image:
        files:
-            - training/text2image.cpp
+            - src/training/text2image.cpp
-            - training/boxchar.cpp
+            - src/training/boxchar.cpp
-            - training/boxchar.h
+            - src/training/boxchar.h
-            - training/degradeimage.cpp
+            - src/training/degradeimage.cpp
-            - training/degradeimage.h
+            - src/training/degradeimage.h
-            - training/ligature_table.cpp
+            - src/training/ligature_table.cpp
-            - training/ligature_table.h
+            - src/training/ligature_table.h
-            - training/normstrngs.cpp
+            - src/training/normstrngs.cpp
-            - training/normstrngs.h
+            - src/training/normstrngs.h
-            - training/pango_font_info.cpp
+            - src/training/pango_font_info.cpp
-            - training/pango_font_info.h
+            - src/training/pango_font_info.h
-            - training/stringrenderer.cpp
+            - src/training/stringrenderer.cpp
-            - training/stringrenderer.h
+            - src/training/stringrenderer.h
-            - training/tlog.cpp
+            - src/training/tlog.cpp
-            - training/tlog.h
+            - src/training/tlog.h
-            - training/util.h
+            - src/training/util.h
-            - training/icuerrorcode.h
+            - src/training/icuerrorcode.h
        dependencies:
            - unicharset_training

--- a/training/CMakeLists.txt
+++ b/training/CMakeLists.txt
--- a/training/Makefile.am
+++ b/training/Makefile.am
--- a/training/ambiguous_words.cpp
+++ b/training/ambiguous_words.cpp
--- a/training/boxchar.cpp
+++ b/training/boxchar.cpp
--- a/training/boxchar.h
+++ b/training/boxchar.h
--- a/training/classifier_tester.cpp
+++ b/training/classifier_tester.cpp
--- a/training/cntraining.cpp
+++ b/training/cntraining.cpp
--- a/training/combine_lang_model.cpp
+++ b/training/combine_lang_model.cpp
--- a/training/combine_tessdata.cpp
+++ b/training/combine_tessdata.cpp
--- a/training/commandlineflags.cpp
+++ b/training/commandlineflags.cpp
--- a/training/commandlineflags.h
+++ b/training/commandlineflags.h
--- a/training/commontraining.cpp
+++ b/training/commontraining.cpp
--- a/training/commontraining.h
+++ b/training/commontraining.h
--- a/training/dawg2wordlist.cpp
+++ b/training/dawg2wordlist.cpp
--- a/training/degradeimage.cpp
+++ b/training/degradeimage.cpp
--- a/training/degradeimage.h
+++ b/training/degradeimage.h
 /**********************************************************************
 * File:        degradeimage.h
 * Description: Function to degrade an image (usually of text) as if it
 *              has been printed and then scanned.
 * Authors:     Ray Smith
 * Created:     Tue Nov 19 2013
 *
 * (C) Copyright 2013, Google Inc.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * http://www.apache.org/licenses/LICENSE-2.0
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 **********************************************************************/
 #ifndef TESSERACT_TRAINING_DEGRADEIMAGE_H_
 #define TESSERACT_TRAINING_DEGRADEIMAGE_H_
 #include "allheaders.h"
 #include "genericvector.h"
 #include "helpers.h"  // For TRand.
 #include "rect.h"
 namespace tesseract {
 // Degrade the pix as if by a print/copy/scan cycle with exposure > 0
 // corresponding to darkening on the copier and <0 lighter and 0 not copied.
 // If rotation is not nullptr, the clockwise rotation in radians is saved there.
 // The input pix must be 8 bit grey. (Binary with values 0 and 255 is OK.)
 // The input image is destroyed and a different image returned.
 // randomizer supplies the pseudo-random numbers (see helpers.h for TRand).
 struct Pix* DegradeImage(struct Pix* input, int exposure, TRand* randomizer,
                         float* rotation);
 // Creates and returns a Pix distorted by various means according to the bool
 // flags (perspective, invert, white_noise, smooth_noise, blur). If boxes is
 // not nullptr, the boxes are resized/positioned according to any spatial
 // distortion and also by the integer reduction factor box_reduction
 // so they will match what the network will output.
 // Returns nullptr on error. The returned Pix must be pixDestroyed.
 Pix* PrepareDistortedPix(const Pix* pix, bool perspective, bool invert,
                         bool white_noise, bool smooth_noise, bool blur,
                         int box_reduction, TRand* randomizer,
                         GenericVector<TBOX>* boxes);
 // Distorts anything that has a non-null pointer with the same pseudo-random
 // perspective distortion. Width and height only need to be set if there
 // is no pix. If there is a pix, then they will be taken from there.
 void GeneratePerspectiveDistortion(int width, int height, TRand* randomizer,
                                   Pix** pix, GenericVector<TBOX>* boxes);
 // Computes the coefficients of a randomized projective transformation.
 // The image transform requires backward transformation coefficient, and the
 // box transform the forward coefficients.
 // The coefficients are handed back through im_coeffs and box_coeffs.
 // NOTE(review): ownership of the returned coefficient arrays is not visible
 // from this declaration - confirm against the implementation.
 // Returns the incolor arg to pixProjective.
 int ProjectiveCoeffs(int width, int height, TRand* randomizer,
                     float** im_coeffs, float** box_coeffs);
 }  // namespace tesseract
 #endif  // TESSERACT_TRAINING_DEGRADEIMAGE_H_
--- a/training/fileio.cpp
+++ b/training/fileio.cpp
--- a/training/fileio.h
+++ b/training/fileio.h
--- a/training/icuerrorcode.h
+++ b/training/icuerrorcode.h
 /**********************************************************************
 * File:        icuerrorcode.h
 * Description: Wrapper class for UErrorCode, with conversion operators for
 *              direct use in ICU C and C++ APIs.
 * Author:      Fredrik Roubert
 * Created:     Thu July 4 2013
 *
 * Features:
 * - The constructor initializes the internal UErrorCode to U_ZERO_ERROR,
 *  removing one common source of errors.
 * - Same use in C APIs taking a UErrorCode* (pointer) and C++ taking
 *   UErrorCode& (reference), via conversion operators.
 * - Automatic checking for success when it goes out of scope. On failure,
 *   the destructor will log an error message and exit.
 *
 * Most of ICU will handle errors gracefully and provide sensible fallbacks.
 * Using IcuErrorCode, it is therefore possible to write very compact code
 * that does sensible things on failure and provides logging for debugging.
 *
 * Example:
 * IcuErrorCode icuerrorcode;
 * return collator.compareUTF8(a, b, icuerrorcode) == UCOL_EQUAL;
 *
 * (C) Copyright 2013, Google Inc.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * http://www.apache.org/licenses/LICENSE-2.0
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 **********************************************************************/
 #ifndef TESSERACT_CCUTIL_ICUERRORCODE_H_
 #define TESSERACT_CCUTIL_ICUERRORCODE_H_
 #include "tprintf.h"
 #include "unicode/errorcode.h"  // From libicu
 namespace tesseract {
 // icu::ErrorCode subclass that checks itself on destruction: if a failure is
 // still recorded when the object goes out of scope, the ICU error name is
 // logged and the process exits with the error code as its exit status.
 class IcuErrorCode : public icu::ErrorCode {
 public:
  IcuErrorCode() {}
  // Reports (and exits on) any failure that is still recorded.
  virtual ~IcuErrorCode() {
    if (isFailure()) {
      handleFailure();
    }
  }
 protected:
  // Logs the ICU error name on its own line and terminates the process.
  // The trailing newline keeps the message from running into later output.
  void handleFailure() const override {
    tprintf("ICU ERROR: %s\n", errorName());
    exit(errorCode);
  }
 private:
  // Disallow implicit copying of object. Deleted rather than left undefined
  // so misuse is diagnosed at compile time instead of at link time.
  IcuErrorCode(const IcuErrorCode&) = delete;
  void operator=(const IcuErrorCode&) = delete;
 };
 }  // namespace tesseract
 #endif  // TESSERACT_CCUTIL_ICUERRORCODE_H_
--- a/training/lang_model_helpers.cpp
+++ b/training/lang_model_helpers.cpp
--- a/training/lang_model_helpers.h
+++ b/training/lang_model_helpers.h
--- a/training/language-specific.sh
+++ b/training/language-specific.sh
--- a/training/ligature_table.cpp
+++ b/training/ligature_table.cpp
--- a/training/ligature_table.h
+++ b/training/ligature_table.h
--- a/training/lstmeval.cpp
+++ b/training/lstmeval.cpp
--- a/training/lstmtester.cpp
+++ b/training/lstmtester.cpp
--- a/training/lstmtester.h
+++ b/training/lstmtester.h
--- a/training/lstmtraining.cpp
+++ b/training/lstmtraining.cpp
--- a/training/merge_unicharsets.cpp
+++ b/training/merge_unicharsets.cpp
--- a/training/mergenf.cpp
+++ b/training/mergenf.cpp
--- a/training/mergenf.h
+++ b/training/mergenf.h
 /******************************************************************************
 **	Filename:    mergenf.h
 **	Purpose:     Program for merging similar nano-feature protos
 **	Author:      Dan Johnson
 **	History:     Wed Nov 21 09:55:23 1990, DSJ, Created.
 **
 **	(c) Copyright Hewlett-Packard Company, 1988.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 ******************************************************************************/
 #ifndef TESSERACT_TRAINING_MERGENF_H_
 #define TESSERACT_TRAINING_MERGENF_H_
 /**----------------------------------------------------------------------------
 					Include Files and Type Defines
 ----------------------------------------------------------------------------**/
 #include "protos.h"
 #include "cluster.h"
 #include "ocrfeatures.h"
 #include "callcpp.h"
 #include "picofeat.h"
 // Tuning constants for proto merging.
 // NOTE(review): the exact use of these thresholds lives in the implementation
 // file, which is not visible here - confirm there before changing the values.
 #define WORST_MATCH_ALLOWED	(0.9)
 #define WORST_EVIDENCE (1.0)
 // Twice the pico-feature length as reported by GetPicoFeatureLength().
 #define MAX_LENGTH_MISMATCH	(2.0 * GetPicoFeatureLength ())
 // File-name suffixes (presumably for proto and config files - confirm).
 #define PROTO_SUFFIX		".mf.p"
 #define CONFIG_SUFFIX		".cl"
 // Sentinel value meaning "no proto".
 #define NO_PROTO	(-1)
 // Indices into a parameter array; used by the CenterX/CenterY/LengthOf/
 // OrientationOf accessor macros in this header.
 #define XPOSITION			0
 #define YPOSITION			1
 #define MFLENGTH			2
 #define ORIENTATION			3
 // Axis-aligned rectangle described by its minimum and maximum X and Y.
 struct FRECT {
  FLOAT32	MinX;
  FLOAT32	MaxX;
  FLOAT32	MinY;
  FLOAT32	MaxY;
 };
 /**----------------------------------------------------------------------------
 					Public Macros
 ----------------------------------------------------------------------------**/
 // Accessors for an indexable parameter array M, selecting the element at the
 // XPOSITION, YPOSITION, MFLENGTH and ORIENTATION index respectively.
 #define CenterX(M)		( (M)[XPOSITION] )
 #define CenterY(M)		( (M)[YPOSITION] )
 #define LengthOf(M)		( (M)[MFLENGTH] )
 #define OrientationOf(M)	( (M)[ORIENTATION] )
 /**----------------------------------------------------------------------------
 					Public Function Prototypes
 ----------------------------------------------------------------------------**/
 // Compares protos p1 and p2 and returns a FLOAT32 result.
 // NOTE(review): the meaning/range of the result is defined by the
 // implementation, which is not visible here - confirm.
 FLOAT32 CompareProtos (
     PROTO	p1,
 	 PROTO	p2);
 // Combines p1 and p2, with weights w1 and w2, into MergedProto.
 // NOTE(review): presumably a weighted average of the proto parameters -
 // confirm against the implementation.
 void ComputeMergedProto (
     PROTO	p1,
 	 PROTO	p2,
     FLOAT32	w1,
 	 FLOAT32	w2,
     PROTO	MergedProto);
 // Searches Class for an existing proto close to Prototype and returns an int.
 // NOTE(review): presumably the proto index, or NO_PROTO when none matches;
 // NumMerged presumably holds per-proto merge counts - confirm.
 int FindClosestExistingProto (
     CLASS_TYPE	Class,
     int       	NumMerged[],
     PROTOTYPE	*Prototype);
 // Fills the proto New from the cluster prototype Old.
 void MakeNewFromOld (
     PROTO	New,
     PROTOTYPE	*Old);
 // Returns a FLOAT32 evidence value for Feature against Proto.
 // NOTE(review): scale of the result is not visible here - confirm.
 FLOAT32 SubfeatureEvidence (
   FEATURE     Feature,
   PROTO       Proto);
 // Maps a similarity value to an evidence value, both doubles.
 // The parameter is declared without the `register` storage-class specifier,
 // which ISO C++17 no longer permits; the function type is unaffected.
 double EvidenceOf (
  double   Similarity);
 // Returns a BOOL8 match decision for Feature against Proto.
 // NOTE(review): the matching criterion is defined in the implementation,
 // which is not visible here - confirm.
 BOOL8 DummyFastMatch (
     FEATURE	Feature,
     PROTO	Proto);
 // Writes into BoundingBox a rectangle for Proto padded by TangentPad and
 // OrthogonalPad.
 // NOTE(review): presumably the pads apply along and across the proto's
 // orientation respectively - confirm.
 void ComputePaddedBoundingBox (
     PROTO	Proto,
     FLOAT32	TangentPad,
 	 FLOAT32	OrthogonalPad,
     FRECT	*BoundingBox);
 // Returns a BOOL8 telling whether the point (X, Y) lies inside Rectangle.
 // NOTE(review): whether the edges count as inside is not visible here.
 BOOL8 PointInside (
     FRECT	*Rectangle,
     FLOAT32	X,
 	 FLOAT32	Y);
 #endif  // TESSERACT_TRAINING_MERGENF_H_
--- a/training/mftraining.cpp
+++ b/training/mftraining.cpp
--- a/training/normstrngs.cpp
+++ b/training/normstrngs.cpp
--- a/training/normstrngs.h
+++ b/training/normstrngs.h
--- a/training/pango_font_info.cpp
+++ b/training/pango_font_info.cpp
--- a/training/pango_font_info.h
+++ b/training/pango_font_info.h
--- a/training/set_unicharset_properties.cpp
+++ b/training/set_unicharset_properties.cpp
--- a/training/shapeclustering.cpp
+++ b/training/shapeclustering.cpp
--- a/training/stringrenderer.cpp
+++ b/training/stringrenderer.cpp
--- a/training/stringrenderer.h
+++ b/training/stringrenderer.h
--- a/training/tessopt.cpp
+++ b/training/tessopt.cpp
--- a/training/tessopt.h
+++ b/training/tessopt.h
--- a/training/tesstrain.sh
+++ b/training/tesstrain.sh
--- a/training/tesstrain_utils.sh
+++ b/training/tesstrain_utils.sh
--- a/training/text2image.cpp
+++ b/training/text2image.cpp
--- a/training/tlog.cpp
+++ b/training/tlog.cpp
 /**********************************************************************
 * File:        tlog.cpp
 * Description: Variant of printf with logging level controllable by a
 *              commandline flag.
 * Author:      Ranjith Unnikrishnan
 * Created:     Wed Nov 20 2013
 *
 * (C) Copyright 2013, Google Inc.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 *
 **********************************************************************/
 #include "tlog.h"
 // Defines the integer flag tlog_level with default value 0; the string is the
 // flag's help text.
 INT_PARAM_FLAG(tlog_level, 0, "Minimum logging level for tlog() output");
--- a/training/tlog.h
+++ b/training/tlog.h
 /**********************************************************************
 * File:        tlog.h
 * Description: Variant of printf with logging level controllable by a
 *              commandline flag.
 * Author:      Ranjith Unnikrishnan
 * Created:     Wed Nov 20 2013
 *
 * (C) Copyright 2013, Google Inc.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 *
 **********************************************************************/
 #ifndef TESSERACT_TRAINING_TLOG_H_
 #define TESSERACT_TRAINING_TLOG_H_
 #include "commandlineflags.h"
 #include "errcode.h"
 #include "tprintf.h"
 // Declares the tlog_level integer flag for includers of this header.
 // NOTE(review): presumably this is what makes FLAGS_tlog_level visible to the
 // macros in this file - confirm in commandlineflags.h.
 DECLARE_INT_PARAM_FLAG(tlog_level);
 // printf variant guarded by the numeric logging level parameter
 // FLAGS_tlog_level (default 0): the message is emitted only when
 // FLAGS_tlog_level >= level. Code using ParseCommandLineFlags() can control
 // its value using the --tlog_level commandline argument. Otherwise it must be
 // specified in a config file like other params.
 // The level argument is parenthesized so that expressions with lower
 // precedence than >= (e.g. `verbose ? 2 : 0`) are compared as a whole.
 #define tlog(level, ...) {                        \
  if (FLAGS_tlog_level >= (level)) {              \
    tprintf_internal(__VA_ARGS__);                \
  }                                               \
 }
 // True when messages at the given level would be emitted by tlog(), i.e. when
 // FLAGS_tlog_level >= level. The argument is parenthesized so that compound
 // expressions are compared as a whole.
 #define TLOG_IS_ON(level) (FLAGS_tlog_level >= (level))
 #endif  // TESSERACT_TRAINING_TLOG_H_
--- a/training/unicharset_extractor.cpp
+++ b/training/unicharset_extractor.cpp
--- a/training/unicharset_training_utils.cpp
+++ b/training/unicharset_training_utils.cpp
--- a/training/unicharset_training_utils.h
+++ b/training/unicharset_training_utils.h
--- a/training/util.h
+++ b/training/util.h
--- a/training/validate_grapheme.cpp
+++ b/training/validate_grapheme.cpp
--- a/training/validate_grapheme.h
+++ b/training/validate_grapheme.h
 #ifndef TESSERACT_TRAINING_VALIDATE_GRAPHEME_H_
 #define TESSERACT_TRAINING_VALIDATE_GRAPHEME_H_
 #include "validator.h"
 namespace tesseract {
 // Subclass of Validator that validates and segments generic unicode into
 // grapheme clusters, including Latin with diacritics.
 class ValidateGrapheme : public Validator {
 public:
  // Passes script and report_errors straight through to the Validator base.
  ValidateGrapheme(ViramaScript script, bool report_errors)
      : Validator(script, report_errors) {}
  ~ValidateGrapheme() {}
 protected:
  // Consumes the next Grapheme in codes_[codes_used_++...] and copies it to
  // parts_ and output_. Returns true if a valid Grapheme was consumed,
  // otherwise does not increment codes_used_.
  bool ConsumeGraphemeIfValid() override;
  // Returns the CharClass corresponding to the given Unicode ch.
  CharClass UnicodeToCharClass(char32 ch) const override;
 private:
  // Helper returns true if the sequence prev_ch,ch is invalid.
  // NOTE(review): non-static, unlike the two helpers below - presumably it
  // consults validator state such as the script; confirm in the .cpp file.
  bool IsBadlyFormed(char32 prev_ch, char32 ch);
  // Helper returns true if the sequence prev_ch,ch is an invalid Indic vowel.
  static bool IsBadlyFormedIndicVowel(char32 prev_ch, char32 ch);
  // Helper returns true if the sequence prev_ch,ch is invalid Thai.
  static bool IsBadlyFormedThai(char32 prev_ch, char32 ch);
 };
 }  // namespace tesseract
 #endif  // TESSERACT_TRAINING_VALIDATE_GRAPHEME_H_
--- a/training/validate_indic.cpp
+++ b/training/validate_indic.cpp
--- a/training/validate_indic.h
+++ b/training/validate_indic.h
 #ifndef TESSERACT_TRAINING_VALIDATE_INDIC_H_
 #define TESSERACT_TRAINING_VALIDATE_INDIC_H_
 #include "validator.h"
 namespace tesseract {
 // Subclass of Validator that validates and segments Indic scripts in the
 // unicode range 0x900-0xdff (Devanagari-Sinhala).
 class ValidateIndic : public Validator {
 public:
  // Passes script and report_errors straight through to the Validator base.
  ValidateIndic(ViramaScript script, bool report_errors)
      : Validator(script, report_errors) {}
  ~ValidateIndic() {}
 protected:
  // Returns whether codes matches the pattern for an Indic Grapheme.
  // Consumes the next Grapheme in codes_[codes_used_++...] and copies it to
  // parts_ and output_. Returns true if a valid Grapheme was consumed,
  // otherwise does not increment codes_used_.
  bool ConsumeGraphemeIfValid() override;
  // Returns the CharClass corresponding to the given Unicode ch.
  Validator::CharClass UnicodeToCharClass(char32 ch) const override;
 private:
  // Helper consumes/copies a virama and any associated post-virama joiners.
  // NOTE(review): the exact meaning of joiner and post_matra, and of the bool
  // result, is defined in the .cpp file, which is not visible here - confirm.
  bool ConsumeViramaIfValid(IndicPair joiner, bool post_matra);
  // Helper consumes/copies a series of consonants separated by viramas while
  // valid, but not any vowel or other modifiers.
  bool ConsumeConsonantHeadIfValid();
  // Helper consumes/copies a tail part of a consonant, comprising optional
  // matra/piece, vowel modifier, vedic mark, terminating virama.
  bool ConsumeConsonantTailIfValid();
  // Helper consumes/copies a vowel and optional modifiers.
  bool ConsumeVowelIfValid();
  // Some special unicodes used only for Indic processing.
  static const char32 kYayana = 0xdba;  // Sinhala Ya
  static const char32 kRayana = 0xdbb;  // Sinhala Ra
 };
 }  // namespace tesseract
 #endif  // TESSERACT_TRAINING_VALIDATE_INDIC_H_
--- a/training/validate_khmer.cpp
+++ b/training/validate_khmer.cpp
 #include "validate_khmer.h"
 #include "errcode.h"
 #include "tprintf.h"
 namespace tesseract {
 // Consumes one Khmer grapheme from codes_ if the upcoming codes match the
 // Khmer syllable pattern; returns false on empty input or a bad syllable.
 // The pattern comes from the unicode standard
 // (http://www.unicode.org/versions/Unicode9.0.0/ch16.pdf), which gives
 //   B {R | C} {S {R}}* {{Z} V} {O} {S}
 // in a notation that differs from the ISCII standard
 // (http://varamozhi.sourceforge.net/iscii91.pdf). Expressed with the codes of
 // the CharClass enum it reads:
 //   C {R | N} {HC {R}}* {{Z|z} M{P}} {D} {HC}
 // R is a new symbol (Robat) and N is repurposed as a consonant shifter. The
 // Consonant class also covers independent vowels, since they are treated the
 // same anyway.
 // In the split grapheme mode only HC and {Z|z}M get grouped. The unicode
 // chapter on Khmer mentions the joiners only in the BNF syntax, so their
 // purpose is unclear.
 bool ValidateKhmer::ConsumeGraphemeIfValid() {
  const int total = codes_.size();
  if (codes_used_ == total) return false;
  const auto lead = codes_[codes_used_].first;
  if (lead == CharClass::kOther) {
    UseMultiCode(1);
    return true;
  }
  if (lead != CharClass::kConsonant) {
    if (report_errors_) {
      tprintf("Invalid start of Khmer syllable:0x%x\n",
              codes_[codes_used_].second);
    }
    return false;
  }
  // Base consonant (or independent vowel).
  if (UseMultiCode(1)) return true;
  // Optional Robat or consonant shifter.
  if ((codes_[codes_used_].first == CharClass::kRobat ||
       codes_[codes_used_].first == CharClass::kNukta) &&
      UseMultiCode(1)) {
    return true;
  }
  // Any number of virama+consonant pairs, each optionally followed by a Robat.
  for (;;) {
    const bool subscript_follows =
        codes_used_ + 1 < total &&
        codes_[codes_used_].first == CharClass::kVirama &&
        codes_[codes_used_ + 1].first == CharClass::kConsonant;
    if (!subscript_follows) break;
    ASSERT_HOST(!CodeOnlyToOutput());
    if (UseMultiCode(2)) return true;
    if (codes_[codes_used_].first == CharClass::kRobat && UseMultiCode(1)) {
      return true;
    }
  }
  // Optional joiner, which must then be followed by a dependent vowel.
  int joiners_seen = 0;
  const auto maybe_joiner = codes_[codes_used_].second;
  if (maybe_joiner == kZeroWidthJoiner ||
      maybe_joiner == kZeroWidthNonJoiner) {
    if (CodeOnlyToOutput()) {
      if (report_errors_) {
        tprintf("Unterminated joiner: 0x%x\n", output_.back());
      }
      return false;
    }
    ++joiners_seen;
  }
  // Departing slightly from the BNF, a matra piece may stand alone as a matra
  // or be added to other matras.
  const bool at_matra = codes_[codes_used_].first == CharClass::kMatra ||
                        codes_[codes_used_].first == CharClass::kMatraPiece;
  if (at_matra) {
    // The matra is grouped together with the joiner, if there was one.
    if (UseMultiCode(joiners_seen + 1)) return true;
  } else if (joiners_seen != 0) {
    if (report_errors_) {
      tprintf("Joiner with non-dependent vowel after it!:0x%x 0x%x\n",
              output_.back(), codes_[codes_used_].second);
    }
    return false;
  }
  // A matra piece may follow, provided the previous code was not one already.
  if (codes_[codes_used_].first == CharClass::kMatraPiece &&
      codes_[codes_used_ - 1].first != CharClass::kMatraPiece &&
      UseMultiCode(1)) {
    return true;
  }
  // Optional vowel modifier.
  if (codes_[codes_used_].first == CharClass::kVowelModifier &&
      UseMultiCode(1)) {
    return true;
  }
  // Optional trailing virama+consonant pair.
  if (codes_used_ + 1 < total &&
      codes_[codes_used_].first == CharClass::kVirama &&
      codes_[codes_used_ + 1].first == CharClass::kConsonant) {
    ASSERT_HOST(!CodeOnlyToOutput());
    UseMultiCode(2);
  }
  return true;
 }
 // Classifies the unicode ch for Khmer validation. Vedic accents and the
 // zero-width (non-)joiners are recognized first; everything else is
 // classified by its offset within the code block that starts at script_.
 Validator::CharClass ValidateKhmer::UnicodeToCharClass(char32 ch) const {
  if (IsVedicAccent(ch)) return CharClass::kVedicMark;
  if (ch == kZeroWidthNonJoiner) return CharClass::kZeroWidthNonJoiner;
  if (ch == kZeroWidthJoiner) return CharClass::kZeroWidthJoiner;
  // Position relative to the first code of the relevant code block/page.
  const int offset = ch - static_cast<char32>(script_);
  // Codes from any other block are not classified.
  if (offset < 0 || offset >= kIndicCodePageSize) return CharClass::kOther;
  if (offset <= 0x33) return CharClass::kConsonant;
  if (offset <= 0x45) return CharClass::kMatra;
  // From here on offset > 0x45: pick out the individually classified codes.
  switch (offset) {
    case 0x46:
      return CharClass::kMatraPiece;
    case 0x49:
    case 0x4a:
      return CharClass::kNukta;
    case 0x4c:
      return CharClass::kRobat;
    case 0x52:
      return CharClass::kVirama;
    default:
      return offset <= 0x51 ? CharClass::kVowelModifier : CharClass::kOther;
  }
 }
 }  // namespace tesseract
--- a/training/validate_khmer.h
+++ b/training/validate_khmer.h
 #ifndef TESSERACT_TRAINING_VALIDATE_KHMER_H_
 #define TESSERACT_TRAINING_VALIDATE_KHMER_H_
 #include "validator.h"
 namespace tesseract {
 // Subclass of Validator that validates and segments Khmer.
 class ValidateKhmer : public Validator {
 public:
  // Passes script and report_errors straight through to the Validator base.
  ValidateKhmer(ViramaScript script, bool report_errors)
      : Validator(script, report_errors) {}
  ~ValidateKhmer() {}
 protected:
  // Returns whether codes matches the pattern for a Khmer Grapheme.
  // Consumes the next Grapheme in codes_[codes_used_++...] and copies it to
  // parts_ and output_. Returns true if a valid Grapheme was consumed,
  // otherwise does not increment codes_used_.
  bool ConsumeGraphemeIfValid() override;
  // Returns the CharClass corresponding to the given Unicode ch.
  CharClass UnicodeToCharClass(char32 ch) const override;
 };
 }  // namespace tesseract
 #endif  // TESSERACT_TRAINING_VALIDATE_KHMER_H_
--- a/training/validate_myanmar.cpp
+++ b/training/validate_myanmar.cpp
 #include "validate_myanmar.h"
 #include "errcode.h"
 #include "icuerrorcode.h"
 #include "tprintf.h"
 #include "unicode/uchar.h"    // From libicu
 #include "unicode/uscript.h"  // From libicu
 namespace tesseract {
 // Consumes one Myanmar grapheme from codes_ if the upcoming codes match the
 // Myanmar syllable pattern, which is taken directly from unicode table 16-3.
 // See http://www.unicode.org/versions/Unicode9.0.0/ch16.pdf
 // Returns false only when no valid base letter is found.
 bool ValidateMyanmar::ConsumeGraphemeIfValid() {
  const int total = codes_.size();
  if (codes_used_ == total) return true;
  // Codes classed as "other" form a grapheme on their own.
  if (IsMyanmarOther(codes_[codes_used_].second)) {
    UseMultiCode(1);
    return true;
  }
  // Kinzi: 0x1004 + asat + virama are grouped into a single part.
  const bool kinzi_ahead = codes_used_ + 2 < total &&
                           codes_[codes_used_].second == 0x1004 &&
                           codes_[codes_used_ + 1].second == kMyanmarAsat &&
                           codes_[codes_used_ + 2].second == kMyanmarVirama;
  if (kinzi_ahead) {
    ASSERT_HOST(!CodeOnlyToOutput());
    ASSERT_HOST(!CodeOnlyToOutput());
    if (UseMultiCode(3)) return true;
  }
  // Base consonant/vowel. Since everything else in Myanmar appears to be
  // optional, a missing base is the only invalid input that can be detected.
  if (!IsMyanmarLetter(codes_[codes_used_].second)) {
    if (report_errors_) {
      tprintf("Invalid start of Myanmar syllable:0x%x\n",
              codes_[codes_used_].second);
    }
    return false;  // A base letter is mandatory.
  }
  if (UseMultiCode(1)) return true;
  // The optional signs are only looked at if the subscript step did not
  // already hit the end of the input.
  if (!ConsumeSubscriptIfPresent()) {
    ConsumeOptionalSignsIfPresent();
  }
  // Whatever has been consumed up to here is a valid syllable.
  return true;
 }
 // Maps ch to kConsonant if it is a Myanmar letter and to kOther otherwise.
 // TODO(rays) No intermediate coding is used here, unlike the other scripts,
 // because the content of table 16-3 has little correspondence with the char
 // classes of the Indic languages. (Experts may disagree and improve!)
 // Unicode table 16-3 is basically a long list of optional characters, which
 // is quite easy to code.
 // Unfortunately, table 16-3 doesn't include even half the Myanmar unicodes!!
 // The table also allows sequences that still result in dotted circles!!
 // The remaining codes were therefore placed by guesswork where reasonable.
 Validator::CharClass ValidateMyanmar::UnicodeToCharClass(char32 ch) const {
  return IsMyanmarLetter(ch) ? CharClass::kConsonant : CharClass::kOther;
 }
 // Helper that consumes/copies a virama together with the subscript consonant
 // that follows it, if both are present. Only one subscript consonant appears
 // to be possible.
 // Returns true if the end of input is reached.
 bool ValidateMyanmar::ConsumeSubscriptIfPresent() {
  const int total = codes_.size();
  const bool subscript_ahead =
      codes_used_ + 1 < total &&
      codes_[codes_used_].second == kMyanmarVirama &&
      IsMyanmarLetter(codes_[codes_used_ + 1].second);
  if (!subscript_ahead) return false;
  ASSERT_HOST(!CodeOnlyToOutput());
  return UseMultiCode(2);
 }
 // Helper consumes/copies a series of optional signs.
 // Returns true if the end of input is reached.
 bool ValidateMyanmar::ConsumeOptionalSignsIfPresent() {
  // The following characters are allowed, all optional, and in sequence.
  // An exception is kMyanmarMedialYa, which can include kMyanmarAsat.
  const std::vector<char32> kMedials({kMyanmarAsat, kMyanmarMedialYa, 0x103c,
                                      0x103d, 0x103e, 0x105e, 0x105f, 0x1060,
                                      0x1081, 0x1031});
  for (char32 ch : kMedials) {
    if (codes_[codes_used_].second == ch) {
      if (UseMultiCode(1)) return true;
      if (ch == kMyanmarMedialYa &&
          codes_[codes_used_].second == kMyanmarAsat) {
        if (UseMultiCode(1)) return true;
      }
    }
  }
  // Vowel sign i, ii, ai.
  char32 ch = codes_[codes_used_].second;
  if (ch == 0x102d || ch == 0x102e || ch == 0x1032) {
    if (UseMultiCode(1)) return true;
  }
  // Vowel sign u, uu, and extensions.
  ch = codes_[codes_used_].second;
  if (ch == 0x102f || ch == 0x1030 || (0x1056 <= ch && ch <= 0x1059) ||
      ch == 0x1062 || ch == 0x1067 || ch == 0x1068 ||
      (0x1071 <= ch && ch <= 0x1074) || (0x1083 <= ch && ch <= 0x1086) ||
      ch == 0x109c || ch == 0x109d) {
    if (UseMultiCode(1)) return true;
  }
  // Tall aa, aa with optional asat.
  if (codes_[codes_used_].second == 0x102b ||
      codes_[codes_used_].second == 0x102c) {
    if (UseMultiCode(1)) return true;
    if (codes_[codes_used_].second == kMyanmarAsat) {
      if (UseMultiCode(1)) return true;
    }
  }
  // The following characters are allowed, all optional, and in sequence.
  const std::vector<char32> kSigns({0x1036, 0x1037});
  for (char32 ch : kSigns) {
    if (codes_[codes_used_].second == ch) {
      if (UseMultiCode(1)) return true;
    }
  }
  // Tone mark extensions.
  ch = codes_[codes_used_].second;
  if (ch == 0x1038 || ch == kMyanmarAsat || ch == 0x1063 || ch == 0x1064 ||
      (0x1069 <= ch && ch <= 0x106d) || (0x1087 <= ch && ch <= 0x108d) ||
      ch == 0x108f || ch == 0x109a || ch == 0x109b ||
      (0xaa7b <= ch && ch <= 0xaa7d)) {
    if (UseMultiCode(1)) return true;
  }
  return false;
 }
 // Tells whether ch is a Myanmar "letter", meaning a consonant or an
 // independent vowel. Table 16-3 separates some base consonants from vowels,
 // but the extensions make no such distinction, so all of them share a single
 // bucket here.
 /* static */
 bool ValidateMyanmar::IsMyanmarLetter(char32 ch) {
  // Inclusive {first, last} ranges; a single unicode has first == last.
  static const char32 kLetterRanges[][2] = {
      {0x1000, 0x102a}, {0x103f, 0x103f}, {0x1050, 0x1055},
      {0x105a, 0x105d}, {0x1061, 0x1061}, {0x1065, 0x1065},
      {0x1066, 0x1066}, {0x106e, 0x1070}, {0x1075, 0x1080},
      {0x108e, 0x108e}, {0xa9e0, 0xa9ef}, {0xa9fa, 0xa9ff},
      {0xaa60, 0xaa73}, {0xaa7a, 0xaa7a}, {0xaa7e, 0xaa7e},
      {0xaa7f, 0xaa7f}};
  for (const auto& range : kLetterRanges) {
    if (range[0] <= ch && ch <= range[1]) return true;
  }
  return false;
 }
 // Tells whether ch is a Myanmar digit or other symbol that does not take
 // part in being a syllable. Any unicode whose script is not Myanmar, apart
 // from the zero-width joiner and non-joiner, also counts as "other".
 /* static */
 bool ValidateMyanmar::IsMyanmarOther(char32 ch) {
  IcuErrorCode err;
  const UScriptCode script_code = uscript_getScript(ch, err);
  const bool is_joiner = ch == Validator::kZeroWidthJoiner ||
                         ch == Validator::kZeroWidthNonJoiner;
  if (script_code != USCRIPT_MYANMAR && !is_joiner) return true;
  // Ranges within the Myanmar blocks that are treated as "other".
  const bool in_main_block = (0x1040 <= ch && ch <= 0x1049) ||
                             (0x1090 <= ch && ch <= 0x1099) ||
                             (0x109c <= ch && ch <= 0x109d);
  const bool in_extended_blocks = (0xa9f0 <= ch && ch <= 0xa9f9) ||
                                  (0xaa74 <= ch && ch <= 0xaa79);
  return in_main_block || in_extended_blocks;
 }
 }  // namespace tesseract
--- a/training/validate_myanmar.h
+++ b/training/validate_myanmar.h
 #ifndef TESSERACT_TRAINING_VALIDATE_MYANMAR_H_
 #define TESSERACT_TRAINING_VALIDATE_MYANMAR_H_
 #include "validator.h"
 namespace tesseract {
 // Validator subclass that validates and segments Myanmar text.
 class ValidateMyanmar : public Validator {
 public:
  // Forwards script and report_errors unchanged to the Validator base.
  ValidateMyanmar(ViramaScript script, bool report_errors)
      : Validator(script, report_errors) {}
  ~ValidateMyanmar() = default;
 protected:
  // Checks whether the upcoming codes match the pattern for a Myanmar
  // Grapheme. On success the next Grapheme in codes_[codes_used_++...] is
  // consumed and copied to parts_ and output_, and true is returned.
  // Otherwise codes_used_ is not incremented.
  bool ConsumeGraphemeIfValid() override;
  // Gives the CharClass that corresponds to the Unicode ch.
  Validator::CharClass UnicodeToCharClass(char32 ch) const override;
 private:
  // Some special unicodes used only for Myanmar processing.
  static const char32 kMyanmarAsat = 0x103a;
  static const char32 kMyanmarMedialYa = 0x103b;
  // Helper that consumes/copies a virama and any subscript consonant.
  // Returns true if the end of input is reached.
  bool ConsumeSubscriptIfPresent();
  // Helper that consumes/copies a series of optional signs.
  // Returns true if the end of input is reached.
  bool ConsumeOptionalSignsIfPresent();
  // Tells whether ch is a Myanmar "letter", meaning a consonant or an
  // independent vowel. Table 16-3 separates some base consonants from vowels,
  // but the extensions make no such distinction, so all of them share a
  // single bucket.
  static bool IsMyanmarLetter(char32 ch);
  // Tells whether ch is a Myanmar digit or other symbol that does not take
  // part in being a syllable.
  static bool IsMyanmarOther(char32 ch);
 };
 }  // namespace tesseract
 #endif  // TESSERACT_TRAINING_VALIDATE_MYANMAR_H_
--- a/training/validator.cpp
+++ b/training/validator.cpp
--- a/training/validator.h
+++ b/training/validator.h
--- a/training/wordlist2dawg.cpp
+++ b/training/wordlist2dawg.cpp