提交 104fe793 编写于 作者: E Egor Pugin

Move training to src.

上级 ca5c15e6
...@@ -308,7 +308,7 @@ if (BUILD_TESTS AND EXISTS ${PROJECT_SOURCE_DIR}/googletest/CMakeLists.txt) ...@@ -308,7 +308,7 @@ if (BUILD_TESTS AND EXISTS ${PROJECT_SOURCE_DIR}/googletest/CMakeLists.txt)
endif() endif()
# Descend into the training tools only when BUILD_TRAINING_TOOLS is enabled.
# Their CMakeLists.txt lives under src/training.
if(BUILD_TRAINING_TOOLS)
  add_subdirectory(src/training)
endif()

# Store the NAME property of the libtesseract target in tesseract_NAME.
get_target_property(tesseract_NAME libtesseract NAME)
......
...@@ -502,7 +502,7 @@ AC_CONFIG_FILES([java/com/google/scrollview/Makefile]) ...@@ -502,7 +502,7 @@ AC_CONFIG_FILES([java/com/google/scrollview/Makefile])
# Makefiles generated for the remaining scrollview subdirectories and doc.
AC_CONFIG_FILES([java/com/google/scrollview/events/Makefile])
AC_CONFIG_FILES([java/com/google/scrollview/ui/Makefile])
AC_CONFIG_FILES([doc/Makefile])
# The training Makefile is only configured when ENABLE_TRAINING holds.
# The path is quoted like every other AC_CONFIG_FILES argument in this file.
AM_COND_IF([ENABLE_TRAINING],
           [AC_CONFIG_FILES([src/training/Makefile])])
AC_OUTPUT
# Final message # Final message
......
...@@ -172,7 +172,7 @@ projects: ...@@ -172,7 +172,7 @@ projects:
# Static helper library built from the tessopt sources.
tessopt:
    type: lib
    static_only: true
    files: src/training/tessopt.*
    # The headers moved together with the sources, so the include path has to
    # follow them to src/training (the old top-level "training" path is gone).
    include_directories: src/training
    dependencies: libtesseract
...@@ -180,104 +180,104 @@ projects: ...@@ -180,104 +180,104 @@ projects:
# NOTE(review): the key line of this entry is hidden by the hunk boundary;
# it is presumably "common_training" -- confirm against the full file.
type: lib
static_only: true
files:
    - src/training/commandlineflags.cpp
    - src/training/commandlineflags.h
    - src/training/commontraining.cpp
    - src/training/commontraining.h
# The headers now live next to the sources under src/training, so the include
# path must point there instead of the removed top-level "training" directory.
include_directories: src/training
dependencies:
    - tessopt
# Training tool entries. None of them sets "type", unlike the lib entries in
# this file; every source path is relative to the repository root and now
# lives under src/training.
ambiguous_words:
    files: src/training/ambiguous_words.cpp
    dependencies:
        - libtesseract

classifier_tester:
    files: src/training/classifier_tester.cpp
    dependencies: common_training

combine_lang_model:
    files: src/training/combine_lang_model.cpp
    dependencies: unicharset_training

combine_tessdata:
    files: src/training/combine_tessdata.cpp
    dependencies: libtesseract

cntraining:
    files: src/training/cntraining.cpp
    dependencies: common_training

dawg2wordlist:
    files: src/training/dawg2wordlist.cpp
    dependencies: libtesseract

# mftraining additionally compiles the mergenf sources.
mftraining:
    files:
        - src/training/mftraining.cpp
        - src/training/mergenf.*
    dependencies: common_training

shapeclustering:
    files: src/training/shapeclustering.cpp
    dependencies: common_training

unicharset_extractor:
    files: src/training/unicharset_extractor.cpp
    dependencies: unicharset_training

wordlist2dawg:
    files: src/training/wordlist2dawg.cpp
    dependencies: libtesseract
# Static library holding the unicharset/LSTM training helpers; it depends on
# common_training and on ICU i18n.
unicharset_training:
    type: lib
    static_only: true
    files:
        - src/training/fileio.*
        - src/training/icuerrorcode.h
        - src/training/lang_model_helpers.*
        - src/training/lstmtester.*
        - src/training/normstrngs.*
        - src/training/unicharset_training_utils.*
        - src/training/validat.*
    # The headers moved with the sources; keep the include path in sync with
    # the new src/training location.
    include_directories: src/training
    dependencies:
        - common_training
        - pvt.cppan.demo.unicode.icu.i18n
# Single-source entries that depend on unicharset_training.
lstmeval:
    files: src/training/lstmeval.cpp
    dependencies: unicharset_training

lstmtraining:
    files: src/training/lstmtraining.cpp
    dependencies: unicharset_training

set_unicharset_properties:
    files: src/training/set_unicharset_properties.cpp
    dependencies: unicharset_training
text2image: text2image:
files: files:
- training/text2image.cpp - src/training/text2image.cpp
- training/boxchar.cpp - src/training/boxchar.cpp
- training/boxchar.h - src/training/boxchar.h
- training/degradeimage.cpp - src/training/degradeimage.cpp
- training/degradeimage.h - src/training/degradeimage.h
- training/ligature_table.cpp - src/training/ligature_table.cpp
- training/ligature_table.h - src/training/ligature_table.h
- training/normstrngs.cpp - src/training/normstrngs.cpp
- training/normstrngs.h - src/training/normstrngs.h
- training/pango_font_info.cpp - src/training/pango_font_info.cpp
- training/pango_font_info.h - src/training/pango_font_info.h
- training/stringrenderer.cpp - src/training/stringrenderer.cpp
- training/stringrenderer.h - src/training/stringrenderer.h
- training/tlog.cpp - src/training/tlog.cpp
- training/tlog.h - src/training/tlog.h
- training/util.h - src/training/util.h
- training/icuerrorcode.h - src/training/icuerrorcode.h
dependencies: dependencies:
- unicharset_training - unicharset_training
......
/********************************************************************** /**********************************************************************
* File: degradeimage.h * File: degradeimage.h
* Description: Function to degrade an image (usually of text) as if it * Description: Function to degrade an image (usually of text) as if it
* has been printed and then scanned. * has been printed and then scanned.
* Authors: Ray Smith * Authors: Ray Smith
* Created: Tue Nov 19 2013 * Created: Tue Nov 19 2013
* *
* (C) Copyright 2013, Google Inc. * (C) Copyright 2013, Google Inc.
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
* You may obtain a copy of the License at * You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0 * http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software * Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, * distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and * See the License for the specific language governing permissions and
* limitations under the License. * limitations under the License.
* *
**********************************************************************/ **********************************************************************/
#ifndef TESSERACT_TRAINING_DEGRADEIMAGE_H_ #ifndef TESSERACT_TRAINING_DEGRADEIMAGE_H_
#define TESSERACT_TRAINING_DEGRADEIMAGE_H_ #define TESSERACT_TRAINING_DEGRADEIMAGE_H_
#include "allheaders.h" #include "allheaders.h"
#include "genericvector.h" #include "genericvector.h"
#include "helpers.h" // For TRand. #include "helpers.h" // For TRand.
#include "rect.h" #include "rect.h"
namespace tesseract { namespace tesseract {
// Degrade the pix as if by a print/copy/scan cycle with exposure > 0 // Degrade the pix as if by a print/copy/scan cycle with exposure > 0
// corresponding to darkening on the copier and <0 lighter and 0 not copied. // corresponding to darkening on the copier and <0 lighter and 0 not copied.
// If rotation is not nullptr, the clockwise rotation in radians is saved there. // If rotation is not nullptr, the clockwise rotation in radians is saved there.
// The input pix must be 8 bit grey. (Binary with values 0 and 255 is OK.) // The input pix must be 8 bit grey. (Binary with values 0 and 255 is OK.)
// The input image is destroyed and a different image returned. // The input image is destroyed and a different image returned.
struct Pix* DegradeImage(struct Pix* input, int exposure, TRand* randomizer, struct Pix* DegradeImage(struct Pix* input, int exposure, TRand* randomizer,
float* rotation); float* rotation);
// Creates and returns a Pix distorted by various means according to the bool // Creates and returns a Pix distorted by various means according to the bool
// flags. If boxes is not nullptr, the boxes are resized/positioned according to // flags. If boxes is not nullptr, the boxes are resized/positioned according to
// any spatial distortion and also by the integer reduction factor box_scale // any spatial distortion and also by the integer reduction factor box_scale
// so they will match what the network will output. // so they will match what the network will output.
// Returns nullptr on error. The returned Pix must be pixDestroyed. // Returns nullptr on error. The returned Pix must be pixDestroyed.
Pix* PrepareDistortedPix(const Pix* pix, bool perspective, bool invert, Pix* PrepareDistortedPix(const Pix* pix, bool perspective, bool invert,
bool white_noise, bool smooth_noise, bool blur, bool white_noise, bool smooth_noise, bool blur,
int box_reduction, TRand* randomizer, int box_reduction, TRand* randomizer,
GenericVector<TBOX>* boxes); GenericVector<TBOX>* boxes);
// Distorts anything that has a non-null pointer with the same pseudo-random // Distorts anything that has a non-null pointer with the same pseudo-random
// perspective distortion. Width and height only need to be set if there // perspective distortion. Width and height only need to be set if there
// is no pix. If there is a pix, then they will be taken from there. // is no pix. If there is a pix, then they will be taken from there.
void GeneratePerspectiveDistortion(int width, int height, TRand* randomizer, void GeneratePerspectiveDistortion(int width, int height, TRand* randomizer,
Pix** pix, GenericVector<TBOX>* boxes); Pix** pix, GenericVector<TBOX>* boxes);
// Computes the coefficients of a randomized projective transformation. // Computes the coefficients of a randomized projective transformation.
// The image transform requires backward transformation coefficient, and the // The image transform requires backward transformation coefficient, and the
// box transform the forward coefficients. // box transform the forward coefficients.
// Returns the incolor arg to pixProjective. // Returns the incolor arg to pixProjective.
int ProjectiveCoeffs(int width, int height, TRand* randomizer, int ProjectiveCoeffs(int width, int height, TRand* randomizer,
float** im_coeffs, float** box_coeffs); float** im_coeffs, float** box_coeffs);
} // namespace tesseract } // namespace tesseract
#endif // TESSERACT_TRAINING_DEGRADEIMAGE_H_ #endif // TESSERACT_TRAINING_DEGRADEIMAGE_H_
/********************************************************************** /**********************************************************************
* File: icuerrorcode.h * File: icuerrorcode.h
* Description: Wrapper class for UErrorCode, with conversion operators for * Description: Wrapper class for UErrorCode, with conversion operators for
* direct use in ICU C and C++ APIs. * direct use in ICU C and C++ APIs.
* Author: Fredrik Roubert * Author: Fredrik Roubert
* Created: Thu July 4 2013 * Created: Thu July 4 2013
* *
* Features: * Features:
* - The constructor initializes the internal UErrorCode to U_ZERO_ERROR, * - The constructor initializes the internal UErrorCode to U_ZERO_ERROR,
* removing one common source of errors. * removing one common source of errors.
* - Same use in C APIs taking a UErrorCode* (pointer) and C++ taking * - Same use in C APIs taking a UErrorCode* (pointer) and C++ taking
* UErrorCode& (reference), via conversion operators. * UErrorCode& (reference), via conversion operators.
* - Automatic checking for success when it goes out of scope. On failure, * - Automatic checking for success when it goes out of scope. On failure,
* the destructor will log an error message and exit. * the destructor will log an error message and exit.
* *
* Most of ICU will handle errors gracefully and provide sensible fallbacks. * Most of ICU will handle errors gracefully and provide sensible fallbacks.
* Using IcuErrorCode, it is therefore possible to write very compact code * Using IcuErrorCode, it is therefore possible to write very compact code
* that does sensible things on failure and provides logging for debugging. * that does sensible things on failure and provides logging for debugging.
* *
* Example: * Example:
* IcuErrorCode icuerrorcode; * IcuErrorCode icuerrorcode;
* return collator.compareUTF8(a, b, icuerrorcode) == UCOL_EQUAL; * return collator.compareUTF8(a, b, icuerrorcode) == UCOL_EQUAL;
* *
* (C) Copyright 2013, Google Inc. * (C) Copyright 2013, Google Inc.
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
* You may obtain a copy of the License at * You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0 * http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software * Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, * distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and * See the License for the specific language governing permissions and
* limitations under the License. * limitations under the License.
* *
**********************************************************************/ **********************************************************************/
#ifndef TESSERACT_CCUTIL_ICUERRORCODE_H_ #ifndef TESSERACT_CCUTIL_ICUERRORCODE_H_
#define TESSERACT_CCUTIL_ICUERRORCODE_H_ #define TESSERACT_CCUTIL_ICUERRORCODE_H_
#include "tprintf.h" #include "tprintf.h"
#include "unicode/errorcode.h" // From libicu #include "unicode/errorcode.h" // From libicu
namespace tesseract { namespace tesseract {
class IcuErrorCode : public icu::ErrorCode { class IcuErrorCode : public icu::ErrorCode {
public: public:
IcuErrorCode() {} IcuErrorCode() {}
virtual ~IcuErrorCode() { virtual ~IcuErrorCode() {
if (isFailure()) { if (isFailure()) {
handleFailure(); handleFailure();
} }
} }
protected: protected:
virtual void handleFailure() const { virtual void handleFailure() const {
tprintf("ICU ERROR: %s", errorName()); tprintf("ICU ERROR: %s", errorName());
exit(errorCode); exit(errorCode);
} }
private: private:
// Disallow implicit copying of object. // Disallow implicit copying of object.
IcuErrorCode(const IcuErrorCode&); IcuErrorCode(const IcuErrorCode&);
void operator=(const IcuErrorCode&); void operator=(const IcuErrorCode&);
}; };
} // namespace tesseract } // namespace tesseract
#endif // TESSERACT_CCUTIL_ICUERRORCODE_H_ #endif // TESSERACT_CCUTIL_ICUERRORCODE_H_
/****************************************************************************** /******************************************************************************
 ** Filename:    mergenf.h
** Purpose: Program for merging similar nano-feature protos ** Purpose: Program for merging similar nano-feature protos
** Author: Dan Johnson ** Author: Dan Johnson
** History: Wed Nov 21 09:55:23 1990, DSJ, Created. ** History: Wed Nov 21 09:55:23 1990, DSJ, Created.
** **
** (c) Copyright Hewlett-Packard Company, 1988. ** (c) Copyright Hewlett-Packard Company, 1988.
** Licensed under the Apache License, Version 2.0 (the "License"); ** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License. ** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at ** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0 ** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software ** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS, ** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and ** See the License for the specific language governing permissions and
** limitations under the License. ** limitations under the License.
******************************************************************************/ ******************************************************************************/
#ifndef TESSERACT_TRAINING_MERGENF_H_ #ifndef TESSERACT_TRAINING_MERGENF_H_
#define TESSERACT_TRAINING_MERGENF_H_ #define TESSERACT_TRAINING_MERGENF_H_
/**---------------------------------------------------------------------------- /**----------------------------------------------------------------------------
Include Files and Type Defines Include Files and Type Defines
----------------------------------------------------------------------------**/ ----------------------------------------------------------------------------**/
#include "protos.h" #include "protos.h"
#include "cluster.h" #include "cluster.h"
#include "ocrfeatures.h" #include "ocrfeatures.h"
#include "callcpp.h" #include "callcpp.h"
#include "picofeat.h" #include "picofeat.h"
#define WORST_MATCH_ALLOWED (0.9) #define WORST_MATCH_ALLOWED (0.9)
#define WORST_EVIDENCE (1.0) #define WORST_EVIDENCE (1.0)
#define MAX_LENGTH_MISMATCH (2.0 * GetPicoFeatureLength ()) #define MAX_LENGTH_MISMATCH (2.0 * GetPicoFeatureLength ())
#define PROTO_SUFFIX ".mf.p" #define PROTO_SUFFIX ".mf.p"
#define CONFIG_SUFFIX ".cl" #define CONFIG_SUFFIX ".cl"
#define NO_PROTO (-1) #define NO_PROTO (-1)
#define XPOSITION 0 #define XPOSITION 0
#define YPOSITION 1 #define YPOSITION 1
#define MFLENGTH 2 #define MFLENGTH 2
#define ORIENTATION 3 #define ORIENTATION 3
typedef struct typedef struct
{ {
FLOAT32 MinX, MaxX, MinY, MaxY; FLOAT32 MinX, MaxX, MinY, MaxY;
} FRECT; } FRECT;
/**---------------------------------------------------------------------------- /**----------------------------------------------------------------------------
Public Macros Public Macros
----------------------------------------------------------------------------**/ ----------------------------------------------------------------------------**/
#define CenterX(M) ( (M)[XPOSITION] ) #define CenterX(M) ( (M)[XPOSITION] )
#define CenterY(M) ( (M)[YPOSITION] ) #define CenterY(M) ( (M)[YPOSITION] )
#define LengthOf(M) ( (M)[MFLENGTH] ) #define LengthOf(M) ( (M)[MFLENGTH] )
#define OrientationOf(M) ( (M)[ORIENTATION] ) #define OrientationOf(M) ( (M)[ORIENTATION] )
/**---------------------------------------------------------------------------- /**----------------------------------------------------------------------------
Public Function Prototypes Public Function Prototypes
----------------------------------------------------------------------------**/ ----------------------------------------------------------------------------**/
FLOAT32 CompareProtos ( FLOAT32 CompareProtos (
PROTO p1, PROTO p1,
PROTO p2); PROTO p2);
void ComputeMergedProto ( void ComputeMergedProto (
PROTO p1, PROTO p1,
PROTO p2, PROTO p2,
FLOAT32 w1, FLOAT32 w1,
FLOAT32 w2, FLOAT32 w2,
PROTO MergedProto); PROTO MergedProto);
int FindClosestExistingProto ( int FindClosestExistingProto (
CLASS_TYPE Class, CLASS_TYPE Class,
int NumMerged[], int NumMerged[],
PROTOTYPE *Prototype); PROTOTYPE *Prototype);
void MakeNewFromOld ( void MakeNewFromOld (
PROTO New, PROTO New,
PROTOTYPE *Old); PROTOTYPE *Old);
FLOAT32 SubfeatureEvidence ( FLOAT32 SubfeatureEvidence (
FEATURE Feature, FEATURE Feature,
PROTO Proto); PROTO Proto);
double EvidenceOf ( double EvidenceOf (
register double Similarity); register double Similarity);
BOOL8 DummyFastMatch ( BOOL8 DummyFastMatch (
FEATURE Feature, FEATURE Feature,
PROTO Proto); PROTO Proto);
void ComputePaddedBoundingBox ( void ComputePaddedBoundingBox (
PROTO Proto, PROTO Proto,
FLOAT32 TangentPad, FLOAT32 TangentPad,
FLOAT32 OrthogonalPad, FLOAT32 OrthogonalPad,
FRECT *BoundingBox); FRECT *BoundingBox);
BOOL8 PointInside ( BOOL8 PointInside (
FRECT *Rectangle, FRECT *Rectangle,
FLOAT32 X, FLOAT32 X,
FLOAT32 Y); FLOAT32 Y);
#endif // TESSERACT_TRAINING_MERGENF_H_ #endif // TESSERACT_TRAINING_MERGENF_H_
/********************************************************************** /**********************************************************************
* File: tlog.cpp * File: tlog.cpp
* Description: Variant of printf with logging level controllable by a * Description: Variant of printf with logging level controllable by a
* commandline flag. * commandline flag.
* Author: Ranjith Unnikrishnan * Author: Ranjith Unnikrishnan
* Created: Wed Nov 20 2013 * Created: Wed Nov 20 2013
* *
* (C) Copyright 2013, Google Inc. * (C) Copyright 2013, Google Inc.
** Licensed under the Apache License, Version 2.0 (the "License"); ** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License. ** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at ** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0 ** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software ** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS, ** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and ** See the License for the specific language governing permissions and
** limitations under the License. ** limitations under the License.
* *
**********************************************************************/ **********************************************************************/
#include "tlog.h" #include "tlog.h"
// Defines the integer flag tlog_level with a default value of 0.
INT_PARAM_FLAG(tlog_level,
               0,
               "Minimum logging level for tlog() output");
/********************************************************************** /**********************************************************************
* File: tlog.h * File: tlog.h
* Description: Variant of printf with logging level controllable by a * Description: Variant of printf with logging level controllable by a
* commandline flag. * commandline flag.
* Author: Ranjith Unnikrishnan * Author: Ranjith Unnikrishnan
* Created: Wed Nov 20 2013 * Created: Wed Nov 20 2013
* *
* (C) Copyright 2013, Google Inc. * (C) Copyright 2013, Google Inc.
** Licensed under the Apache License, Version 2.0 (the "License"); ** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License. ** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at ** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0 ** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software ** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS, ** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and ** See the License for the specific language governing permissions and
** limitations under the License. ** limitations under the License.
* *
**********************************************************************/ **********************************************************************/
#ifndef TESSERACT_TRAINING_TLOG_H_ #ifndef TESSERACT_TRAINING_TLOG_H_
#define TESSERACT_TRAINING_TLOG_H_ #define TESSERACT_TRAINING_TLOG_H_
#include "commandlineflags.h" #include "commandlineflags.h"
#include "errcode.h" #include "errcode.h"
#include "tprintf.h" #include "tprintf.h"
// Declares the tlog_level flag; the macros in this header read it as
// FLAGS_tlog_level.
DECLARE_INT_PARAM_FLAG(tlog_level);
// Variant guarded by the numeric logging level parameter FLAGS_tlog_level
// (default 0). Code using ParseCommandLineFlags() can control its value using
// the --tlog_level commandline argument. Otherwise it must be specified in a
// config file like other params.
//
// The level argument is parenthesized so that an expression argument (for
// example `verbose ? 1 : 5`) is compared as a whole against FLAGS_tlog_level
// instead of being split by operator precedence.
//
// NOTE: the macro expands to a plain brace block, so an unbraced
// `if (c) tlog(...); else ...` does not compile; brace such call sites.
#define tlog(level, ...)               \
  {                                    \
    if (FLAGS_tlog_level >= (level)) { \
      tprintf_internal(__VA_ARGS__);   \
    }                                  \
  }

// Evaluates to true when output at the given level is enabled.
#define TLOG_IS_ON(level) (FLAGS_tlog_level >= (level))
#endif // TESSERACT_TRAINING_TLOG_H_ #endif // TESSERACT_TRAINING_TLOG_H_
#ifndef TESSERACT_TRAINING_VALIDATE_GRAPHEME_H_ #ifndef TESSERACT_TRAINING_VALIDATE_GRAPHEME_H_
#define TESSERACT_TRAINING_VALIDATE_GRAPHEME_H_ #define TESSERACT_TRAINING_VALIDATE_GRAPHEME_H_
#include "validator.h" #include "validator.h"
namespace tesseract { namespace tesseract {
// Subclass of Validator that validates and segments generic unicode into // Subclass of Validator that validates and segments generic unicode into
// grapheme clusters, including Latin with diacritics. // grapheme clusters, including Latin with diacritics.
class ValidateGrapheme : public Validator { class ValidateGrapheme : public Validator {
public: public:
ValidateGrapheme(ViramaScript script, bool report_errors) ValidateGrapheme(ViramaScript script, bool report_errors)
: Validator(script, report_errors) {} : Validator(script, report_errors) {}
~ValidateGrapheme() {} ~ValidateGrapheme() {}
protected: protected:
// Consumes the next Grapheme in codes_[codes_used_++...] and copies it to // Consumes the next Grapheme in codes_[codes_used_++...] and copies it to
// parts_ and output_. Returns true if a valid Grapheme was consumed, // parts_ and output_. Returns true if a valid Grapheme was consumed,
// otherwise does not increment codes_used_. // otherwise does not increment codes_used_.
bool ConsumeGraphemeIfValid() override; bool ConsumeGraphemeIfValid() override;
// Returns the CharClass corresponding to the given Unicode ch. // Returns the CharClass corresponding to the given Unicode ch.
CharClass UnicodeToCharClass(char32 ch) const override; CharClass UnicodeToCharClass(char32 ch) const override;
private: private:
// Helper returns true if the sequence prev_ch,ch is invalid. // Helper returns true if the sequence prev_ch,ch is invalid.
bool IsBadlyFormed(char32 prev_ch, char32 ch); bool IsBadlyFormed(char32 prev_ch, char32 ch);
// Helper returns true if the sequence prev_ch,ch is an invalid Indic vowel. // Helper returns true if the sequence prev_ch,ch is an invalid Indic vowel.
static bool IsBadlyFormedIndicVowel(char32 prev_ch, char32 ch); static bool IsBadlyFormedIndicVowel(char32 prev_ch, char32 ch);
// Helper returns true if the sequence prev_ch,ch is invalid Thai. // Helper returns true if the sequence prev_ch,ch is invalid Thai.
static bool IsBadlyFormedThai(char32 prev_ch, char32 ch); static bool IsBadlyFormedThai(char32 prev_ch, char32 ch);
}; };
} // namespace tesseract } // namespace tesseract
#endif // TESSERACT_TRAINING_VALIDATE_GRAPHEME_H_ #endif // TESSERACT_TRAINING_VALIDATE_GRAPHEME_H_
#ifndef TESSERACT_TRAINING_VALIDATE_INDIC_H_ #ifndef TESSERACT_TRAINING_VALIDATE_INDIC_H_
#define TESSERACT_TRAINING_VALIDATE_INDIC_H_ #define TESSERACT_TRAINING_VALIDATE_INDIC_H_
#include "validator.h" #include "validator.h"
namespace tesseract { namespace tesseract {
// Subclass of Validator that validates and segments Indic scripts in the // Subclass of Validator that validates and segments Indic scripts in the
// unicode range 0x900-0xdff (Devanagari-Sinhala). // unicode range 0x900-0xdff (Devanagari-Sinhala).
class ValidateIndic : public Validator { class ValidateIndic : public Validator {
public: public:
ValidateIndic(ViramaScript script, bool report_errors) ValidateIndic(ViramaScript script, bool report_errors)
: Validator(script, report_errors) {} : Validator(script, report_errors) {}
~ValidateIndic() {} ~ValidateIndic() {}
protected: protected:
// Returns whether codes matches the pattern for an Indic Grapheme. // Returns whether codes matches the pattern for an Indic Grapheme.
// Consumes the next Grapheme in codes_[codes_used_++...] and copies it to // Consumes the next Grapheme in codes_[codes_used_++...] and copies it to
// parts_ and output_. Returns true if a valid Grapheme was consumed, // parts_ and output_. Returns true if a valid Grapheme was consumed,
// otherwise does not increment codes_used_. // otherwise does not increment codes_used_.
bool ConsumeGraphemeIfValid() override; bool ConsumeGraphemeIfValid() override;
// Returns the CharClass corresponding to the given Unicode ch. // Returns the CharClass corresponding to the given Unicode ch.
Validator::CharClass UnicodeToCharClass(char32 ch) const override; Validator::CharClass UnicodeToCharClass(char32 ch) const override;
private: private:
// Helper consumes/copies a virama and any associated post-virama joiners. // Helper consumes/copies a virama and any associated post-virama joiners.
bool ConsumeViramaIfValid(IndicPair joiner, bool post_matra); bool ConsumeViramaIfValid(IndicPair joiner, bool post_matra);
// Helper consumes/copies a series of consonants separated by viramas while // Helper consumes/copies a series of consonants separated by viramas while
// valid, but not any vowel or other modifiers. // valid, but not any vowel or other modifiers.
bool ConsumeConsonantHeadIfValid(); bool ConsumeConsonantHeadIfValid();
// Helper consumes/copies a tail part of a consonant, comprising optional // Helper consumes/copies a tail part of a consonant, comprising optional
// matra/piece, vowel modifier, vedic mark, terminating virama. // matra/piece, vowel modifier, vedic mark, terminating virama.
bool ConsumeConsonantTailIfValid(); bool ConsumeConsonantTailIfValid();
// Helper consumes/copies a vowel and optional modifiers. // Helper consumes/copies a vowel and optional modifiers.
bool ConsumeVowelIfValid(); bool ConsumeVowelIfValid();
// Some special unicodes used only for Indic processing. // Some special unicodes used only for Indic processing.
static const char32 kYayana = 0xdba; // Sinhala Ya static const char32 kYayana = 0xdba; // Sinhala Ya
static const char32 kRayana = 0xdbb; // Sinhala Ra static const char32 kRayana = 0xdbb; // Sinhala Ra
}; };
} // namespace tesseract } // namespace tesseract
#endif // TESSERACT_TRAINING_VALIDATE_INDIC_H_ #endif // TESSERACT_TRAINING_VALIDATE_INDIC_H_
#include "validate_khmer.h" #include "validate_khmer.h"
#include "errcode.h" #include "errcode.h"
#include "tprintf.h" #include "tprintf.h"
namespace tesseract { namespace tesseract {
// Returns whether codes matches the pattern for a Khmer Grapheme. // Returns whether codes matches the pattern for a Khmer Grapheme.
// Taken from unicode standard: // Taken from unicode standard:
// http://www.unicode.org/versions/Unicode9.0.0/ch16.pdf. // http://www.unicode.org/versions/Unicode9.0.0/ch16.pdf.
// where it gives: B {R | C} {S {R}}* {{Z} V} {O} {S}, using different notation // where it gives: B {R | C} {S {R}}* {{Z} V} {O} {S}, using different notation
// to the ISCII standard http://varamozhi.sourceforge.net/iscii91.pdf. // to the ISCII standard http://varamozhi.sourceforge.net/iscii91.pdf.
// Translated to the codes used by the CharClass enum: // Translated to the codes used by the CharClass enum:
// C {R | N} {HC {R}}* {{Z|z} M{P}} {D} {HC} // C {R | N} {HC {R}}* {{Z|z} M{P}} {D} {HC}
// Where R is a new symbol (Robat) and N is repurposed as a consonant shifter. // Where R is a new symbol (Robat) and N is repurposed as a consonant shifter.
// Also the Consonant class here includes independent vowels, as they are // Also the Consonant class here includes independent vowels, as they are
// treated the same anyway. // treated the same anyway.
// In the split grapheme mode, the only characters that get grouped are the // In the split grapheme mode, the only characters that get grouped are the
// HC and the {Z|z}M The unicode chapter on Khmer only mentions the joiners in // HC and the {Z|z}M The unicode chapter on Khmer only mentions the joiners in
// the BNF syntax, so who knows what they do. // the BNF syntax, so who knows what they do.
bool ValidateKhmer::ConsumeGraphemeIfValid() { bool ValidateKhmer::ConsumeGraphemeIfValid() {
int num_codes = codes_.size(); int num_codes = codes_.size();
if (codes_used_ == num_codes) return false; if (codes_used_ == num_codes) return false;
if (codes_[codes_used_].first == CharClass::kOther) { if (codes_[codes_used_].first == CharClass::kOther) {
UseMultiCode(1); UseMultiCode(1);
return true; return true;
} }
if (codes_[codes_used_].first != CharClass::kConsonant) { if (codes_[codes_used_].first != CharClass::kConsonant) {
if (report_errors_) { if (report_errors_) {
tprintf("Invalid start of Khmer syllable:0x%x\n", tprintf("Invalid start of Khmer syllable:0x%x\n",
codes_[codes_used_].second); codes_[codes_used_].second);
} }
return false; return false;
} }
if (UseMultiCode(1)) return true; if (UseMultiCode(1)) return true;
if (codes_[codes_used_].first == CharClass::kRobat || if (codes_[codes_used_].first == CharClass::kRobat ||
codes_[codes_used_].first == CharClass::kNukta) { codes_[codes_used_].first == CharClass::kNukta) {
if (UseMultiCode(1)) return true; if (UseMultiCode(1)) return true;
} }
while (codes_used_ + 1 < num_codes && while (codes_used_ + 1 < num_codes &&
codes_[codes_used_].first == CharClass::kVirama && codes_[codes_used_].first == CharClass::kVirama &&
codes_[codes_used_ + 1].first == CharClass::kConsonant) { codes_[codes_used_ + 1].first == CharClass::kConsonant) {
ASSERT_HOST(!CodeOnlyToOutput()); ASSERT_HOST(!CodeOnlyToOutput());
if (UseMultiCode(2)) return true; if (UseMultiCode(2)) return true;
if (codes_[codes_used_].first == CharClass::kRobat) { if (codes_[codes_used_].first == CharClass::kRobat) {
if (UseMultiCode(1)) return true; if (UseMultiCode(1)) return true;
} }
} }
int num_matra_parts = 0; int num_matra_parts = 0;
if (codes_[codes_used_].second == kZeroWidthJoiner || if (codes_[codes_used_].second == kZeroWidthJoiner ||
codes_[codes_used_].second == kZeroWidthNonJoiner) { codes_[codes_used_].second == kZeroWidthNonJoiner) {
if (CodeOnlyToOutput()) { if (CodeOnlyToOutput()) {
if (report_errors_) { if (report_errors_) {
tprintf("Unterminated joiner: 0x%x\n", output_.back()); tprintf("Unterminated joiner: 0x%x\n", output_.back());
} }
return false; return false;
} }
++num_matra_parts; ++num_matra_parts;
} }
// Not quite as shown by the BNF, the matra piece is allowed as a matra on its // Not quite as shown by the BNF, the matra piece is allowed as a matra on its
// own or as an addition to other matras. // own or as an addition to other matras.
if (codes_[codes_used_].first == CharClass::kMatra || if (codes_[codes_used_].first == CharClass::kMatra ||
codes_[codes_used_].first == CharClass::kMatraPiece) { codes_[codes_used_].first == CharClass::kMatraPiece) {
++num_matra_parts; ++num_matra_parts;
if (UseMultiCode(num_matra_parts)) return true; if (UseMultiCode(num_matra_parts)) return true;
} else if (num_matra_parts) { } else if (num_matra_parts) {
if (report_errors_) { if (report_errors_) {
tprintf("Joiner with non-dependent vowel after it!:0x%x 0x%x\n", tprintf("Joiner with non-dependent vowel after it!:0x%x 0x%x\n",
output_.back(), codes_[codes_used_].second); output_.back(), codes_[codes_used_].second);
} }
return false; return false;
} }
if (codes_[codes_used_].first == CharClass::kMatraPiece && if (codes_[codes_used_].first == CharClass::kMatraPiece &&
codes_[codes_used_ - 1].first != CharClass::kMatraPiece) { codes_[codes_used_ - 1].first != CharClass::kMatraPiece) {
if (UseMultiCode(1)) return true; if (UseMultiCode(1)) return true;
} }
if (codes_[codes_used_].first == CharClass::kVowelModifier) { if (codes_[codes_used_].first == CharClass::kVowelModifier) {
if (UseMultiCode(1)) return true; if (UseMultiCode(1)) return true;
} }
if (codes_used_ + 1 < num_codes && if (codes_used_ + 1 < num_codes &&
codes_[codes_used_].first == CharClass::kVirama && codes_[codes_used_].first == CharClass::kVirama &&
codes_[codes_used_ + 1].first == CharClass::kConsonant) { codes_[codes_used_ + 1].first == CharClass::kConsonant) {
ASSERT_HOST(!CodeOnlyToOutput()); ASSERT_HOST(!CodeOnlyToOutput());
if (UseMultiCode(2)) return true; if (UseMultiCode(2)) return true;
} }
return true; return true;
} }
// Maps the unicode ch to its CharClass for Khmer validation. Vedic accents
// and the zero-width joiners are recognized first; everything else is
// classified by its offset from the start of the script's code block.
Validator::CharClass ValidateKhmer::UnicodeToCharClass(char32 ch) const {
  if (IsVedicAccent(ch)) return CharClass::kVedicMark;
  if (ch == kZeroWidthNonJoiner) return CharClass::kZeroWidthNonJoiner;
  if (ch == kZeroWidthJoiner) return CharClass::kZeroWidthJoiner;
  // Offset from the start of the relevant unicode code block aka code page.
  const int offset = ch - static_cast<char32>(script_);
  // Anything in another code block is other.
  if (offset < 0 || offset >= kIndicCodePageSize) return CharClass::kOther;
  if (offset <= 0x33) return CharClass::kConsonant;
  if (offset <= 0x45) return CharClass::kMatra;
  // Single code points that have a class of their own.
  switch (offset) {
    case 0x46:
      return CharClass::kMatraPiece;
    case 0x49:
    case 0x4a:
      return CharClass::kNukta;
    case 0x4c:
      return CharClass::kRobat;
    case 0x52:
      return CharClass::kVirama;
    default:
      break;
  }
  // The rest of the block up to 0x51 are vowel modifiers.
  return offset <= 0x51 ? CharClass::kVowelModifier : CharClass::kOther;
}
} // namespace tesseract } // namespace tesseract
#ifndef TESSERACT_TRAINING_VALIDATE_KHMER_H_ #ifndef TESSERACT_TRAINING_VALIDATE_KHMER_H_
#define TESSERACT_TRAINING_VALIDATE_KHMER_H_ #define TESSERACT_TRAINING_VALIDATE_KHMER_H_
#include "validator.h" #include "validator.h"
namespace tesseract { namespace tesseract {
// Subclass of Validator that validates and segments Khmer. // Subclass of Validator that validates and segments Khmer.
class ValidateKhmer : public Validator { class ValidateKhmer : public Validator {
public: public:
ValidateKhmer(ViramaScript script, bool report_errors) ValidateKhmer(ViramaScript script, bool report_errors)
: Validator(script, report_errors) {} : Validator(script, report_errors) {}
~ValidateKhmer() {} ~ValidateKhmer() {}
protected: protected:
// Returns whether codes matches the pattern for an Khmer Grapheme. // Returns whether codes matches the pattern for an Khmer Grapheme.
// Consumes the next Grapheme in codes_[codes_used_++...] and copies it to // Consumes the next Grapheme in codes_[codes_used_++...] and copies it to
// parts_ and output_. Returns true if a valid Grapheme was consumed, // parts_ and output_. Returns true if a valid Grapheme was consumed,
// otherwise does not increment codes_used_. // otherwise does not increment codes_used_.
bool ConsumeGraphemeIfValid() override; bool ConsumeGraphemeIfValid() override;
// Returns the CharClass corresponding to the given Unicode ch. // Returns the CharClass corresponding to the given Unicode ch.
CharClass UnicodeToCharClass(char32 ch) const override; CharClass UnicodeToCharClass(char32 ch) const override;
}; };
} // namespace tesseract } // namespace tesseract
#endif // TESSERACT_TRAINING_VALIDATE_KHMER_H_ #endif // TESSERACT_TRAINING_VALIDATE_KHMER_H_
#include "validate_myanmar.h" #include "validate_myanmar.h"
#include "errcode.h" #include "errcode.h"
#include "icuerrorcode.h" #include "icuerrorcode.h"
#include "tprintf.h" #include "tprintf.h"
#include "unicode/uchar.h" // From libicu #include "unicode/uchar.h" // From libicu
#include "unicode/uscript.h" // From libicu #include "unicode/uscript.h" // From libicu
namespace tesseract { namespace tesseract {
// Returns whether codes matches the pattern for a Myanmar Grapheme. // Returns whether codes matches the pattern for a Myanmar Grapheme.
// Taken directly from the unicode table 16-3. // Taken directly from the unicode table 16-3.
// See http://www.unicode.org/versions/Unicode9.0.0/ch16.pdf // See http://www.unicode.org/versions/Unicode9.0.0/ch16.pdf
bool ValidateMyanmar::ConsumeGraphemeIfValid() { bool ValidateMyanmar::ConsumeGraphemeIfValid() {
int num_codes = codes_.size(); int num_codes = codes_.size();
if (codes_used_ == num_codes) return true; if (codes_used_ == num_codes) return true;
// Other. // Other.
if (IsMyanmarOther(codes_[codes_used_].second)) { if (IsMyanmarOther(codes_[codes_used_].second)) {
UseMultiCode(1); UseMultiCode(1);
return true; return true;
} }
// Kinzi. // Kinzi.
if (codes_used_ + 2 < num_codes && codes_[codes_used_].second == 0x1004 && if (codes_used_ + 2 < num_codes && codes_[codes_used_].second == 0x1004 &&
codes_[codes_used_ + 1].second == kMyanmarAsat && codes_[codes_used_ + 1].second == kMyanmarAsat &&
codes_[codes_used_ + 2].second == kMyanmarVirama) { codes_[codes_used_ + 2].second == kMyanmarVirama) {
ASSERT_HOST(!CodeOnlyToOutput()); ASSERT_HOST(!CodeOnlyToOutput());
ASSERT_HOST(!CodeOnlyToOutput()); ASSERT_HOST(!CodeOnlyToOutput());
if (UseMultiCode(3)) return true; if (UseMultiCode(3)) return true;
} }
// Base consonant/vowel. NOTE that since everything in Myanmar appears to be // Base consonant/vowel. NOTE that since everything in Myanmar appears to be
// optional, except the base, this is the only place where invalid input can // optional, except the base, this is the only place where invalid input can
// be detected and false returned. // be detected and false returned.
if (IsMyanmarLetter(codes_[codes_used_].second)) { if (IsMyanmarLetter(codes_[codes_used_].second)) {
if (UseMultiCode(1)) return true; if (UseMultiCode(1)) return true;
} else { } else {
if (report_errors_) { if (report_errors_) {
tprintf("Invalid start of Myanmar syllable:0x%x\n", tprintf("Invalid start of Myanmar syllable:0x%x\n",
codes_[codes_used_].second); codes_[codes_used_].second);
} }
return false; // One of these is required. return false; // One of these is required.
} }
if (ConsumeSubscriptIfPresent()) return true; if (ConsumeSubscriptIfPresent()) return true;
ConsumeOptionalSignsIfPresent(); ConsumeOptionalSignsIfPresent();
// What we have consumed so far is a valid syllable. // What we have consumed so far is a valid syllable.
return true; return true;
} }
// TODO(rays) Unlike the other scripts, no intermediate coding is used here,
// because the content of table 16-3 has little correspondence with the char
// classes of the Indic languages. (Experts may disagree and improve!)
// Unicode table 16-3 is basically a long list of optional characters, which
// can be coded quite easily.
// Unfortunately, table 16-3 doesn't include even half the Myanmar unicodes!!
// The table also allows sequences that still result in dotted circles!!
// So with a lot of guesswork the rest have been added in a reasonable place.
// The mapping is therefore minimal: Myanmar letters are consonants and
// everything else is other.
Validator::CharClass ValidateMyanmar::UnicodeToCharClass(char32 ch) const {
  return IsMyanmarLetter(ch) ? CharClass::kConsonant : CharClass::kOther;
}
// Helper consumes/copies a virama and any subscript consonant. // Helper consumes/copies a virama and any subscript consonant.
// Returns true if the end of input is reached. // Returns true if the end of input is reached.
bool ValidateMyanmar::ConsumeSubscriptIfPresent() { bool ValidateMyanmar::ConsumeSubscriptIfPresent() {
// Subscript consonant. It appears there can be only one. // Subscript consonant. It appears there can be only one.
int num_codes = codes_.size(); int num_codes = codes_.size();
if (codes_used_ + 1 < num_codes && if (codes_used_ + 1 < num_codes &&
codes_[codes_used_].second == kMyanmarVirama) { codes_[codes_used_].second == kMyanmarVirama) {
if (IsMyanmarLetter(codes_[codes_used_ + 1].second)) { if (IsMyanmarLetter(codes_[codes_used_ + 1].second)) {
ASSERT_HOST(!CodeOnlyToOutput()); ASSERT_HOST(!CodeOnlyToOutput());
if (UseMultiCode(2)) return true; if (UseMultiCode(2)) return true;
} }
} }
return false; return false;
} }
// Helper consumes/copies a series of optional signs. // Helper consumes/copies a series of optional signs.
// Returns true if the end of input is reached. // Returns true if the end of input is reached.
bool ValidateMyanmar::ConsumeOptionalSignsIfPresent() { bool ValidateMyanmar::ConsumeOptionalSignsIfPresent() {
// The following characters are allowed, all optional, and in sequence. // The following characters are allowed, all optional, and in sequence.
// An exception is kMyanmarMedialYa, which can include kMyanmarAsat. // An exception is kMyanmarMedialYa, which can include kMyanmarAsat.
const std::vector<char32> kMedials({kMyanmarAsat, kMyanmarMedialYa, 0x103c, const std::vector<char32> kMedials({kMyanmarAsat, kMyanmarMedialYa, 0x103c,
0x103d, 0x103e, 0x105e, 0x105f, 0x1060, 0x103d, 0x103e, 0x105e, 0x105f, 0x1060,
0x1081, 0x1031}); 0x1081, 0x1031});
for (char32 ch : kMedials) { for (char32 ch : kMedials) {
if (codes_[codes_used_].second == ch) { if (codes_[codes_used_].second == ch) {
if (UseMultiCode(1)) return true; if (UseMultiCode(1)) return true;
if (ch == kMyanmarMedialYa && if (ch == kMyanmarMedialYa &&
codes_[codes_used_].second == kMyanmarAsat) { codes_[codes_used_].second == kMyanmarAsat) {
if (UseMultiCode(1)) return true; if (UseMultiCode(1)) return true;
} }
} }
} }
// Vowel sign i, ii, ai. // Vowel sign i, ii, ai.
char32 ch = codes_[codes_used_].second; char32 ch = codes_[codes_used_].second;
if (ch == 0x102d || ch == 0x102e || ch == 0x1032) { if (ch == 0x102d || ch == 0x102e || ch == 0x1032) {
if (UseMultiCode(1)) return true; if (UseMultiCode(1)) return true;
} }
// Vowel sign u, uu, and extensions. // Vowel sign u, uu, and extensions.
ch = codes_[codes_used_].second; ch = codes_[codes_used_].second;
if (ch == 0x102f || ch == 0x1030 || (0x1056 <= ch && ch <= 0x1059) || if (ch == 0x102f || ch == 0x1030 || (0x1056 <= ch && ch <= 0x1059) ||
ch == 0x1062 || ch == 0x1067 || ch == 0x1068 || ch == 0x1062 || ch == 0x1067 || ch == 0x1068 ||
(0x1071 <= ch && ch <= 0x1074) || (0x1083 <= ch && ch <= 0x1086) || (0x1071 <= ch && ch <= 0x1074) || (0x1083 <= ch && ch <= 0x1086) ||
ch == 0x109c || ch == 0x109d) { ch == 0x109c || ch == 0x109d) {
if (UseMultiCode(1)) return true; if (UseMultiCode(1)) return true;
} }
// Tall aa, aa with optional asat. // Tall aa, aa with optional asat.
if (codes_[codes_used_].second == 0x102b || if (codes_[codes_used_].second == 0x102b ||
codes_[codes_used_].second == 0x102c) { codes_[codes_used_].second == 0x102c) {
if (UseMultiCode(1)) return true; if (UseMultiCode(1)) return true;
if (codes_[codes_used_].second == kMyanmarAsat) { if (codes_[codes_used_].second == kMyanmarAsat) {
if (UseMultiCode(1)) return true; if (UseMultiCode(1)) return true;
} }
} }
// The following characters are allowed, all optional, and in sequence. // The following characters are allowed, all optional, and in sequence.
const std::vector<char32> kSigns({0x1036, 0x1037}); const std::vector<char32> kSigns({0x1036, 0x1037});
for (char32 ch : kSigns) { for (char32 ch : kSigns) {
if (codes_[codes_used_].second == ch) { if (codes_[codes_used_].second == ch) {
if (UseMultiCode(1)) return true; if (UseMultiCode(1)) return true;
} }
} }
// Tone mark extensions. // Tone mark extensions.
ch = codes_[codes_used_].second; ch = codes_[codes_used_].second;
if (ch == 0x1038 || ch == kMyanmarAsat || ch == 0x1063 || ch == 0x1064 || if (ch == 0x1038 || ch == kMyanmarAsat || ch == 0x1063 || ch == 0x1064 ||
(0x1069 <= ch && ch <= 0x106d) || (0x1087 <= ch && ch <= 0x108d) || (0x1069 <= ch && ch <= 0x106d) || (0x1087 <= ch && ch <= 0x108d) ||
ch == 0x108f || ch == 0x109a || ch == 0x109b || ch == 0x108f || ch == 0x109a || ch == 0x109b ||
(0xaa7b <= ch && ch <= 0xaa7d)) { (0xaa7b <= ch && ch <= 0xaa7d)) {
if (UseMultiCode(1)) return true; if (UseMultiCode(1)) return true;
} }
return false; return false;
} }
// Returns true if the unicode is a Myanmar "letter" including consonants
// and independent vowels. Although table 16-3 distinguishes between some
// base consonants and vowels, the extensions make no such distinction, so
// they all go into a single bucket.
/* static */
bool ValidateMyanmar::IsMyanmarLetter(char32 ch) {
  // Inclusive code point ranges that count as letters.
  static const struct {
    char32 first;
    char32 last;
  } kLetterRanges[] = {
      {0x1000, 0x102a}, {0x103f, 0x103f}, {0x1050, 0x1055}, {0x105a, 0x105d},
      {0x1061, 0x1061}, {0x1065, 0x1066}, {0x106e, 0x1070}, {0x1075, 0x1080},
      {0x108e, 0x108e}, {0xa9e0, 0xa9ef}, {0xa9fa, 0xa9ff}, {0xaa60, 0xaa73},
      {0xaa7a, 0xaa7a}, {0xaa7e, 0xaa7f},
  };
  for (const auto& range : kLetterRanges) {
    if (range.first <= ch && ch <= range.last) return true;
  }
  return false;
}
// Returns true if ch is a Myanmar digit or other symbol that does not take
// part in being a syllable. Anything that ICU does not assign to the Myanmar
// script also counts as other, except for the two zero-width joiners.
/* static */
bool ValidateMyanmar::IsMyanmarOther(char32 ch) {
  IcuErrorCode err;
  const UScriptCode script_code = uscript_getScript(ch, err);
  const bool is_joiner = ch == Validator::kZeroWidthJoiner ||
                         ch == Validator::kZeroWidthNonJoiner;
  if (script_code != USCRIPT_MYANMAR && !is_joiner) return true;
  // Code point ranges inside the Myanmar blocks that are treated as other.
  if (0x1040 <= ch && ch <= 0x1049) return true;
  if (0x1090 <= ch && ch <= 0x1099) return true;
  if (0x109c <= ch && ch <= 0x109d) return true;
  if (0xa9f0 <= ch && ch <= 0xa9f9) return true;
  return 0xaa74 <= ch && ch <= 0xaa79;
}
} // namespace tesseract } // namespace tesseract
#ifndef TESSERACT_TRAINING_VALIDATE_MYANMAR_H_ #ifndef TESSERACT_TRAINING_VALIDATE_MYANMAR_H_
#define TESSERACT_TRAINING_VALIDATE_MYANMAR_H_ #define TESSERACT_TRAINING_VALIDATE_MYANMAR_H_
#include "validator.h" #include "validator.h"
namespace tesseract { namespace tesseract {
// Subclass of Validator that validates and segments Myanmar. // Subclass of Validator that validates and segments Myanmar.
class ValidateMyanmar : public Validator { class ValidateMyanmar : public Validator {
public: public:
ValidateMyanmar(ViramaScript script, bool report_errors) ValidateMyanmar(ViramaScript script, bool report_errors)
: Validator(script, report_errors) {} : Validator(script, report_errors) {}
~ValidateMyanmar() {} ~ValidateMyanmar() {}
protected: protected:
// Returns whether codes matches the pattern for a Myanmar Grapheme. // Returns whether codes matches the pattern for a Myanmar Grapheme.
// Consumes the next Grapheme in codes_[codes_used_++...] and copies it to // Consumes the next Grapheme in codes_[codes_used_++...] and copies it to
// parts_ and output_. Returns true if a valid Grapheme was consumed, // parts_ and output_. Returns true if a valid Grapheme was consumed,
// otherwise does not increment codes_used_. // otherwise does not increment codes_used_.
bool ConsumeGraphemeIfValid() override; bool ConsumeGraphemeIfValid() override;
// Returns the CharClass corresponding to the given Unicode ch. // Returns the CharClass corresponding to the given Unicode ch.
Validator::CharClass UnicodeToCharClass(char32 ch) const override; Validator::CharClass UnicodeToCharClass(char32 ch) const override;
private: private:
// Helper consumes/copies a virama and any subscript consonant. // Helper consumes/copies a virama and any subscript consonant.
// Returns true if the end of input is reached. // Returns true if the end of input is reached.
bool ConsumeSubscriptIfPresent(); bool ConsumeSubscriptIfPresent();
// Helper consumes/copies a series of optional signs. // Helper consumes/copies a series of optional signs.
// Returns true if the end of input is reached. // Returns true if the end of input is reached.
bool ConsumeOptionalSignsIfPresent(); bool ConsumeOptionalSignsIfPresent();
// Returns true if the unicode is a Myanmar "letter" including consonants // Returns true if the unicode is a Myanmar "letter" including consonants
// and independent vowels. Although table 16-3 distinguishes between some // and independent vowels. Although table 16-3 distinguishes between some
// base consonants and vowels, the extensions make no such distinction, so we // base consonants and vowels, the extensions make no such distinction, so we
// put them all into a single bucket. // put them all into a single bucket.
static bool IsMyanmarLetter(char32 ch); static bool IsMyanmarLetter(char32 ch);
// Returns true if ch is a Myanmar digit or other symbol that does not take // Returns true if ch is a Myanmar digit or other symbol that does not take
// part in being a syllable. // part in being a syllable.
static bool IsMyanmarOther(char32 ch); static bool IsMyanmarOther(char32 ch);
// Some special unicodes used only for Myanmar processing. // Some special unicodes used only for Myanmar processing.
static const char32 kMyanmarAsat = 0x103a; static const char32 kMyanmarAsat = 0x103a;
static const char32 kMyanmarMedialYa = 0x103b; static const char32 kMyanmarMedialYa = 0x103b;
}; };
} // namespace tesseract } // namespace tesseract
#endif // TESSERACT_TRAINING_VALIDATE_MYANMAR_H_ #endif // TESSERACT_TRAINING_VALIDATE_MYANMAR_H_
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册