提交 104fe793 编写于 作者: E Egor Pugin

Move training to src.

上级 ca5c15e6
......@@ -308,7 +308,7 @@ if (BUILD_TESTS AND EXISTS ${PROJECT_SOURCE_DIR}/googletest/CMakeLists.txt)
endif()
# The training tools were moved from the top-level training/ directory to
# src/training/, so only the new location is added to the build. Adding the
# old directory as well would fail at configure time because it no longer
# contains a CMakeLists.txt.
if (BUILD_TRAINING_TOOLS)
    add_subdirectory(src/training)
endif()
# Read the NAME property of the libtesseract target into tesseract_NAME.
get_target_property(tesseract_NAME libtesseract NAME)
......
......@@ -502,7 +502,7 @@ AC_CONFIG_FILES([java/com/google/scrollview/Makefile])
AC_CONFIG_FILES([java/com/google/scrollview/events/Makefile])
AC_CONFIG_FILES([java/com/google/scrollview/ui/Makefile])
AC_CONFIG_FILES([doc/Makefile])
# The training tools now live in src/training; generate only that Makefile
# (the old training/Makefile.in no longer exists after the move).
AM_COND_IF([ENABLE_TRAINING], [AC_CONFIG_FILES([src/training/Makefile])])
AC_OUTPUT
# Final message
......
......@@ -172,7 +172,7 @@ projects:
tessopt:
    type: lib
    static_only: true
    # Sources were moved from training/ to src/training/; the include
    # directory has to follow them or the headers are no longer found.
    files: src/training/tessopt.*
    include_directories: src/training
    dependencies: libtesseract
......@@ -180,104 +180,104 @@ projects:
type: lib
static_only: true
# Sources were moved from training/ to src/training/; only the new paths
# are listed so that the files key does not reference missing files.
files:
    - src/training/commandlineflags.cpp
    - src/training/commandlineflags.h
    - src/training/commontraining.cpp
    - src/training/commontraining.h
# The headers live next to the sources, so the include directory moves too.
include_directories: src/training
dependencies:
    - tessopt
# Training executables. Their sources were moved from training/ to
# src/training/, so each project lists only the new location (a second
# `files` key with the old path would be a duplicate mapping key).
ambiguous_words:
    files: src/training/ambiguous_words.cpp
    dependencies:
        - libtesseract
classifier_tester:
    files: src/training/classifier_tester.cpp
    dependencies: common_training
combine_lang_model:
    files: src/training/combine_lang_model.cpp
    dependencies: unicharset_training
combine_tessdata:
    files: src/training/combine_tessdata.cpp
    dependencies: libtesseract
cntraining:
    files: src/training/cntraining.cpp
    dependencies: common_training
dawg2wordlist:
    files: src/training/dawg2wordlist.cpp
    dependencies: libtesseract
mftraining:
    files:
        - src/training/mftraining.cpp
        - src/training/mergenf.*
    dependencies: common_training
shapeclustering:
    files: src/training/shapeclustering.cpp
    dependencies: common_training
unicharset_extractor:
    files: src/training/unicharset_extractor.cpp
    dependencies: unicharset_training
wordlist2dawg:
    files: src/training/wordlist2dawg.cpp
    dependencies: libtesseract
unicharset_training:
    type: lib
    static_only: true
    # Sources were moved from training/ to src/training/; only the new
    # paths are listed.
    files:
        - src/training/fileio.*
        - src/training/icuerrorcode.h
        - src/training/lang_model_helpers.*
        - src/training/lstmtester.*
        - src/training/normstrngs.*
        - src/training/unicharset_training_utils.*
        - src/training/validat.*
    # The headers live next to the sources, so the include directory moves too.
    include_directories: src/training
    dependencies:
        - common_training
        - pvt.cppan.demo.unicode.icu.i18n
# LSTM and unicharset tools; sources now live in src/training/.
lstmeval:
    files: src/training/lstmeval.cpp
    dependencies: unicharset_training
lstmtraining:
    files: src/training/lstmtraining.cpp
    dependencies: unicharset_training
set_unicharset_properties:
    files: src/training/set_unicharset_properties.cpp
    dependencies: unicharset_training
text2image:
    # Sources were moved from training/ to src/training/; only the new
    # paths are listed.
    files:
        - src/training/text2image.cpp
        - src/training/boxchar.cpp
        - src/training/boxchar.h
        - src/training/degradeimage.cpp
        - src/training/degradeimage.h
        - src/training/ligature_table.cpp
        - src/training/ligature_table.h
        - src/training/normstrngs.cpp
        - src/training/normstrngs.h
        - src/training/pango_font_info.cpp
        - src/training/pango_font_info.h
        - src/training/stringrenderer.cpp
        - src/training/stringrenderer.h
        - src/training/tlog.cpp
        - src/training/tlog.h
        - src/training/util.h
        - src/training/icuerrorcode.h
    dependencies:
        - unicharset_training
......
/**********************************************************************
* File: degradeimage.h
* Description: Function to degrade an image (usually of text) as if it
* has been printed and then scanned.
* Authors: Ray Smith
* Created: Tue Nov 19 2013
*
* (C) Copyright 2013, Google Inc.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
**********************************************************************/
#ifndef TESSERACT_TRAINING_DEGRADEIMAGE_H_
#define TESSERACT_TRAINING_DEGRADEIMAGE_H_
#include "allheaders.h"
#include "genericvector.h"
#include "helpers.h" // For TRand.
#include "rect.h"
namespace tesseract {
// Declarations of helpers that degrade or distort an image (usually of text)
// as if it had been printed and then scanned.

// Degrades the pix as if by a print/copy/scan cycle.
//   input:    must be 8 bit grey (binary with values 0 and 255 is OK). The
//             input image is destroyed by the call.
//   exposure: > 0 corresponds to darkening on the copier, < 0 to lighter and
//             0 to not copied.
//   rotation: if not nullptr, the clockwise rotation in radians is saved
//             there.
// Returns a different image from the input.
struct Pix* DegradeImage(struct Pix* input, int exposure, TRand* randomizer,
float* rotation);
// Creates and returns a Pix distorted by various means according to the bool
// flags. If boxes is not nullptr, the boxes are resized/positioned according to
// any spatial distortion and also by the integer reduction factor
// box_reduction so they will match what the network will output.
// Returns nullptr on error. The returned Pix must be pixDestroyed.
Pix* PrepareDistortedPix(const Pix* pix, bool perspective, bool invert,
bool white_noise, bool smooth_noise, bool blur,
int box_reduction, TRand* randomizer,
GenericVector<TBOX>* boxes);
// Distorts anything that has a non-null pointer (pix and/or boxes) with the
// same pseudo-random perspective distortion. Width and height only need to be
// set if there is no pix. If there is a pix, then they will be taken from
// there.
void GeneratePerspectiveDistortion(int width, int height, TRand* randomizer,
Pix** pix, GenericVector<TBOX>* boxes);
// Computes the coefficients of a randomized projective transformation.
// The image transform requires backward transformation coefficients
// (im_coeffs), and the box transform the forward coefficients (box_coeffs).
// NOTE(review): ownership of the arrays returned through im_coeffs and
// box_coeffs is not visible from this header - confirm in degradeimage.cpp.
// Returns the incolor arg to pixProjective.
int ProjectiveCoeffs(int width, int height, TRand* randomizer,
float** im_coeffs, float** box_coeffs);
} // namespace tesseract
#endif // TESSERACT_TRAINING_DEGRADEIMAGE_H_
/**********************************************************************
* File: degradeimage.h
* Description: Function to degrade an image (usually of text) as if it
* has been printed and then scanned.
* Authors: Ray Smith
* Created: Tue Nov 19 2013
*
* (C) Copyright 2013, Google Inc.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
**********************************************************************/
#ifndef TESSERACT_TRAINING_DEGRADEIMAGE_H_
#define TESSERACT_TRAINING_DEGRADEIMAGE_H_
#include "allheaders.h"
#include "genericvector.h"
#include "helpers.h" // For TRand.
#include "rect.h"
namespace tesseract {
// Declarations of helpers that degrade or distort an image (usually of text)
// as if it had been printed and then scanned.

// Degrades the pix as if by a print/copy/scan cycle.
//   input:    must be 8 bit grey (binary with values 0 and 255 is OK). The
//             input image is destroyed by the call.
//   exposure: > 0 corresponds to darkening on the copier, < 0 to lighter and
//             0 to not copied.
//   rotation: if not nullptr, the clockwise rotation in radians is saved
//             there.
// Returns a different image from the input.
struct Pix* DegradeImage(struct Pix* input, int exposure, TRand* randomizer,
float* rotation);
// Creates and returns a Pix distorted by various means according to the bool
// flags. If boxes is not nullptr, the boxes are resized/positioned according to
// any spatial distortion and also by the integer reduction factor
// box_reduction so they will match what the network will output.
// Returns nullptr on error. The returned Pix must be pixDestroyed.
Pix* PrepareDistortedPix(const Pix* pix, bool perspective, bool invert,
bool white_noise, bool smooth_noise, bool blur,
int box_reduction, TRand* randomizer,
GenericVector<TBOX>* boxes);
// Distorts anything that has a non-null pointer (pix and/or boxes) with the
// same pseudo-random perspective distortion. Width and height only need to be
// set if there is no pix. If there is a pix, then they will be taken from
// there.
void GeneratePerspectiveDistortion(int width, int height, TRand* randomizer,
Pix** pix, GenericVector<TBOX>* boxes);
// Computes the coefficients of a randomized projective transformation.
// The image transform requires backward transformation coefficients
// (im_coeffs), and the box transform the forward coefficients (box_coeffs).
// NOTE(review): ownership of the arrays returned through im_coeffs and
// box_coeffs is not visible from this header - confirm in degradeimage.cpp.
// Returns the incolor arg to pixProjective.
int ProjectiveCoeffs(int width, int height, TRand* randomizer,
float** im_coeffs, float** box_coeffs);
} // namespace tesseract
#endif // TESSERACT_TRAINING_DEGRADEIMAGE_H_
/**********************************************************************
* File: icuerrorcode.h
* Description: Wrapper class for UErrorCode, with conversion operators for
* direct use in ICU C and C++ APIs.
* Author: Fredrik Roubert
* Created: Thu July 4 2013
*
* Features:
* - The constructor initializes the internal UErrorCode to U_ZERO_ERROR,
* removing one common source of errors.
* - Same use in C APIs taking a UErrorCode* (pointer) and C++ taking
* UErrorCode& (reference), via conversion operators.
* - Automatic checking for success when it goes out of scope. On failure,
* the destructor will log an error message and exit.
*
* Most of ICU will handle errors gracefully and provide sensible fallbacks.
* Using IcuErrorCode, it is therefore possible to write very compact code
* that does sensible things on failure and provides logging for debugging.
*
* Example:
* IcuErrorCode icuerrorcode;
* return collator.compareUTF8(a, b, icuerrorcode) == UCOL_EQUAL;
*
* (C) Copyright 2013, Google Inc.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
**********************************************************************/
#ifndef TESSERACT_CCUTIL_ICUERRORCODE_H_
#define TESSERACT_CCUTIL_ICUERRORCODE_H_
#include "tprintf.h"
#include "unicode/errorcode.h" // From libicu
namespace tesseract {
// Subclass of icu::ErrorCode that checks for failure when it goes out of
// scope: if the object is in a failure state at destruction, the ICU error
// name is printed with tprintf and the process exits with the error code as
// its exit status.
class IcuErrorCode : public icu::ErrorCode {
 public:
  IcuErrorCode() {}
  ~IcuErrorCode() override {
    if (isFailure()) {
      handleFailure();
    }
  }

 protected:
  // Logs the ICU error name and terminates the process.
  void handleFailure() const override {
    // The message ends with a newline so that it is emitted as a complete
    // line before the process exits.
    tprintf("ICU ERROR: %s\n", errorName());
    exit(errorCode);
  }

 private:
  // Disallow implicit copying of object.
  IcuErrorCode(const IcuErrorCode&) = delete;
  void operator=(const IcuErrorCode&) = delete;
};
} // namespace tesseract
#endif // TESSERACT_CCUTIL_ICUERRORCODE_H_
/**********************************************************************
* File: icuerrorcode.h
* Description: Wrapper class for UErrorCode, with conversion operators for
* direct use in ICU C and C++ APIs.
* Author: Fredrik Roubert
* Created: Thu July 4 2013
*
* Features:
* - The constructor initializes the internal UErrorCode to U_ZERO_ERROR,
* removing one common source of errors.
* - Same use in C APIs taking a UErrorCode* (pointer) and C++ taking
* UErrorCode& (reference), via conversion operators.
* - Automatic checking for success when it goes out of scope. On failure,
* the destructor will log an error message and exit.
*
* Most of ICU will handle errors gracefully and provide sensible fallbacks.
* Using IcuErrorCode, it is therefore possible to write very compact code
* that does sensible things on failure and provides logging for debugging.
*
* Example:
* IcuErrorCode icuerrorcode;
* return collator.compareUTF8(a, b, icuerrorcode) == UCOL_EQUAL;
*
* (C) Copyright 2013, Google Inc.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
**********************************************************************/
#ifndef TESSERACT_CCUTIL_ICUERRORCODE_H_
#define TESSERACT_CCUTIL_ICUERRORCODE_H_
#include "tprintf.h"
#include "unicode/errorcode.h" // From libicu
namespace tesseract {
// Subclass of icu::ErrorCode that checks for failure when it goes out of
// scope: if the object is in a failure state at destruction, the ICU error
// name is printed with tprintf and the process exits with the error code as
// its exit status.
class IcuErrorCode : public icu::ErrorCode {
 public:
  IcuErrorCode() {}
  ~IcuErrorCode() override {
    if (isFailure()) {
      handleFailure();
    }
  }

 protected:
  // Logs the ICU error name and terminates the process.
  void handleFailure() const override {
    // The message ends with a newline so that it is emitted as a complete
    // line before the process exits.
    tprintf("ICU ERROR: %s\n", errorName());
    exit(errorCode);
  }

 private:
  // Disallow implicit copying of object.
  IcuErrorCode(const IcuErrorCode&) = delete;
  void operator=(const IcuErrorCode&) = delete;
};
} // namespace tesseract
#endif // TESSERACT_CCUTIL_ICUERRORCODE_H_
/******************************************************************************
** Filename: MergeNF.c
** Purpose: Program for merging similar nano-feature protos
** Author: Dan Johnson
** History: Wed Nov 21 09:55:23 1990, DSJ, Created.
**
** (c) Copyright Hewlett-Packard Company, 1988.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
******************************************************************************/
#ifndef TESSERACT_TRAINING_MERGENF_H_
#define TESSERACT_TRAINING_MERGENF_H_
/**----------------------------------------------------------------------------
Include Files and Type Defines
----------------------------------------------------------------------------**/
#include "protos.h"
#include "cluster.h"
#include "ocrfeatures.h"
#include "callcpp.h"
#include "picofeat.h"
// Thresholds and file-name suffixes used by the proto-merging code.
// NOTE(review): the exact meaning of each threshold is determined by its
// users in the implementation file, which is not visible here.
#define WORST_MATCH_ALLOWED (0.9)
#define WORST_EVIDENCE (1.0)
#define MAX_LENGTH_MISMATCH (2.0 * GetPicoFeatureLength ())
#define PROTO_SUFFIX ".mf.p"
#define CONFIG_SUFFIX ".cl"
#define NO_PROTO (-1)

// Indices into a parameter array; used by the accessor macros below.
#define XPOSITION 0
#define YPOSITION 1
#define MFLENGTH 2
#define ORIENTATION 3

// Rectangle described by its minimum and maximum X and Y coordinates.
typedef struct
{
  FLOAT32 MinX, MaxX, MinY, MaxY;
} FRECT;

/**----------------------------------------------------------------------------
Public Macros
----------------------------------------------------------------------------**/
// Accessors that pick one parameter out of the array M by its index.
#define CenterX(M) ( (M)[XPOSITION] )
#define CenterY(M) ( (M)[YPOSITION] )
#define LengthOf(M) ( (M)[MFLENGTH] )
#define OrientationOf(M) ( (M)[ORIENTATION] )

/**----------------------------------------------------------------------------
Public Function Prototypes
----------------------------------------------------------------------------**/
FLOAT32 CompareProtos(PROTO p1, PROTO p2);

void ComputeMergedProto(PROTO p1,
                        PROTO p2,
                        FLOAT32 w1,
                        FLOAT32 w2,
                        PROTO MergedProto);

int FindClosestExistingProto(CLASS_TYPE Class,
                             int NumMerged[],
                             PROTOTYPE *Prototype);

void MakeNewFromOld(PROTO New, PROTOTYPE *Old);

FLOAT32 SubfeatureEvidence(FEATURE Feature, PROTO Proto);

// The parameter is declared without the `register` storage class: it is
// deprecated since C++11 and not allowed from C++17 on. The storage class is
// not part of the function type, so callers and the definition are unaffected.
double EvidenceOf(double Similarity);

BOOL8 DummyFastMatch(FEATURE Feature, PROTO Proto);

void ComputePaddedBoundingBox(PROTO Proto,
                              FLOAT32 TangentPad,
                              FLOAT32 OrthogonalPad,
                              FRECT *BoundingBox);

BOOL8 PointInside(FRECT *Rectangle, FLOAT32 X, FLOAT32 Y);
#endif // TESSERACT_TRAINING_MERGENF_H_
/******************************************************************************
** Filename: MergeNF.c
** Purpose: Program for merging similar nano-feature protos
** Author: Dan Johnson
** History: Wed Nov 21 09:55:23 1990, DSJ, Created.
**
** (c) Copyright Hewlett-Packard Company, 1988.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
******************************************************************************/
#ifndef TESSERACT_TRAINING_MERGENF_H_
#define TESSERACT_TRAINING_MERGENF_H_
/**----------------------------------------------------------------------------
Include Files and Type Defines
----------------------------------------------------------------------------**/
#include "protos.h"
#include "cluster.h"
#include "ocrfeatures.h"
#include "callcpp.h"
#include "picofeat.h"
// Thresholds and file-name suffixes used by the proto-merging code.
// NOTE(review): the exact meaning of each threshold is determined by its
// users in the implementation file, which is not visible here.
#define WORST_MATCH_ALLOWED (0.9)
#define WORST_EVIDENCE (1.0)
#define MAX_LENGTH_MISMATCH (2.0 * GetPicoFeatureLength ())
#define PROTO_SUFFIX ".mf.p"
#define CONFIG_SUFFIX ".cl"
#define NO_PROTO (-1)

// Indices into a parameter array; used by the accessor macros below.
#define XPOSITION 0
#define YPOSITION 1
#define MFLENGTH 2
#define ORIENTATION 3

// Rectangle described by its minimum and maximum X and Y coordinates.
typedef struct
{
  FLOAT32 MinX, MaxX, MinY, MaxY;
} FRECT;

/**----------------------------------------------------------------------------
Public Macros
----------------------------------------------------------------------------**/
// Accessors that pick one parameter out of the array M by its index.
#define CenterX(M) ( (M)[XPOSITION] )
#define CenterY(M) ( (M)[YPOSITION] )
#define LengthOf(M) ( (M)[MFLENGTH] )
#define OrientationOf(M) ( (M)[ORIENTATION] )

/**----------------------------------------------------------------------------
Public Function Prototypes
----------------------------------------------------------------------------**/
FLOAT32 CompareProtos(PROTO p1, PROTO p2);

void ComputeMergedProto(PROTO p1,
                        PROTO p2,
                        FLOAT32 w1,
                        FLOAT32 w2,
                        PROTO MergedProto);

int FindClosestExistingProto(CLASS_TYPE Class,
                             int NumMerged[],
                             PROTOTYPE *Prototype);

void MakeNewFromOld(PROTO New, PROTOTYPE *Old);

FLOAT32 SubfeatureEvidence(FEATURE Feature, PROTO Proto);

// The parameter is declared without the `register` storage class: it is
// deprecated since C++11 and not allowed from C++17 on. The storage class is
// not part of the function type, so callers and the definition are unaffected.
double EvidenceOf(double Similarity);

BOOL8 DummyFastMatch(FEATURE Feature, PROTO Proto);

void ComputePaddedBoundingBox(PROTO Proto,
                              FLOAT32 TangentPad,
                              FLOAT32 OrthogonalPad,
                              FRECT *BoundingBox);

BOOL8 PointInside(FRECT *Rectangle, FLOAT32 X, FLOAT32 Y);
#endif // TESSERACT_TRAINING_MERGENF_H_
/**********************************************************************
* File: tlog.cpp
* Description: Variant of printf with logging level controllable by a
* commandline flag.
* Author: Ranjith Unnikrishnan
* Created: Wed Nov 20 2013
*
* (C) Copyright 2013, Google Inc.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#include "tlog.h"
// Integer flag tlog_level with default value 0; the string is its help text.
// NOTE(review): presumably this is the definition matching the
// DECLARE_INT_PARAM_FLAG(tlog_level) in tlog.h - confirm in commandlineflags.h.
INT_PARAM_FLAG(tlog_level, 0, "Minimum logging level for tlog() output");
/**********************************************************************
* File: tlog.cpp
* Description: Variant of printf with logging level controllable by a
* commandline flag.
* Author: Ranjith Unnikrishnan
* Created: Wed Nov 20 2013
*
* (C) Copyright 2013, Google Inc.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#include "tlog.h"
// Integer flag tlog_level with default value 0; the string is its help text.
// NOTE(review): presumably this is the definition matching the
// DECLARE_INT_PARAM_FLAG(tlog_level) in tlog.h - confirm in commandlineflags.h.
INT_PARAM_FLAG(tlog_level, 0, "Minimum logging level for tlog() output");
/**********************************************************************
* File: tlog.h
* Description: Variant of printf with logging level controllable by a
* commandline flag.
* Author: Ranjith Unnikrishnan
* Created: Wed Nov 20 2013
*
* (C) Copyright 2013, Google Inc.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#ifndef TESSERACT_TRAINING_TLOG_H_
#define TESSERACT_TRAINING_TLOG_H_
#include "commandlineflags.h"
#include "errcode.h"
#include "tprintf.h"
// Declares the tlog_level flag; the macros in this header read it as
// FLAGS_tlog_level.
DECLARE_INT_PARAM_FLAG(tlog_level);
// Variant guarded by the numeric logging level parameter FLAGS_tlog_level
// (default 0). Code using ParseCommandLineFlags() can control its value using
// the --tlog_level commandline argument. Otherwise it must be specified in a
// config file like other params.
// tlog(level, format, ...): forwards the arguments after level to
// tprintf_internal when FLAGS_tlog_level >= level, otherwise does nothing.
// The level argument is parenthesized so that an expression with lower
// precedence than >= (e.g. a conditional expression) is compared as a whole.
// NOTE(review): the expansion is a bare { ... } block, so
// `if (c) tlog(...); else ...` does not compile; the shape is kept unchanged
// because existing callers may rely on it.
#define tlog(level, ...) {              \
  if (FLAGS_tlog_level >= (level)) {    \
    tprintf_internal(__VA_ARGS__);      \
  }                                     \
}
// Evaluates to true when output at the given level is enabled.
#define TLOG_IS_ON(level) (FLAGS_tlog_level >= (level))
#endif // TESSERACT_TRAINING_TLOG_H_
/**********************************************************************
* File: tlog.h
* Description: Variant of printf with logging level controllable by a
* commandline flag.
* Author: Ranjith Unnikrishnan
* Created: Wed Nov 20 2013
*
* (C) Copyright 2013, Google Inc.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#ifndef TESSERACT_TRAINING_TLOG_H_
#define TESSERACT_TRAINING_TLOG_H_
#include "commandlineflags.h"
#include "errcode.h"
#include "tprintf.h"
// Declares the tlog_level flag; the macros in this header read it as
// FLAGS_tlog_level.
DECLARE_INT_PARAM_FLAG(tlog_level);
// Variant guarded by the numeric logging level parameter FLAGS_tlog_level
// (default 0). Code using ParseCommandLineFlags() can control its value using
// the --tlog_level commandline argument. Otherwise it must be specified in a
// config file like other params.
// tlog(level, format, ...): forwards the arguments after level to
// tprintf_internal when FLAGS_tlog_level >= level, otherwise does nothing.
// The level argument is parenthesized so that an expression with lower
// precedence than >= (e.g. a conditional expression) is compared as a whole.
// NOTE(review): the expansion is a bare { ... } block, so
// `if (c) tlog(...); else ...` does not compile; the shape is kept unchanged
// because existing callers may rely on it.
#define tlog(level, ...) {              \
  if (FLAGS_tlog_level >= (level)) {    \
    tprintf_internal(__VA_ARGS__);      \
  }                                     \
}
// Evaluates to true when output at the given level is enabled.
#define TLOG_IS_ON(level) (FLAGS_tlog_level >= (level))
#endif // TESSERACT_TRAINING_TLOG_H_
#ifndef TESSERACT_TRAINING_VALIDATE_GRAPHEME_H_
#define TESSERACT_TRAINING_VALIDATE_GRAPHEME_H_
#include "validator.h"
namespace tesseract {
// Validator specialization for generic unicode text: it validates the input
// and segments it into grapheme clusters, Latin with diacritics included.
class ValidateGrapheme : public Validator {
 public:
  ValidateGrapheme(ViramaScript script, bool report_errors)
      : Validator(script, report_errors) {}
  ~ValidateGrapheme() = default;

 protected:
  // Attempts to consume the next Grapheme in codes_[codes_used_++...],
  // copying it to parts_ and output_. Returns true when a valid Grapheme was
  // consumed; otherwise codes_used_ is not incremented.
  bool ConsumeGraphemeIfValid() override;

  // Maps the given Unicode ch to its CharClass.
  CharClass UnicodeToCharClass(char32 ch) const override;

 private:
  // True if the sequence prev_ch,ch is invalid.
  bool IsBadlyFormed(char32 prev_ch, char32 ch);

  // True if the sequence prev_ch,ch is an invalid Indic vowel.
  static bool IsBadlyFormedIndicVowel(char32 prev_ch, char32 ch);

  // True if the sequence prev_ch,ch is invalid Thai.
  static bool IsBadlyFormedThai(char32 prev_ch, char32 ch);
};
} // namespace tesseract
#endif // TESSERACT_TRAINING_VALIDATE_GRAPHEME_H_
#ifndef TESSERACT_TRAINING_VALIDATE_GRAPHEME_H_
#define TESSERACT_TRAINING_VALIDATE_GRAPHEME_H_
#include "validator.h"
namespace tesseract {
// Validator specialization for generic unicode text: it validates the input
// and segments it into grapheme clusters, Latin with diacritics included.
class ValidateGrapheme : public Validator {
 public:
  ValidateGrapheme(ViramaScript script, bool report_errors)
      : Validator(script, report_errors) {}
  ~ValidateGrapheme() = default;

 protected:
  // Attempts to consume the next Grapheme in codes_[codes_used_++...],
  // copying it to parts_ and output_. Returns true when a valid Grapheme was
  // consumed; otherwise codes_used_ is not incremented.
  bool ConsumeGraphemeIfValid() override;

  // Maps the given Unicode ch to its CharClass.
  CharClass UnicodeToCharClass(char32 ch) const override;

 private:
  // True if the sequence prev_ch,ch is invalid.
  bool IsBadlyFormed(char32 prev_ch, char32 ch);

  // True if the sequence prev_ch,ch is an invalid Indic vowel.
  static bool IsBadlyFormedIndicVowel(char32 prev_ch, char32 ch);

  // True if the sequence prev_ch,ch is invalid Thai.
  static bool IsBadlyFormedThai(char32 prev_ch, char32 ch);
};
} // namespace tesseract
#endif // TESSERACT_TRAINING_VALIDATE_GRAPHEME_H_
#ifndef TESSERACT_TRAINING_VALIDATE_INDIC_H_
#define TESSERACT_TRAINING_VALIDATE_INDIC_H_
#include "validator.h"
namespace tesseract {
// Validator specialization for the Indic scripts in the unicode range
// 0x900-0xdff (Devanagari-Sinhala): validates and segments such text.
class ValidateIndic : public Validator {
 public:
  ValidateIndic(ViramaScript script, bool report_errors)
      : Validator(script, report_errors) {}
  ~ValidateIndic() = default;

 protected:
  // Checks whether the codes match the pattern for an Indic Grapheme.
  // Attempts to consume the next Grapheme in codes_[codes_used_++...],
  // copying it to parts_ and output_. Returns true when a valid Grapheme was
  // consumed; otherwise codes_used_ is not incremented.
  bool ConsumeGraphemeIfValid() override;

  // Maps the given Unicode ch to its CharClass.
  Validator::CharClass UnicodeToCharClass(char32 ch) const override;

 private:
  // Consumes/copies a virama together with any associated post-virama
  // joiners.
  bool ConsumeViramaIfValid(IndicPair joiner, bool post_matra);

  // Consumes/copies a run of consonants separated by viramas for as long as
  // it stays valid; vowels and other modifiers are not consumed.
  bool ConsumeConsonantHeadIfValid();

  // Consumes/copies the tail part of a consonant: optional matra/piece,
  // vowel modifier, vedic mark, terminating virama.
  bool ConsumeConsonantTailIfValid();

  // Consumes/copies a vowel and its optional modifiers.
  bool ConsumeVowelIfValid();

  // Special unicodes that are used only for Indic processing.
  static const char32 kYayana = 0xdba;  // Sinhala Ya
  static const char32 kRayana = 0xdbb;  // Sinhala Ra
};
} // namespace tesseract
#endif // TESSERACT_TRAINING_VALIDATE_INDIC_H_
#ifndef TESSERACT_TRAINING_VALIDATE_INDIC_H_
#define TESSERACT_TRAINING_VALIDATE_INDIC_H_
#include "validator.h"
namespace tesseract {
// Validator specialization for the Indic scripts in the unicode range
// 0x900-0xdff (Devanagari-Sinhala): validates and segments such text.
class ValidateIndic : public Validator {
 public:
  ValidateIndic(ViramaScript script, bool report_errors)
      : Validator(script, report_errors) {}
  ~ValidateIndic() = default;

 protected:
  // Checks whether the codes match the pattern for an Indic Grapheme.
  // Attempts to consume the next Grapheme in codes_[codes_used_++...],
  // copying it to parts_ and output_. Returns true when a valid Grapheme was
  // consumed; otherwise codes_used_ is not incremented.
  bool ConsumeGraphemeIfValid() override;

  // Maps the given Unicode ch to its CharClass.
  Validator::CharClass UnicodeToCharClass(char32 ch) const override;

 private:
  // Consumes/copies a virama together with any associated post-virama
  // joiners.
  bool ConsumeViramaIfValid(IndicPair joiner, bool post_matra);

  // Consumes/copies a run of consonants separated by viramas for as long as
  // it stays valid; vowels and other modifiers are not consumed.
  bool ConsumeConsonantHeadIfValid();

  // Consumes/copies the tail part of a consonant: optional matra/piece,
  // vowel modifier, vedic mark, terminating virama.
  bool ConsumeConsonantTailIfValid();

  // Consumes/copies a vowel and its optional modifiers.
  bool ConsumeVowelIfValid();

  // Special unicodes that are used only for Indic processing.
  static const char32 kYayana = 0xdba;  // Sinhala Ya
  static const char32 kRayana = 0xdbb;  // Sinhala Ra
};
} // namespace tesseract
#endif // TESSERACT_TRAINING_VALIDATE_INDIC_H_
#include "validate_khmer.h"
#include "errcode.h"
#include "tprintf.h"
namespace tesseract {
// Attempts to consume one Khmer grapheme starting at codes_[codes_used_].
// The syllable pattern is taken from the unicode standard:
// http://www.unicode.org/versions/Unicode9.0.0/ch16.pdf.
// which gives: B {R | C} {S {R}}* {{Z} V} {O} {S}, in a notation that differs
// from the ISCII standard http://varamozhi.sourceforge.net/iscii91.pdf.
// Expressed with the codes used by the CharClass enum, the pattern is:
// C {R | N} {HC {R}}* {{Z|z} M{P}} {D} {HC}
// where R is a new symbol (Robat) and N is repurposed as a consonant shifter.
// Independent vowels are folded into the Consonant class, as they are
// treated the same anyway.
// In the split grapheme mode, the only characters that get grouped are the
// HC and the {Z|z}M. The unicode chapter on Khmer only mentions the joiners
// in the BNF syntax, so their intended effect is unclear.
// Returns false when there is nothing left to read or the codes do not form
// a valid syllable; returns true otherwise.
// NOTE(review): UseMultiCode()/CodeOnlyToOutput() returning true is treated
// throughout as "no codes remain" - confirm against Validator.
bool ValidateKhmer::ConsumeGraphemeIfValid() {
  const int code_count = codes_.size();
  if (codes_used_ == code_count) return false;
  // The class of the first code decides whether a syllable can start here.
  switch (codes_[codes_used_].first) {
    case CharClass::kOther:
      // Non-syllable codes are passed through as single-code graphemes.
      UseMultiCode(1);
      return true;
    case CharClass::kConsonant:
      break;
    default:
      if (report_errors_) {
        tprintf("Invalid start of Khmer syllable:0x%x\n",
                codes_[codes_used_].second);
      }
      return false;
  }
  // C: the base consonant.
  if (UseMultiCode(1)) return true;
  // {R | N}: optional robat or consonant shifter.
  const bool shifter_follows =
      codes_[codes_used_].first == CharClass::kRobat ||
      codes_[codes_used_].first == CharClass::kNukta;
  if (shifter_follows && UseMultiCode(1)) return true;
  // {HC {R}}*: virama+consonant pairs, each optionally followed by a robat.
  while (codes_used_ + 1 < code_count &&
         codes_[codes_used_].first == CharClass::kVirama &&
         codes_[codes_used_ + 1].first == CharClass::kConsonant) {
    // The loop condition guarantees another code follows the virama.
    ASSERT_HOST(!CodeOnlyToOutput());
    if (UseMultiCode(2)) return true;
    if (codes_[codes_used_].first == CharClass::kRobat && UseMultiCode(1)) {
      return true;
    }
  }
  // {Z|z}: an optional joiner, which is only legal in front of a matra.
  bool joiner_pending = false;
  if (codes_[codes_used_].second == kZeroWidthJoiner ||
      codes_[codes_used_].second == kZeroWidthNonJoiner) {
    if (CodeOnlyToOutput()) {
      if (report_errors_) {
        tprintf("Unterminated joiner: 0x%x\n", output_.back());
      }
      return false;
    }
    joiner_pending = true;
  }
  // Not quite as shown by the BNF, the matra piece is allowed as a matra on
  // its own or as an addition to other matras.
  const bool at_matra = codes_[codes_used_].first == CharClass::kMatra ||
                        codes_[codes_used_].first == CharClass::kMatraPiece;
  if (at_matra) {
    // A pending joiner is grouped together with the matra.
    if (UseMultiCode(joiner_pending ? 2 : 1)) return true;
  } else if (joiner_pending) {
    if (report_errors_) {
      tprintf("Joiner with non-dependent vowel after it!:0x%x 0x%x\n",
              output_.back(), codes_[codes_used_].second);
    }
    return false;
  }
  // {P}: one matra piece may follow, unless the code just consumed already
  // was a matra piece.
  if (codes_[codes_used_].first == CharClass::kMatraPiece &&
      codes_[codes_used_ - 1].first != CharClass::kMatraPiece &&
      UseMultiCode(1)) {
    return true;
  }
  // {D}: optional vowel modifier.
  if (codes_[codes_used_].first == CharClass::kVowelModifier &&
      UseMultiCode(1)) {
    return true;
  }
  // {HC}: optional trailing virama+consonant pair.
  if (codes_used_ + 1 < code_count &&
      codes_[codes_used_].first == CharClass::kVirama &&
      codes_[codes_used_ + 1].first == CharClass::kConsonant) {
    ASSERT_HOST(!CodeOnlyToOutput());
    UseMultiCode(2);
  }
  return true;
}
// Maps the unicode ch to the CharClass used by ConsumeGraphemeIfValid.
// Vedic accents and the zero-width joiners are recognised directly; anything
// else is classified by its offset from script_, and codes outside the
// code page are kOther.
Validator::CharClass ValidateKhmer::UnicodeToCharClass(char32 ch) const {
  if (IsVedicAccent(ch)) return CharClass::kVedicMark;
  if (ch == kZeroWidthNonJoiner) return CharClass::kZeroWidthNonJoiner;
  if (ch == kZeroWidthJoiner) return CharClass::kZeroWidthJoiner;
  // Offset from the start of the relevant unicode code block aka code page.
  const int offset = ch - static_cast<char32>(script_);
  // Anything in another code block is other.
  if (offset < 0 || offset >= kIndicCodePageSize) return CharClass::kOther;
  if (offset <= 0x33) return CharClass::kConsonant;
  if (offset <= 0x45) return CharClass::kMatra;
  // Individually classified offsets above the matra range.
  switch (offset) {
    case 0x46:
      return CharClass::kMatraPiece;
    case 0x49:
    case 0x4a:
      return CharClass::kNukta;
    case 0x4c:
      return CharClass::kRobat;
    case 0x52:
      return CharClass::kVirama;
    default:
      break;
  }
  // Whatever remains up to 0x51 is a vowel modifier; the rest is other.
  return offset <= 0x51 ? CharClass::kVowelModifier : CharClass::kOther;
}
} // namespace tesseract
#include "validate_khmer.h"
#include "errcode.h"
#include "tprintf.h"
namespace tesseract {
// Attempts to consume one Khmer grapheme starting at codes_[codes_used_].
// The syllable pattern is taken from the unicode standard:
// http://www.unicode.org/versions/Unicode9.0.0/ch16.pdf.
// which gives: B {R | C} {S {R}}* {{Z} V} {O} {S}, in a notation that differs
// from the ISCII standard http://varamozhi.sourceforge.net/iscii91.pdf.
// Expressed with the codes used by the CharClass enum, the pattern is:
// C {R | N} {HC {R}}* {{Z|z} M{P}} {D} {HC}
// where R is a new symbol (Robat) and N is repurposed as a consonant shifter.
// Independent vowels are folded into the Consonant class, as they are
// treated the same anyway.
// In the split grapheme mode, the only characters that get grouped are the
// HC and the {Z|z}M. The unicode chapter on Khmer only mentions the joiners
// in the BNF syntax, so their intended effect is unclear.
// Returns false when there is nothing left to read or the codes do not form
// a valid syllable; returns true otherwise.
// NOTE(review): UseMultiCode()/CodeOnlyToOutput() returning true is treated
// throughout as "no codes remain" - confirm against Validator.
bool ValidateKhmer::ConsumeGraphemeIfValid() {
  const int code_count = codes_.size();
  if (codes_used_ == code_count) return false;
  // The class of the first code decides whether a syllable can start here.
  switch (codes_[codes_used_].first) {
    case CharClass::kOther:
      // Non-syllable codes are passed through as single-code graphemes.
      UseMultiCode(1);
      return true;
    case CharClass::kConsonant:
      break;
    default:
      if (report_errors_) {
        tprintf("Invalid start of Khmer syllable:0x%x\n",
                codes_[codes_used_].second);
      }
      return false;
  }
  // C: the base consonant.
  if (UseMultiCode(1)) return true;
  // {R | N}: optional robat or consonant shifter.
  const bool shifter_follows =
      codes_[codes_used_].first == CharClass::kRobat ||
      codes_[codes_used_].first == CharClass::kNukta;
  if (shifter_follows && UseMultiCode(1)) return true;
  // {HC {R}}*: virama+consonant pairs, each optionally followed by a robat.
  while (codes_used_ + 1 < code_count &&
         codes_[codes_used_].first == CharClass::kVirama &&
         codes_[codes_used_ + 1].first == CharClass::kConsonant) {
    // The loop condition guarantees another code follows the virama.
    ASSERT_HOST(!CodeOnlyToOutput());
    if (UseMultiCode(2)) return true;
    if (codes_[codes_used_].first == CharClass::kRobat && UseMultiCode(1)) {
      return true;
    }
  }
  // {Z|z}: an optional joiner, which is only legal in front of a matra.
  bool joiner_pending = false;
  if (codes_[codes_used_].second == kZeroWidthJoiner ||
      codes_[codes_used_].second == kZeroWidthNonJoiner) {
    if (CodeOnlyToOutput()) {
      if (report_errors_) {
        tprintf("Unterminated joiner: 0x%x\n", output_.back());
      }
      return false;
    }
    joiner_pending = true;
  }
  // Not quite as shown by the BNF, the matra piece is allowed as a matra on
  // its own or as an addition to other matras.
  const bool at_matra = codes_[codes_used_].first == CharClass::kMatra ||
                        codes_[codes_used_].first == CharClass::kMatraPiece;
  if (at_matra) {
    // A pending joiner is grouped together with the matra.
    if (UseMultiCode(joiner_pending ? 2 : 1)) return true;
  } else if (joiner_pending) {
    if (report_errors_) {
      tprintf("Joiner with non-dependent vowel after it!:0x%x 0x%x\n",
              output_.back(), codes_[codes_used_].second);
    }
    return false;
  }
  // {P}: one matra piece may follow, unless the code just consumed already
  // was a matra piece.
  if (codes_[codes_used_].first == CharClass::kMatraPiece &&
      codes_[codes_used_ - 1].first != CharClass::kMatraPiece &&
      UseMultiCode(1)) {
    return true;
  }
  // {D}: optional vowel modifier.
  if (codes_[codes_used_].first == CharClass::kVowelModifier &&
      UseMultiCode(1)) {
    return true;
  }
  // {HC}: optional trailing virama+consonant pair.
  if (codes_used_ + 1 < code_count &&
      codes_[codes_used_].first == CharClass::kVirama &&
      codes_[codes_used_ + 1].first == CharClass::kConsonant) {
    ASSERT_HOST(!CodeOnlyToOutput());
    UseMultiCode(2);
  }
  return true;
}
// Maps the unicode ch to the CharClass used by ConsumeGraphemeIfValid.
// Vedic accents and the zero-width joiners are recognised directly; anything
// else is classified by its offset from script_, and codes outside the
// code page are kOther.
Validator::CharClass ValidateKhmer::UnicodeToCharClass(char32 ch) const {
  if (IsVedicAccent(ch)) return CharClass::kVedicMark;
  if (ch == kZeroWidthNonJoiner) return CharClass::kZeroWidthNonJoiner;
  if (ch == kZeroWidthJoiner) return CharClass::kZeroWidthJoiner;
  // Offset from the start of the relevant unicode code block aka code page.
  const int offset = ch - static_cast<char32>(script_);
  // Anything in another code block is other.
  if (offset < 0 || offset >= kIndicCodePageSize) return CharClass::kOther;
  if (offset <= 0x33) return CharClass::kConsonant;
  if (offset <= 0x45) return CharClass::kMatra;
  // Individually classified offsets above the matra range.
  switch (offset) {
    case 0x46:
      return CharClass::kMatraPiece;
    case 0x49:
    case 0x4a:
      return CharClass::kNukta;
    case 0x4c:
      return CharClass::kRobat;
    case 0x52:
      return CharClass::kVirama;
    default:
      break;
  }
  // Whatever remains up to 0x51 is a vowel modifier; the rest is other.
  return offset <= 0x51 ? CharClass::kVowelModifier : CharClass::kOther;
}
} // namespace tesseract
#ifndef TESSERACT_TRAINING_VALIDATE_KHMER_H_
#define TESSERACT_TRAINING_VALIDATE_KHMER_H_
#include "validator.h"
namespace tesseract {
// Validator specialization that validates and segments Khmer.
class ValidateKhmer : public Validator {
 public:
  // Forwards the script and error-reporting flag to the Validator base.
  ValidateKhmer(ViramaScript script, bool report_errors)
      : Validator(script, report_errors) {}
  ~ValidateKhmer() = default;

 protected:
  // Tests whether the upcoming codes match the pattern for a Khmer Grapheme.
  // Consumes the next Grapheme in codes_[codes_used_++...] and copies it to
  // parts_ and output_. Returns true if a valid Grapheme was consumed,
  // otherwise does not increment codes_used_.
  bool ConsumeGraphemeIfValid() override;
  // Maps the given Unicode ch to its CharClass.
  CharClass UnicodeToCharClass(char32 ch) const override;
};
} // namespace tesseract
#endif // TESSERACT_TRAINING_VALIDATE_KHMER_H_
#ifndef TESSERACT_TRAINING_VALIDATE_KHMER_H_
#define TESSERACT_TRAINING_VALIDATE_KHMER_H_
#include "validator.h"
namespace tesseract {
// Validator specialization that validates and segments Khmer.
class ValidateKhmer : public Validator {
 public:
  // Forwards the script and error-reporting flag to the Validator base.
  ValidateKhmer(ViramaScript script, bool report_errors)
      : Validator(script, report_errors) {}
  ~ValidateKhmer() = default;

 protected:
  // Tests whether the upcoming codes match the pattern for a Khmer Grapheme.
  // Consumes the next Grapheme in codes_[codes_used_++...] and copies it to
  // parts_ and output_. Returns true if a valid Grapheme was consumed,
  // otherwise does not increment codes_used_.
  bool ConsumeGraphemeIfValid() override;
  // Maps the given Unicode ch to its CharClass.
  CharClass UnicodeToCharClass(char32 ch) const override;
};
} // namespace tesseract
#endif // TESSERACT_TRAINING_VALIDATE_KHMER_H_
#include "validate_myanmar.h"
#include "errcode.h"
#include "icuerrorcode.h"
#include "tprintf.h"
#include "unicode/uchar.h" // From libicu
#include "unicode/uscript.h" // From libicu
namespace tesseract {
// Attempts to consume one Myanmar grapheme starting at codes_[codes_used_].
// The pattern is taken directly from the unicode table 16-3.
// See http://www.unicode.org/versions/Unicode9.0.0/ch16.pdf
// Returns false only when the code at the start of the syllable is not a
// Myanmar letter.
// NOTE(review): returns true when no codes remain, whereas
// ValidateKhmer::ConsumeGraphemeIfValid returns false in that situation -
// confirm which is intended.
bool ValidateMyanmar::ConsumeGraphemeIfValid() {
  const int code_count = codes_.size();
  if (codes_used_ == code_count) return true;
  // Other: digits and symbols pass through as single-code graphemes.
  if (IsMyanmarOther(codes_[codes_used_].second)) {
    UseMultiCode(1);
    return true;
  }
  // Kinzi: 0x1004 + asat + virama, and at least one more code after them.
  const bool at_kinzi = codes_used_ + 2 < code_count &&
                        codes_[codes_used_].second == 0x1004 &&
                        codes_[codes_used_ + 1].second == kMyanmarAsat &&
                        codes_[codes_used_ + 2].second == kMyanmarVirama;
  if (at_kinzi) {
    // The repeated call is deliberate: presumably each CodeOnlyToOutput()
    // moves a single code to the output, leaving the third code for
    // UseMultiCode(3) to group - confirm against Validator.
    ASSERT_HOST(!CodeOnlyToOutput());
    ASSERT_HOST(!CodeOnlyToOutput());
    if (UseMultiCode(3)) return true;
  }
  // Base consonant/vowel. NOTE that since everything in Myanmar appears to be
  // optional, except the base, this is the only place where invalid input can
  // be detected and false returned.
  if (!IsMyanmarLetter(codes_[codes_used_].second)) {
    if (report_errors_) {
      tprintf("Invalid start of Myanmar syllable:0x%x\n",
              codes_[codes_used_].second);
    }
    return false;  // One of these is required.
  }
  if (UseMultiCode(1) || ConsumeSubscriptIfPresent()) return true;
  ConsumeOptionalSignsIfPresent();
  // What we have consumed so far is a valid syllable.
  return true;
}
// TODO(rays) Doesn't use intermediate coding like the other scripts, as there
// is little correspondence between the content of table 16-3 and the char
// classes of the Indic languages. (Experts may disagree and improve!)
// In unicode table 16-3 there is basically a long list of optional characters,
// which can be coded quite easily.
// Unfortunately, table 16-3 doesn't include even half the Myanmar unicodes!!
// The table also allows sequences that still result in dotted circles!!
// So with a lot of guesswork the rest have been added in a reasonable place.
// Consequently only two classes are produced here: kConsonant for Myanmar
// letters and kOther for everything else.
Validator::CharClass ValidateMyanmar::UnicodeToCharClass(char32 ch) const {
  return IsMyanmarLetter(ch) ? CharClass::kConsonant : CharClass::kOther;
}
// Helper consumes/copies a virama and the subscript consonant after it, if
// both are present at codes_[codes_used_].
// Returns true if the end of input is reached.
bool ValidateMyanmar::ConsumeSubscriptIfPresent() {
  // Subscript consonant. It appears there can be only one.
  const int code_count = codes_.size();
  const bool at_subscript =
      codes_used_ + 1 < code_count &&
      codes_[codes_used_].second == kMyanmarVirama &&
      IsMyanmarLetter(codes_[codes_used_ + 1].second);
  if (!at_subscript) return false;
  // The bounds check above guarantees the letter follows the virama.
  ASSERT_HOST(!CodeOnlyToOutput());
  return UseMultiCode(2);
}
// Helper consumes/copies a series of optional signs.
// Returns true if the end of input is reached.
bool ValidateMyanmar::ConsumeOptionalSignsIfPresent() {
// The following characters are allowed, all optional, and in sequence.
// An exception is kMyanmarMedialYa, which can include kMyanmarAsat.
const std::vector<char32> kMedials({kMyanmarAsat, kMyanmarMedialYa, 0x103c,
0x103d, 0x103e, 0x105e, 0x105f, 0x1060,
0x1081, 0x1031});
for (char32 ch : kMedials) {
if (codes_[codes_used_].second == ch) {
if (UseMultiCode(1)) return true;
if (ch == kMyanmarMedialYa &&
codes_[codes_used_].second == kMyanmarAsat) {
if (UseMultiCode(1)) return true;
}
}
}
// Vowel sign i, ii, ai.
char32 ch = codes_[codes_used_].second;
if (ch == 0x102d || ch == 0x102e || ch == 0x1032) {
if (UseMultiCode(1)) return true;
}
// Vowel sign u, uu, and extensions.
ch = codes_[codes_used_].second;
if (ch == 0x102f || ch == 0x1030 || (0x1056 <= ch && ch <= 0x1059) ||
ch == 0x1062 || ch == 0x1067 || ch == 0x1068 ||
(0x1071 <= ch && ch <= 0x1074) || (0x1083 <= ch && ch <= 0x1086) ||
ch == 0x109c || ch == 0x109d) {
if (UseMultiCode(1)) return true;
}
// Tall aa, aa with optional asat.
if (codes_[codes_used_].second == 0x102b ||
codes_[codes_used_].second == 0x102c) {
if (UseMultiCode(1)) return true;
if (codes_[codes_used_].second == kMyanmarAsat) {
if (UseMultiCode(1)) return true;
}
}
// The following characters are allowed, all optional, and in sequence.
const std::vector<char32> kSigns({0x1036, 0x1037});
for (char32 ch : kSigns) {
if (codes_[codes_used_].second == ch) {
if (UseMultiCode(1)) return true;
}
}
// Tone mark extensions.
ch = codes_[codes_used_].second;
if (ch == 0x1038 || ch == kMyanmarAsat || ch == 0x1063 || ch == 0x1064 ||
(0x1069 <= ch && ch <= 0x106d) || (0x1087 <= ch && ch <= 0x108d) ||
ch == 0x108f || ch == 0x109a || ch == 0x109b ||
(0xaa7b <= ch && ch <= 0xaa7d)) {
if (UseMultiCode(1)) return true;
}
return false;
}
// Tests whether ch is a Myanmar "letter", i.e. a consonant or an independent
// vowel. Although table 16-3 distinguishes between some base consonants and
// vowels, the extensions make no such distinction, so they all share a
// single bucket here.
/* static */
bool ValidateMyanmar::IsMyanmarLetter(char32 ch) {
  // Inclusive range test on ch.
  const auto within = [ch](char32 first, char32 last) {
    return first <= ch && ch <= last;
  };
  return within(0x1000, 0x102a) || ch == 0x103f || within(0x1050, 0x1055) ||
         within(0x105a, 0x105d) || ch == 0x1061 || ch == 0x1065 ||
         ch == 0x1066 || within(0x106e, 0x1070) || within(0x1075, 0x1080) ||
         ch == 0x108e || within(0xa9e0, 0xa9ef) || within(0xa9fa, 0xa9ff) ||
         within(0xaa60, 0xaa73) || ch == 0xaa7a || ch == 0xaa7e ||
         ch == 0xaa7f;
}
// Tests whether ch is a Myanmar digit or other symbol that does not take
// part in being a syllable. Any code that ICU does not assign to the Myanmar
// script also counts as other, except for the zero-width joiners.
/* static */
bool ValidateMyanmar::IsMyanmarOther(char32 ch) {
  IcuErrorCode err;
  const UScriptCode script_code = uscript_getScript(ch, err);
  const bool is_joiner = ch == Validator::kZeroWidthJoiner ||
                         ch == Validator::kZeroWidthNonJoiner;
  if (script_code != USCRIPT_MYANMAR && !is_joiner) return true;
  // Inclusive range test on ch.
  const auto within = [ch](char32 first, char32 last) {
    return first <= ch && ch <= last;
  };
  return within(0x1040, 0x1049) || within(0x1090, 0x1099) ||
         within(0x109c, 0x109d) || within(0xa9f0, 0xa9f9) ||
         within(0xaa74, 0xaa79);
}
} // namespace tesseract
#include "validate_myanmar.h"
#include "errcode.h"
#include "icuerrorcode.h"
#include "tprintf.h"
#include "unicode/uchar.h" // From libicu
#include "unicode/uscript.h" // From libicu
namespace tesseract {
// Attempts to consume one Myanmar grapheme starting at codes_[codes_used_].
// The pattern is taken directly from the unicode table 16-3.
// See http://www.unicode.org/versions/Unicode9.0.0/ch16.pdf
// Returns false only when the code at the start of the syllable is not a
// Myanmar letter.
// NOTE(review): returns true when no codes remain, whereas
// ValidateKhmer::ConsumeGraphemeIfValid returns false in that situation -
// confirm which is intended.
bool ValidateMyanmar::ConsumeGraphemeIfValid() {
  const int code_count = codes_.size();
  if (codes_used_ == code_count) return true;
  // Other: digits and symbols pass through as single-code graphemes.
  if (IsMyanmarOther(codes_[codes_used_].second)) {
    UseMultiCode(1);
    return true;
  }
  // Kinzi: 0x1004 + asat + virama, and at least one more code after them.
  const bool at_kinzi = codes_used_ + 2 < code_count &&
                        codes_[codes_used_].second == 0x1004 &&
                        codes_[codes_used_ + 1].second == kMyanmarAsat &&
                        codes_[codes_used_ + 2].second == kMyanmarVirama;
  if (at_kinzi) {
    // The repeated call is deliberate: presumably each CodeOnlyToOutput()
    // moves a single code to the output, leaving the third code for
    // UseMultiCode(3) to group - confirm against Validator.
    ASSERT_HOST(!CodeOnlyToOutput());
    ASSERT_HOST(!CodeOnlyToOutput());
    if (UseMultiCode(3)) return true;
  }
  // Base consonant/vowel. NOTE that since everything in Myanmar appears to be
  // optional, except the base, this is the only place where invalid input can
  // be detected and false returned.
  if (!IsMyanmarLetter(codes_[codes_used_].second)) {
    if (report_errors_) {
      tprintf("Invalid start of Myanmar syllable:0x%x\n",
              codes_[codes_used_].second);
    }
    return false;  // One of these is required.
  }
  if (UseMultiCode(1) || ConsumeSubscriptIfPresent()) return true;
  ConsumeOptionalSignsIfPresent();
  // What we have consumed so far is a valid syllable.
  return true;
}
// TODO(rays) Doesn't use intermediate coding like the other scripts, as there
// is little correspondence between the content of table 16-3 and the char
// classes of the Indic languages. (Experts may disagree and improve!)
// In unicode table 16-3 there is basically a long list of optional characters,
// which can be coded quite easily.
// Unfortunately, table 16-3 doesn't include even half the Myanmar unicodes!!
// The table also allows sequences that still result in dotted circles!!
// So with a lot of guesswork the rest have been added in a reasonable place.
// Consequently only two classes are produced here: kConsonant for Myanmar
// letters and kOther for everything else.
Validator::CharClass ValidateMyanmar::UnicodeToCharClass(char32 ch) const {
  return IsMyanmarLetter(ch) ? CharClass::kConsonant : CharClass::kOther;
}
// Helper consumes/copies a virama and the subscript consonant after it, if
// both are present at codes_[codes_used_].
// Returns true if the end of input is reached.
bool ValidateMyanmar::ConsumeSubscriptIfPresent() {
  // Subscript consonant. It appears there can be only one.
  const int code_count = codes_.size();
  const bool at_subscript =
      codes_used_ + 1 < code_count &&
      codes_[codes_used_].second == kMyanmarVirama &&
      IsMyanmarLetter(codes_[codes_used_ + 1].second);
  if (!at_subscript) return false;
  // The bounds check above guarantees the letter follows the virama.
  ASSERT_HOST(!CodeOnlyToOutput());
  return UseMultiCode(2);
}
// Helper consumes/copies a series of optional signs.
// Returns true if the end of input is reached.
bool ValidateMyanmar::ConsumeOptionalSignsIfPresent() {
// The following characters are allowed, all optional, and in sequence.
// An exception is kMyanmarMedialYa, which can include kMyanmarAsat.
const std::vector<char32> kMedials({kMyanmarAsat, kMyanmarMedialYa, 0x103c,
0x103d, 0x103e, 0x105e, 0x105f, 0x1060,
0x1081, 0x1031});
for (char32 ch : kMedials) {
if (codes_[codes_used_].second == ch) {
if (UseMultiCode(1)) return true;
if (ch == kMyanmarMedialYa &&
codes_[codes_used_].second == kMyanmarAsat) {
if (UseMultiCode(1)) return true;
}
}
}
// Vowel sign i, ii, ai.
char32 ch = codes_[codes_used_].second;
if (ch == 0x102d || ch == 0x102e || ch == 0x1032) {
if (UseMultiCode(1)) return true;
}
// Vowel sign u, uu, and extensions.
ch = codes_[codes_used_].second;
if (ch == 0x102f || ch == 0x1030 || (0x1056 <= ch && ch <= 0x1059) ||
ch == 0x1062 || ch == 0x1067 || ch == 0x1068 ||
(0x1071 <= ch && ch <= 0x1074) || (0x1083 <= ch && ch <= 0x1086) ||
ch == 0x109c || ch == 0x109d) {
if (UseMultiCode(1)) return true;
}
// Tall aa, aa with optional asat.
if (codes_[codes_used_].second == 0x102b ||
codes_[codes_used_].second == 0x102c) {
if (UseMultiCode(1)) return true;
if (codes_[codes_used_].second == kMyanmarAsat) {
if (UseMultiCode(1)) return true;
}
}
// The following characters are allowed, all optional, and in sequence.
const std::vector<char32> kSigns({0x1036, 0x1037});
for (char32 ch : kSigns) {
if (codes_[codes_used_].second == ch) {
if (UseMultiCode(1)) return true;
}
}
// Tone mark extensions.
ch = codes_[codes_used_].second;
if (ch == 0x1038 || ch == kMyanmarAsat || ch == 0x1063 || ch == 0x1064 ||
(0x1069 <= ch && ch <= 0x106d) || (0x1087 <= ch && ch <= 0x108d) ||
ch == 0x108f || ch == 0x109a || ch == 0x109b ||
(0xaa7b <= ch && ch <= 0xaa7d)) {
if (UseMultiCode(1)) return true;
}
return false;
}
// Tests whether ch is a Myanmar "letter", i.e. a consonant or an independent
// vowel. Although table 16-3 distinguishes between some base consonants and
// vowels, the extensions make no such distinction, so they all share a
// single bucket here.
/* static */
bool ValidateMyanmar::IsMyanmarLetter(char32 ch) {
  // Inclusive range test on ch.
  const auto within = [ch](char32 first, char32 last) {
    return first <= ch && ch <= last;
  };
  return within(0x1000, 0x102a) || ch == 0x103f || within(0x1050, 0x1055) ||
         within(0x105a, 0x105d) || ch == 0x1061 || ch == 0x1065 ||
         ch == 0x1066 || within(0x106e, 0x1070) || within(0x1075, 0x1080) ||
         ch == 0x108e || within(0xa9e0, 0xa9ef) || within(0xa9fa, 0xa9ff) ||
         within(0xaa60, 0xaa73) || ch == 0xaa7a || ch == 0xaa7e ||
         ch == 0xaa7f;
}
// Tests whether ch is a Myanmar digit or other symbol that does not take
// part in being a syllable. Any code that ICU does not assign to the Myanmar
// script also counts as other, except for the zero-width joiners.
/* static */
bool ValidateMyanmar::IsMyanmarOther(char32 ch) {
  IcuErrorCode err;
  const UScriptCode script_code = uscript_getScript(ch, err);
  const bool is_joiner = ch == Validator::kZeroWidthJoiner ||
                         ch == Validator::kZeroWidthNonJoiner;
  if (script_code != USCRIPT_MYANMAR && !is_joiner) return true;
  // Inclusive range test on ch.
  const auto within = [ch](char32 first, char32 last) {
    return first <= ch && ch <= last;
  };
  return within(0x1040, 0x1049) || within(0x1090, 0x1099) ||
         within(0x109c, 0x109d) || within(0xa9f0, 0xa9f9) ||
         within(0xaa74, 0xaa79);
}
} // namespace tesseract
#ifndef TESSERACT_TRAINING_VALIDATE_MYANMAR_H_
#define TESSERACT_TRAINING_VALIDATE_MYANMAR_H_
#include "validator.h"
namespace tesseract {
// Validator specialization that validates and segments Myanmar.
class ValidateMyanmar : public Validator {
 public:
  // Forwards the script and error-reporting flag to the Validator base.
  ValidateMyanmar(ViramaScript script, bool report_errors)
      : Validator(script, report_errors) {}
  ~ValidateMyanmar() = default;

 protected:
  // Tests whether the upcoming codes match the pattern for a Myanmar
  // Grapheme. Consumes the next Grapheme in codes_[codes_used_++...] and
  // copies it to parts_ and output_. Returns true if a valid Grapheme was
  // consumed, otherwise does not increment codes_used_.
  bool ConsumeGraphemeIfValid() override;
  // Maps the given Unicode ch to its CharClass.
  Validator::CharClass UnicodeToCharClass(char32 ch) const override;

 private:
  // Helper consumes/copies a virama and any subscript consonant.
  // Returns true if the end of input is reached.
  bool ConsumeSubscriptIfPresent();
  // Helper consumes/copies a series of optional signs.
  // Returns true if the end of input is reached.
  bool ConsumeOptionalSignsIfPresent();
  // Tests whether the unicode is a Myanmar "letter", which covers consonants
  // and independent vowels. Although table 16-3 distinguishes between some
  // base consonants and vowels, the extensions make no such distinction, so
  // they are all put into a single bucket.
  static bool IsMyanmarLetter(char32 ch);
  // Tests whether ch is a Myanmar digit or other symbol that does not take
  // part in being a syllable.
  static bool IsMyanmarOther(char32 ch);
  // Some special unicodes used only for Myanmar processing.
  static const char32 kMyanmarAsat = 0x103a;
  static const char32 kMyanmarMedialYa = 0x103b;
};
} // namespace tesseract
#endif // TESSERACT_TRAINING_VALIDATE_MYANMAR_H_
#ifndef TESSERACT_TRAINING_VALIDATE_MYANMAR_H_
#define TESSERACT_TRAINING_VALIDATE_MYANMAR_H_
#include "validator.h"
namespace tesseract {
// Validator specialization that validates and segments Myanmar.
class ValidateMyanmar : public Validator {
 public:
  // Forwards the script and error-reporting flag to the Validator base.
  ValidateMyanmar(ViramaScript script, bool report_errors)
      : Validator(script, report_errors) {}
  ~ValidateMyanmar() = default;

 protected:
  // Tests whether the upcoming codes match the pattern for a Myanmar
  // Grapheme. Consumes the next Grapheme in codes_[codes_used_++...] and
  // copies it to parts_ and output_. Returns true if a valid Grapheme was
  // consumed, otherwise does not increment codes_used_.
  bool ConsumeGraphemeIfValid() override;
  // Maps the given Unicode ch to its CharClass.
  Validator::CharClass UnicodeToCharClass(char32 ch) const override;

 private:
  // Helper consumes/copies a virama and any subscript consonant.
  // Returns true if the end of input is reached.
  bool ConsumeSubscriptIfPresent();
  // Helper consumes/copies a series of optional signs.
  // Returns true if the end of input is reached.
  bool ConsumeOptionalSignsIfPresent();
  // Tests whether the unicode is a Myanmar "letter", which covers consonants
  // and independent vowels. Although table 16-3 distinguishes between some
  // base consonants and vowels, the extensions make no such distinction, so
  // they are all put into a single bucket.
  static bool IsMyanmarLetter(char32 ch);
  // Tests whether ch is a Myanmar digit or other symbol that does not take
  // part in being a syllable.
  static bool IsMyanmarOther(char32 ch);
  // Some special unicodes used only for Myanmar processing.
  static const char32 kMyanmarAsat = 0x103a;
  static const char32 kMyanmarMedialYa = 0x103b;
};
} // namespace tesseract
#endif // TESSERACT_TRAINING_VALIDATE_MYANMAR_H_
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册