提交 df41eab6 编写于 作者: R Ray Smith

Added script-specific validation and normalization for virama-using scripts...

Added script-specific validation and normalization for virama-using scripts and updated normalization for others
上级 da03e4e9
......@@ -226,3 +226,19 @@ std::vector<char32> UNICHAR::UTF8ToUTF32(const char* utf8_str) {
return unicodes;
}
// Encodes a sequence of UTF-32 code points as a single UTF-8 string.
// Returns an empty string if the input contains an invalid unicode.
string UNICHAR::UTF32ToUTF8(const std::vector<char32>& str32) {
  string result;
  for (char32 code : str32) {
    UNICHAR encoded(code);
    // A zero-length encoding means the code point could not be represented.
    if (encoded.utf8_len() <= 0) return "";
    // Double-check the encoded bytes form a well-formed UTF-8 sequence.
    const int num_bytes = utf8_step(encoded.utf8());
    if (num_bytes <= 0) return "";
    result.append(encoded.utf8(), num_bytes);
  }
  return result;
}
} // namespace tesseract
......@@ -21,7 +21,9 @@ noinst_HEADERS = \
boxchar.h commandlineflags.h commontraining.h degradeimage.h \
fileio.h icuerrorcode.h ligature_table.h lstmtester.h normstrngs.h \
mergenf.h pango_font_info.h stringrenderer.h \
tessopt.h tlog.h unicharset_training_utils.h util.h
tessopt.h tlog.h unicharset_training_utils.h util.h \
validate_grapheme.h validate_indic.h validate_khmer.h \
validate_myanmar.h validator.h
noinst_LTLIBRARIES = libtesseract_training.la libtesseract_tessopt.la
......@@ -32,7 +34,9 @@ libtesseract_training_la_LIBADD = \
libtesseract_training_la_SOURCES = \
boxchar.cpp commandlineflags.cpp commontraining.cpp degradeimage.cpp \
fileio.cpp ligature_table.cpp lstmtester.cpp normstrngs.cpp pango_font_info.cpp \
stringrenderer.cpp tlog.cpp unicharset_training_utils.cpp
stringrenderer.cpp tlog.cpp unicharset_training_utils.cpp \
validate_grapheme.cpp validate_indic.cpp validate_khmer.cpp \
validate_myanmar.cpp validator.cpp
libtesseract_tessopt_la_SOURCES = \
tessopt.cpp
......
......@@ -21,6 +21,10 @@
#include "normstrngs.h"
#include <assert.h>
#include <string>
#include <unordered_map>
#include <vector>
#include "icuerrorcode.h"
#include "unichar.h"
#include "unicode/normalizer2.h" // From libicu
......@@ -34,18 +38,17 @@ namespace tesseract {
bool is_hyphen_punc(const char32 ch) {
static const int kNumHyphenPuncUnicodes = 13;
static const char32 kHyphenPuncUnicodes[kNumHyphenPuncUnicodes] = {
'-',
0x2010, 0x2011, 0x2012, 0x2013, 0x2014, 0x2015, // hyphen..horizontal bar
0x207b, // superscript minus
0x208b, // subscript minus
0x2212, // minus sign
0xfe58, // small em dash
0xfe63, // small hyphen-minus
0xff0d, // fullwidth hyphen-minus
'-', 0x2010, 0x2011, 0x2012,
0x2013, 0x2014, 0x2015, // hyphen..horizontal bar
0x207b, // superscript minus
0x208b, // subscript minus
0x2212, // minus sign
0xfe58, // small em dash
0xfe63, // small hyphen-minus
0xff0d, // fullwidth hyphen-minus
};
for (int i = 0; i < kNumHyphenPuncUnicodes; ++i) {
if (kHyphenPuncUnicodes[i] == ch)
return true;
if (kHyphenPuncUnicodes[i] == ch) return true;
}
return false;
}
......@@ -53,19 +56,17 @@ bool is_hyphen_punc(const char32 ch) {
bool is_single_quote(const char32 ch) {
static const int kNumSingleQuoteUnicodes = 8;
static const char32 kSingleQuoteUnicodes[kNumSingleQuoteUnicodes] = {
'\'',
'`',
0x2018, // left single quotation mark (English, others)
0x2019, // right single quotation mark (Danish, Finnish, Swedish, Norw.)
// We may have to introduce a comma set with 0x201a
0x201B, // single high-reveresed-9 quotation mark (PropList.txt)
0x2032, // prime
0x300C, // left corner bracket (East Asian languages)
0xFF07, // fullwidth apostrophe
'\'', '`',
0x2018, // left single quotation mark (English, others)
0x2019, // right single quotation mark (Danish, Finnish, Swedish, Norw.)
// We may have to introduce a comma set with 0x201a
0x201B, // single high-reveresed-9 quotation mark (PropList.txt)
0x2032, // prime
0x300C, // left corner bracket (East Asian languages)
0xFF07, // fullwidth apostrophe
};
for (int i = 0; i < kNumSingleQuoteUnicodes; ++i) {
if (kSingleQuoteUnicodes[i] == ch)
return true;
if (kSingleQuoteUnicodes[i] == ch) return true;
}
return false;
}
......@@ -73,60 +74,130 @@ bool is_single_quote(const char32 ch) {
bool is_double_quote(const char32 ch) {
static const int kNumDoubleQuoteUnicodes = 8;
static const char32 kDoubleQuoteUnicodes[kNumDoubleQuoteUnicodes] = {
'"',
0x201C, // left double quotation mark (English, others)
0x201D, // right double quotation mark (Danish, Finnish, Swedish, Norw.)
0x201F, // double high-reversed-9 quotation mark (PropList.txt)
0x2033, // double prime
0x301D, // reversed double prime quotation mark (East Asian langs, horiz.)
0x301E, // close double prime (East Asian languages written horizontally)
0xFF02, // fullwidth quotation mark
'"',
0x201C, // left double quotation mark (English, others)
0x201D, // right double quotation mark (Danish, Finnish, Swedish, Norw.)
0x201F, // double high-reversed-9 quotation mark (PropList.txt)
0x2033, // double prime
0x301D, // reversed double prime quotation mark (East Asian langs,
// horiz.)
0x301E, // close double prime (East Asian languages written horizontally)
0xFF02, // fullwidth quotation mark
};
for (int i = 0; i < kNumDoubleQuoteUnicodes; ++i) {
if (kDoubleQuoteUnicodes[i] == ch)
return true;
if (kDoubleQuoteUnicodes[i] == ch) return true;
}
return false;
}
// Normalizes a UTF-8 string one code point at a time: each code point is run
// through NormalizeChar32 and the results are concatenated and re-encoded.
STRING NormalizeUTF8String(bool decompose, const char* str8) {
  GenericVector<char32> input32;
  UTF8ToUTF32(str8, &input32);

  GenericVector<char32> result32;
  GenericVector<char32> char_norm;  // Scratch buffer reused for every char.
  const int num_input = input32.length();
  for (int src = 0; src < num_input; ++src) {
    char_norm.clear();
    NormalizeChar32(input32[src], decompose, &char_norm);
    const int num_normed = char_norm.length();
    for (int k = 0; k < num_normed; ++k) {
      result32.push_back(char_norm[k]);
    }
  }

  STRING utf8_result;
  UTF32ToUTF8(result32, &utf8_result);
  return utf8_result;
}
void NormalizeChar32(char32 ch, bool decompose, GenericVector<char32>* str) {
// Helper runs a standard unicode normalization, optional OCR normalization,
// and leaves the result as char32 for subsequent processing.
//   u_mode:        which of NFD/NFC/NFKD/NFKC to apply.
//   ocr_normalize: if kNormalize, each output code is passed through
//                  OCRNormalize.
//   str8:          the UTF-8 input.
//   normed32:      receives the normalized codes (appended via push_back).
static void NormalizeUTF8ToUTF32(UnicodeNormMode u_mode, OCRNorm ocr_normalize,
                                 const char* str8,
                                 std::vector<char32>* normed32) {
  // Convert to ICU string for unicode normalization.
  icu::UnicodeString uch_str(str8, "UTF-8");
  IcuErrorCode error_code;
  // Convert the enum to the icu representation: a data-set name (the
  // compatibility forms use "nfkc", the canonical forms use "nfc") plus a
  // separate compose/decompose flag.
  const char* norm_type =
      u_mode == UnicodeNormMode::kNFKD || u_mode == UnicodeNormMode::kNFKC
          ? "nfkc"
          : "nfc";
  UNormalization2Mode compose =
      u_mode == UnicodeNormMode::kNFC || u_mode == UnicodeNormMode::kNFKC
          ? UNORM2_COMPOSE
          : UNORM2_DECOMPOSE;
  // Pointer to singleton does not require deletion.
  const icu::Normalizer2* normalizer =
      icu::Normalizer2::getInstance(nullptr, norm_type, compose, error_code);
  error_code.assertSuccess();
  error_code.reset();
  icu::UnicodeString norm_str = normalizer->normalize(uch_str, error_code);
  error_code.assertSuccess();
  // Convert to char32 for output. OCR normalization if required.
  normed32->reserve(norm_str.length());  // An approximation.
  // moveIndex32 advances by whole code points, so surrogate pairs in the
  // UTF-16 ICU string are read as a single char32.
  for (int offset = 0; offset < norm_str.length();
       offset = norm_str.moveIndex32(offset, 1)) {
    char32 ch = norm_str.char32At(offset);
    // Skip all ZWS, RTL and LTR marks.
    if (Validator::IsZeroWidthMark(ch)) continue;
    if (ocr_normalize == OCRNorm::kNormalize) ch = OCRNormalize(ch);
    normed32->push_back(ch);
  }
}
// Helper removes joiners from strings that contain no letters.
static void StripJoiners(std::vector<char32>* str32) {
for (char32 ch : *str32) {
if (u_isalpha(ch)) return;
}
int len = 0;
for (char32 ch : *str32) {
if (ch != Validator::kZeroWidthJoiner &&
ch != Validator::kZeroWidthNonJoiner) {
(*str32)[len++] = ch;
}
}
str32->resize(len);
}
// Normalizes a UTF8 string according to the given modes. Returns true on
// success. If false is returned, some failure or invalidity was present, and
// the result string is produced on a "best effort" basis.
// normalized may be nullptr, in which case only the return value is useful.
bool NormalizeUTF8String(UnicodeNormMode u_mode, OCRNorm ocr_normalize,
                         GraphemeNorm grapheme_normalize, const char* str8,
                         string* normalized) {
  std::vector<char32> normed32;
  NormalizeUTF8ToUTF32(u_mode, ocr_normalize, str8, &normed32);

  // Without grapheme normalization the unicode-normalized codes are the
  // final answer, and there is nothing that can fail.
  if (grapheme_normalize != GraphemeNorm::kNormalize) {
    if (normalized != nullptr) *normalized = UNICHAR::UTF32ToUTF8(normed32);
    return true;
  }

  StripJoiners(&normed32);
  std::vector<std::vector<char32>> graphemes;
  const bool valid = Validator::ValidateCleanAndSegment(
      GraphemeNormMode::kSingleString, false, normed32, &graphemes);
  // In single-string mode the whole result is expected in graphemes[0]; an
  // absent or empty entry is a failure and leaves *normalized unchanged.
  if (graphemes.empty() || graphemes[0].empty()) return false;
  if (normalized != nullptr) *normalized = UNICHAR::UTF32ToUTF8(graphemes[0]);
  return valid;
}
str->clear();
for (int i = 0; i < norm_str.length(); ++i) {
// If any spaces were added by NFKC, pretend normalization is a nop.
if (norm_str[i] == ' ') {
str->clear();
str->push_back(ch);
break;
} else {
str->push_back(OCRNormalize(static_cast<char32>(norm_str[i])));
// Normalizes a UTF8 string according to the given modes and splits into
// graphemes according to g_mode. Returns true on success. If false is returned,
// some failure or invalidity was present, and the result string is produced on
// a "best effort" basis.
// graphemes is cleared and then filled with one UTF-8 string per grapheme.
bool NormalizeCleanAndSegmentUTF8(UnicodeNormMode u_mode, OCRNorm ocr_normalize,
                                  GraphemeNormMode g_mode, bool report_errors,
                                  const char* str8,
                                  std::vector<string>* graphemes) {
  std::vector<char32> normed32;
  NormalizeUTF8ToUTF32(u_mode, ocr_normalize, str8, &normed32);
  StripJoiners(&normed32);

  std::vector<std::vector<char32>> graphemes32;
  bool success = Validator::ValidateCleanAndSegment(g_mode, report_errors,
                                                    normed32, &graphemes32);
  if (success && g_mode != GraphemeNormMode::kSingleString) {
    // If we modified the string to clean it up, the segmentation may not be
    // correct, so check for changes and do it again.
    std::vector<char32> cleaned32;
    for (size_t g = 0; g < graphemes32.size(); ++g) {
      const std::vector<char32>& part = graphemes32[g];
      cleaned32.insert(cleaned32.end(), part.begin(), part.end());
    }
    if (cleaned32 != normed32) {
      graphemes32.clear();
      success = Validator::ValidateCleanAndSegment(g_mode, report_errors,
                                                   cleaned32, &graphemes32);
    }
  }

  // Re-encode each grapheme as UTF-8 for the caller.
  graphemes->clear();
  graphemes->reserve(graphemes32.size());
  for (size_t g = 0; g < graphemes32.size(); ++g) {
    graphemes->push_back(UNICHAR::UTF32ToUTF8(graphemes32[g]));
  }
  return success;
}
// Apply just the OCR-specific normalizations and return the normalized char.
......
......@@ -21,34 +21,50 @@
#ifndef TESSERACT_CCUTIL_NORMSTRNGS_H_
#define TESSERACT_CCUTIL_NORMSTRNGS_H_
#include "genericvector.h"
#include "strngs.h"
#include <string>
#include <vector>
typedef signed int char32;
#include "validator.h"
namespace tesseract {
// UTF-8 to UTF-32 conversion function.
void UTF8ToUTF32(const char* utf8_str, GenericVector<char32>* str32);
// UTF-32 to UTF-8 conversion function.
void UTF32ToUTF8(const GenericVector<char32>& str32, STRING* utf8_str);
// Normalize a single char32 using NFKC + OCR-specific transformations.
// NOTE that proper NFKC may require multiple characters as input. The
// assumption of this function is that the input is already as fully composed
// as it can be, but may require some compatibility normalizations or just
// OCR evaluation related normalizations.
void NormalizeChar32(char32 ch, bool decompose, GenericVector<char32>* str);
// Normalize a UTF8 string. Same as above, but for UTF8-encoded strings, that
// can contain multiple UTF32 code points.
STRING NormalizeUTF8String(bool decompose, const char* str8);
// Convenience overload that normalizes with composition. Composition stays
// the default until decomposition is shown to benefit at least one language.
inline STRING NormalizeUTF8String(const char* str8) {
  const bool decompose = false;
  return NormalizeUTF8String(decompose, str8);
}
// The standard unicode normalizations.
// The "K" forms select the compatibility ("nfkc") data and the others the
// canonical ("nfc") data; the "C" forms compose and the "D" forms decompose.
enum class UnicodeNormMode {
kNFD,   // Canonical data, decomposed.
kNFC,   // Canonical data, composed.
kNFKD,  // Compatibility data, decomposed.
kNFKC,  // Compatibility data, composed.
};
// To normalize away differences in punctuation that are ambiguous, like
// curly quotes and different widths of dash.
enum class OCRNorm {
kNone,       // Leave the codes as produced by the unicode normalization.
kNormalize,  // Pass every code through OCRNormalize.
};
// To validate and normalize away some subtle differences that can occur in
// Indic scripts, eg ensuring that an explicit virama is always followed by
// a zero-width non-joiner.
enum class GraphemeNorm {
kNone,       // No validation; the normalization cannot report failure.
kNormalize,  // Strip joiners where appropriate, then validate and clean.
};
// Normalizes a UTF8 string according to the given modes. Returns true on
// success. If false is returned, some failure or invalidity was present, and
// the result string is produced on a "best effort" basis.
bool NormalizeUTF8String(UnicodeNormMode u_mode, OCRNorm ocr_normalize,
GraphemeNorm grapheme_normalize, const char* str8,
string* normalized);
// Normalizes a UTF8 string according to the given modes and splits into
// graphemes according to g_mode. Returns true on success. If false is returned,
// some failure or invalidity was present, and the result string is produced on
// a "best effort" basis.
bool NormalizeCleanAndSegmentUTF8(UnicodeNormMode u_mode, OCRNorm ocr_normalize,
GraphemeNormMode g_mode, bool report_errors,
const char* str8,
std::vector<string>* graphemes);
// Applies just the OCR-specific normalizations and return the normalized char.
char32 OCRNormalize(char32 ch);
......
......@@ -122,8 +122,14 @@ void SetupBasicProperties(bool report_errors, bool decompose,
}
// Record normalized version of this unichar.
string normed_str = tesseract::NormalizeUTF8String(decompose, unichar_str);
if (unichar_id != 0 && !normed_str.empty()) {
string normed_str;
if (unichar_id != 0 &&
tesseract::NormalizeUTF8String(
decompose ? tesseract::UnicodeNormMode::kNFKD
: tesseract::UnicodeNormMode::kNFKC,
tesseract::OCRNorm::kNormalize, tesseract::GraphemeNorm::kNone,
unichar_str, &normed_str) &&
!normed_str.empty()) {
unicharset->set_normed(unichar_id, normed_str.c_str());
} else {
unicharset->set_normed(unichar_id, unichar_str);
......
#include "validate_grapheme.h"
#include "tprintf.h"
#include "unicode/uchar.h" // From libicu
namespace tesseract {
bool ValidateGrapheme::ConsumeGraphemeIfValid() {
int num_codes = codes_.size();
char32 prev_prev_ch = ' ';
char32 prev_ch = ' ';
CharClass prev_cc = CharClass::kWhitespace;
int num_codes_in_grapheme = 0;
while (codes_used_ < num_codes) {
CharClass cc = codes_[codes_used_].first;
char32 ch = codes_[codes_used_].second;
const bool is_combiner =
cc == CharClass::kCombiner || cc == CharClass::kVirama;
// Reject easily detected badly formed sequences.
if (prev_cc == CharClass::kWhitespace && is_combiner) {
if (report_errors_) tprintf("Word started with a combiner:0x%x\n", ch);
return false;
}
if (prev_cc == CharClass::kVirama && cc == CharClass::kVirama) {
if (report_errors_)
tprintf("Two grapheme links in a row:0x%x 0x%x\n", prev_ch, ch);
return false;
}
if (prev_cc != CharClass::kWhitespace && cc != CharClass::kWhitespace &&
IsBadlyFormed(prev_ch, ch)) {
return false;
}
bool prev_is_fwd_combiner =
prev_ch == kZeroWidthJoiner || prev_cc == CharClass::kVirama ||
(prev_ch == kZeroWidthNonJoiner &&
(cc == CharClass::kVirama || prev_prev_ch == kZeroWidthJoiner));
if (num_codes_in_grapheme > 0 && !is_combiner && !prev_is_fwd_combiner)
break;
CodeOnlyToOutput();
++num_codes_in_grapheme;
prev_prev_ch = prev_ch;
prev_ch = ch;
prev_cc = cc;
}
if (num_codes_in_grapheme > 0) MultiCodePart(num_codes_in_grapheme);
return true;
}
// Classifies ch for generic grapheme segmentation using ICU properties.
Validator::CharClass ValidateGrapheme::UnicodeToCharClass(char32 ch) const {
  if (IsVedicAccent(ch)) return CharClass::kVedicMark;
  if (u_hasBinaryProperty(ch, UCHAR_GRAPHEME_LINK)) return CharClass::kVirama;
  if (u_isUWhiteSpace(ch)) return CharClass::kWhitespace;
  // The ZeroWidth[Non]Joiner characters are mapped to kCombiner as they
  // always combine with the previous character.
  if (ch == kZeroWidthNonJoiner || ch == kZeroWidthJoiner) {
    return CharClass::kCombiner;
  }
  // All three kinds of unicode mark also combine with the previous character.
  const int char_type = u_charType(ch);
  switch (char_type) {
    case U_NON_SPACING_MARK:
    case U_ENCLOSING_MARK:
    case U_COMBINING_SPACING_MARK:
      return CharClass::kCombiner;
    default:
      return CharClass::kOther;
  }
}
// Helper returns true if the sequence prev_ch,ch is invalid according to the
// Indic vowel rules or the Thai rules, reporting which rule failed when
// report_errors_ is set.
bool ValidateGrapheme::IsBadlyFormed(char32 prev_ch, char32 ch) {
  // Reject badly formed Indic vowels.
  if (IsBadlyFormedIndicVowel(prev_ch, ch)) {
    if (report_errors_) {
      tprintf("Badly formed Indic vowel sequence:0x%x 0x%x\n", prev_ch, ch);
    }
    return true;
  }
  // Only the Thai rules remain; a pair that passes them is acceptable.
  if (!IsBadlyFormedThai(prev_ch, ch)) return false;
  if (report_errors_) {
    tprintf("Badly formed Thai:0x%x 0x%x\n", prev_ch, ch);
  }
  return true;
}
// Helper returns true if the sequence prev_ch,ch is an invalid Indic vowel.
// Some vowels in Indic scripts may be analytically decomposed into atomic pairs
// of components that are themselves valid unicode symbols. (See Table 12-1 in
// http://www.unicode.org/versions/Unicode9.0.0/ch12.pdf
// for examples in Devanagari.) The Unicode standard discourages specifying
// vowels this way, but they are sometimes encountered in text, probably because
// some editors still permit it. Renderers however dislike such pairs, and so
// this function may be used to detect their occurrence for removal.
// TODO(rays) This function only covers a subset of Indic languages and doesn't
// include all rules. Add rules as appropriate to support other languages or
// find a way to generalize these existing rules that makes use of the
// regularity of the mapping from ISCII to Unicode.
/* static */
bool ValidateGrapheme::IsBadlyFormedIndicVowel(char32 prev_ch, char32 ch) {
  // Each case lists the followers that are rejected after that first code.
  switch (prev_ch) {
    case 0x905:
      return ch == 0x946 || ch == 0x93E || (0x949 <= ch && ch <= 0x94C);
    case 0x906:
      return 0x949 <= ch && ch <= 0x94C;
    case 0x909:
      return ch == 0x941;
    case 0x90F:
      return 0x945 <= ch && ch <= 0x947;
    case 0x93E:
      // Illegal combinations of two dependent Devanagari vowels.
      return 0x945 <= ch && ch <= 0x948;
    case 0x94D:
      // Dependent Devanagari vowels following a virama.
      return 0x93E <= ch && ch <= 0x94C;
    case 0x985:
      // Bengali vowels (Table 9-5, pg 313)
      return ch == 0x9BE;
    case 0xC12:
      // Telugu vowels (Table 9-19, pg 331)
      return ch == 0xC55 || ch == 0xC4C;
    case 0xC92:
      // Kannada vowels (Table 9-20, pg 332)
      return ch == 0xCCC;
    default:
      return false;
  }
}
// Helper returns true if ch is a Thai consonant (0xe01 to 0xe2e inclusive).
static bool IsThaiConsonant(char32 ch) {
  return ch >= 0xe01 && ch <= 0xe2e;
}
// Helper returns true if ch is a Thai vowel that is written before its
// consonant (0xe40 to 0xe44 inclusive).
static bool IsThaiBeforeConsonantVowel(char32 ch) {
  return ch >= 0xe40 && ch <= 0xe44;
}
// Helper returns true if ch is a Thai tone mark (0xe48 to 0xe4b inclusive).
static bool IsThaiToneMark(char32 ch) {
  return ch >= 0xe48 && ch <= 0xe4b;
}
// Helper returns true if ch is a Thai vowel that may be followed by a tone
// mark: 0xe31, or anything from 0xe34 to 0xe39 inclusive.
static bool IsThaiTonableVowel(char32 ch) {
  if (ch == 0xe31) return true;
  return ch >= 0xe34 && ch <= 0xe39;
}
// Helper returns true if the sequence prev_ch,ch is invalid Thai.
// These rules come from a native Thai speaker, and are not covered by the
// Thai section in the unicode book:
// http://www.unicode.org/versions/Unicode9.0.0/ch16.pdf
// Comments below added by Ray interpreting the code ranges.
/* static */
bool ValidateGrapheme::IsBadlyFormedThai(char32 prev_ch, char32 ch) {
  // Most rules ask the same two questions about the previous character.
  const bool after_consonant = IsThaiConsonant(prev_ch);
  const bool after_tone_mark = IsThaiToneMark(prev_ch);

  // Tone marks must follow consonants or specific vowels.
  if (IsThaiToneMark(ch) && !after_consonant && !IsThaiTonableVowel(prev_ch)) {
    return true;
  }
  // Tonable vowels must follow consonants.
  if (!after_consonant && (IsThaiTonableVowel(ch) || ch == 0xe47)) {
    return true;
  }
  // Thanthakhat must follow consonant or specific vowels.
  if (ch == 0xe4c && !after_consonant && prev_ch != 0xe38 &&
      prev_ch != 0xe34) {
    return true;
  }
  // Nikkhahit must follow a consonant ?or certain markers?.
  // TODO(rays) confirm this, but there were so many in the ground truth of the
  // validation set that it seems reasonable to assume it is valid.
  if (ch == 0xe4d && !after_consonant && prev_ch != 0xe48 &&
      prev_ch != 0xe49) {
    return true;
  }
  // The vowels e30, e32, e33 can be used more liberally.
  if (ch == 0xe30 || ch == 0xe32 || ch == 0xe33) {
    const bool allowed = after_consonant || after_tone_mark ||
                         (prev_ch == 0xe32 && ch == 0xe30) ||
                         (prev_ch == 0xe4d && ch == 0xe32);
    if (!allowed) return true;
  }
  // Some vowels come before consonants, and therefore cannot follow things
  // that cannot end a syllable.
  if (IsThaiBeforeConsonantVowel(ch) &&
      (IsThaiBeforeConsonantVowel(prev_ch) || prev_ch == 0xe31 ||
       prev_ch == 0xe37)) {
    return true;
  }
  // Don't allow the standalone vowel U+0e24 to be followed by other vowels.
  return prev_ch == 0xe24 && 0xe30 <= ch && ch <= 0xe4D;
}
} // namespace tesseract
#ifndef TESSERACT_TRAINING_VALIDATE_GRAPHEME_H_
#define TESSERACT_TRAINING_VALIDATE_GRAPHEME_H_
#include "validator.h"
namespace tesseract {
// Subclass of Validator that validates and segments generic unicode into
// grapheme clusters, including Latin with diacritics.
// It also applies pairwise checks for badly formed Indic vowels and Thai.
class ValidateGrapheme : public Validator {
public:
// Both arguments are forwarded unchanged to the Validator base class.
ValidateGrapheme(ViramaScript script, bool report_errors)
: Validator(script, report_errors) {}
~ValidateGrapheme() {}
protected:
// Consumes the next Grapheme in codes_[codes_used_++...] and copies it to
// parts_ and output_. Returns true if a valid Grapheme was consumed,
// otherwise does not increment codes_used_.
bool ConsumeGraphemeIfValid() override;
// Returns the CharClass corresponding to the given Unicode ch.
CharClass UnicodeToCharClass(char32 ch) const override;
private:
// Helper returns true if the sequence prev_ch,ch is invalid.
// Not static because it reads report_errors_ to decide whether to print.
bool IsBadlyFormed(char32 prev_ch, char32 ch);
// Helper returns true if the sequence prev_ch,ch is an invalid Indic vowel.
static bool IsBadlyFormedIndicVowel(char32 prev_ch, char32 ch);
// Helper returns true if the sequence prev_ch,ch is invalid Thai.
static bool IsBadlyFormedThai(char32 prev_ch, char32 ch);
};
} // namespace tesseract
#endif // TESSERACT_TRAINING_VALIDATE_GRAPHEME_H_
#include "validate_indic.h"
#include "errcode.h"
#include "tprintf.h"
namespace tesseract {
// Returns whether codes matches the pattern for an Indic Grapheme.
// The ISCII standard http://varamozhi.sourceforge.net/iscii91.pdf
// has a BNF for valid syllables (Graphemes) which is modified slightly
// for Unicode. Notably U+200C and U+200D are used before/after the
// virama to express explicit or soft viramas.
// Also the unicode v.9 Malayalam entry states that CZHC can be used in several
// Indic languages to request traditional ligatures, and CzHC is Malayalam-
// specific for requesting open conjuncts.
//
// + vowel Grapheme: V[D](v)*
// + consonant Grapheme: (C[N](H|HZ|Hz|ZH)?)*C[N](H|Hz)?[M[P]][D](v)*
//
// Dispatches on the class of the code at codes_[codes_used_]. Returns false,
// reporting the offending code if report_errors_ is set, when that class
// cannot start a grapheme.
bool ValidateIndic::ConsumeGraphemeIfValid() {
  switch (codes_[codes_used_].first) {
    case CharClass::kConsonant:
      return ConsumeConsonantHeadIfValid() && ConsumeConsonantTailIfValid();
    case CharClass::kVowel:
      return ConsumeVowelIfValid();
    case CharClass::kZeroWidthJoiner:
    case CharClass::kZeroWidthNonJoiner:
      // Apart from within an aksara, joiners are silently dropped.
      if (report_errors_)
        tprintf("Dropping isolated joiner: 0x%x\n", codes_[codes_used_].second);
      ++codes_used_;
      return true;
    case CharClass::kOther:
      UseMultiCode(1);
      return true;
    default:
      if (report_errors_) {
        // CharClass is a scoped enum, which is not promoted to int when
        // passed through a C varargs list, so convert it explicitly to match
        // the %c conversion.
        tprintf("Invalid start of grapheme sequence:%c=0x%x\n",
                static_cast<int>(codes_[codes_used_].first),
                codes_[codes_used_].second);
      }
      return false;
  }
}
// Returns the CharClass of ch for the Indic grapheme grammar. Apart from the
// Vedic accents and the two joiners, the class is decided by the offset of ch
// within the code block of script_. The tests are order-dependent: each one
// relies on the lower offsets having already returned.
Validator::CharClass ValidateIndic::UnicodeToCharClass(char32 ch) const {
// These are classified before the code block is considered.
if (IsVedicAccent(ch)) return CharClass::kVedicMark;
if (ch == kZeroWidthNonJoiner) return CharClass::kZeroWidthNonJoiner;
if (ch == kZeroWidthJoiner) return CharClass::kZeroWidthJoiner;
// Offset from the start of the relevant unicode code block aka code page.
int base = static_cast<char32>(script_);
int off = ch - base;
// Anything in another code block is other.
if (off < 0 || off >= kIndicCodePageSize) return CharClass::kOther;
// Exception for Tamil. The aytham character is considered a letter.
if (script_ == ViramaScript::kTamil && off == 0x03) return CharClass::kVowel;
// The remaining offsets below 4 are vowel modifiers.
if (off < 0x4) return CharClass::kVowelModifier;
if (script_ == ViramaScript::kSinhala) {
// Sinhala is an exception.
if (off <= 0x19) return CharClass::kVowel;
if (off <= 0x49) return CharClass::kConsonant;
if (off == 0x4a) return CharClass::kVirama;
if (off <= 0x5f) return CharClass::kMatra;
} else {
// Layout shared by the other scripts.
if (off <= 0x14 || off == 0x50) return CharClass::kVowel;
if (off <= 0x3b || (0x58 <= off && off <= 0x5f))
return CharClass::kConsonant;
// Sinhala doesn't have Nukta or Avagraha.
if (off == 0x3c) return CharClass::kNukta;
if (off == 0x3d) return CharClass::kVowel;
if (off <= 0x4c || (0x51 <= off && off <= 0x54)) return CharClass::kMatra;
if (0x55 <= off && off <= 0x57) return CharClass::kMatraPiece;
if (off == 0x4d) return CharClass::kVirama;
}
// Offsets that fall through both branches above are handled in common.
if (off == 0x60 || off == 0x61) return CharClass::kVowel;
if (off == 0x62 || off == 0x63) return CharClass::kMatra;
// Danda and digits up to 6f are OK as other.
// 70-7f are script-specific.
if (script_ == ViramaScript::kBengali && (off == 0x70 || off == 0x71))
return CharClass::kConsonant;
if (script_ == ViramaScript::kGurmukhi && (off == 0x72 || off == 0x73))
return CharClass::kConsonant;
if (script_ == ViramaScript::kSinhala && off == 0x70)
return CharClass::kConsonant;
if (script_ == ViramaScript::kDevanagari && off == 0x70)
return CharClass::kOther;
// Any 70-73 not claimed by a script-specific rule is a vowel modifier.
if (0x70 <= off && off <= 0x73) return CharClass::kVowelModifier;
// Non Indic, Digits, Measures, danda, etc.
return CharClass::kOther;
}
// Helper consumes/copies a virama and any associated post-virama joiners.
// A linking virama (with either type of pre-virama joiner, post-virama ZWJ, or
// no joiner at all) must be followed by a consonant.
// A non-linking (explicit) virama is indicated by a ZWNJ after it, or a non
// consonant, space, or character from a different script. We clean up the
// representation to make it consistent by adding a ZWNJ if missing from a
// non-linking virama. Returns false with an invalid sequence.
// joiner is the pre-virama joiner, or a pair of class kOther if there was none.
// post_matra is passed as true by ConsumeConsonantTailIfValid and false by
// ConsumeConsonantHeadIfValid.
// NOTE(review): the callers treat a true return from UseMultiCode as "input
// exhausted" - confirm against validator.h.
bool ValidateIndic::ConsumeViramaIfValid(IndicPair joiner, bool post_matra) {
int num_codes = codes_.size();
if (joiner.first == CharClass::kOther) {
// No pre-virama joiner. Copy the virama itself to the output.
CodeOnlyToOutput();
if (codes_used_ < num_codes &&
codes_[codes_used_].second == kZeroWidthJoiner) {
// Post-matra viramas must be explicit, so no joiners allowed here.
if (post_matra) {
if (report_errors_) tprintf("ZWJ after a post-matra virama!!\n");
return false;
}
// NOTE(review): codes_used_ - 2 assumes at least two codes precede the
// ZWJ (presumably the consonant and the virama) - confirm.
if (codes_used_ + 1 < num_codes &&
codes_[codes_used_ - 2].second != kRayana &&
(codes_[codes_used_ + 1].second == kZeroWidthNonJoiner ||
codes_[codes_used_ + 1].second == kYayana ||
codes_[codes_used_ + 1].second == kRayana)) {
// This combination will be picked up later.
// The ZWJ is copied to the output here, inside the assertion.
ASSERT_HOST(!CodeOnlyToOutput());
} else {
// Half-form with optional Nukta.
// len covers the pending output plus one more code (the ZWJ).
int len = output_.size() + 1 - output_used_;
if (UseMultiCode(len)) return true;
}
if (codes_used_ < num_codes &&
codes_[codes_used_].second == kZeroWidthNonJoiner) {
// Virama ZWJ ZWNJ is accepted only when the pending output starts with
// the Sinhala Ra.
if (output_used_ == output_.size() ||
output_[output_used_] != kRayana) {
if (report_errors_) {
tprintf("Virama ZWJ ZWNJ in non-Sinhala: base=0x%x!\n",
static_cast<int>(script_));
}
return false;
}
// Special Sinhala case of Stand-alone Repaya. ['RA' H Z z]
if (UseMultiCode(4)) return true;
}
} else if (codes_used_ == num_codes ||
codes_[codes_used_].first != CharClass::kConsonant ||
post_matra) {
// The virama is not followed by a consonant, or comes after a matra.
if (codes_used_ == num_codes ||
codes_[codes_used_].second != kZeroWidthNonJoiner) {
// It is valid to have an unterminated virama at the end of a word, but
// for consistency, we will always add ZWNJ if not present.
output_.push_back(kZeroWidthNonJoiner);
} else {
// The ZWNJ is already in the input, so copy it across.
CodeOnlyToOutput();
}
// Explicit virama [H z]
MultiCodePart(2);
}
} else {
// Pre-virama joiner [{Z|z} H] requests specific conjunct.
if (UseMultiCode(2)) {
if (report_errors_)
tprintf("Invalid pre-virama joiner with no 2nd consonant!!\n");
return false;
}
// A joiner on both sides of the virama is rejected.
if (codes_[codes_used_].second == kZeroWidthJoiner ||
codes_[codes_used_].second == kZeroWidthNonJoiner) {
if (report_errors_) {
tprintf("JHJ!!: 0x%x 0x%x 0x%x\n", joiner.second, output_.back(),
codes_[codes_used_].second);
}
return false;
}
}
// It is good so far as it goes.
return true;
}
// Helper consumes/copies a series of consonants separated by viramas while
// valid, but not any vowel or other modifiers.
// Returns false only if ConsumeViramaIfValid rejects a virama sequence.
// The only caller in this file invokes it with codes_[codes_used_] already
// known to be a consonant.
bool ValidateIndic::ConsumeConsonantHeadIfValid() {
const int num_codes = codes_.size();
// Consonant aksara
do {
// Copy the consonant itself.
CodeOnlyToOutput();
// Special Sinhala case of [H Z Yayana/Rayana].
int index = output_.size() - 3;
if (output_used_ <= index &&
(output_.back() == kYayana || output_.back() == kRayana) &&
IsVirama(output_[index]) && output_[index + 1] == kZeroWidthJoiner) {
MultiCodePart(3);
}
// An optional nukta may follow the consonant.
bool have_nukta = false;
if (codes_used_ < num_codes &&
codes_[codes_used_].first == CharClass::kNukta) {
have_nukta = true;
CodeOnlyToOutput();
}
// Test for subscript conjunct.
// index is where a virama would sit just before this consonant (+nukta).
index = output_.size() - 2 - have_nukta;
if (output_used_ <= index && IsSubscriptScript() &&
IsVirama(output_[index])) {
// Output previous virama, consonant + optional nukta.
MultiCodePart(2 + have_nukta);
}
// Look for a pre-virama joiner: a ZWJ in any script, or a ZWNJ in
// Malayalam only. kOther means no joiner was found.
IndicPair joiner(CharClass::kOther, 0);
if (codes_used_ < num_codes &&
(codes_[codes_used_].second == kZeroWidthJoiner ||
(codes_[codes_used_].second == kZeroWidthNonJoiner &&
script_ == ViramaScript::kMalayalam))) {
joiner = codes_[codes_used_];
// A joiner that ends the input is dropped from the output.
if (++codes_used_ == num_codes) {
if (report_errors_) {
tprintf("Skipping ending joiner: 0x%x 0x%x\n", output_.back(),
joiner.second);
}
return true;
}
if (codes_[codes_used_].first == CharClass::kVirama) {
// Keep the joiner only when a virama follows it.
output_.push_back(joiner.second);
} else {
if (report_errors_) {
tprintf("Skipping unnecessary joiner: 0x%x 0x%x 0x%x\n",
output_.back(), joiner.second, codes_[codes_used_].second);
}
// The joiner is dropped, so carry on as if it had never been there.
joiner = std::make_pair(CharClass::kOther, 0);
}
}
if (codes_used_ < num_codes &&
codes_[codes_used_].first == CharClass::kVirama) {
if (!ConsumeViramaIfValid(joiner, false)) return false;
} else {
break; // No virama, so the run of consonants is over.
}
} while (codes_used_ < num_codes &&
codes_[codes_used_].first == CharClass::kConsonant);
// Record whatever was copied to the output but not yet grouped.
if (output_used_ < output_.size()) MultiCodePart(1);
return true;
}
// Helper consumes/copies a tail part of a consonant, comprising optional
// matra/piece, vowel modifier, vedic mark, terminating virama.
bool ValidateIndic::ConsumeConsonantTailIfValid() {
if (codes_used_ == codes_.size()) return true;
// No virama: Finish the grapheme.
// Are multiple matras allowed?
if (codes_[codes_used_].first == CharClass::kMatra) {
if (UseMultiCode(1)) return true;
if (codes_[codes_used_].first == CharClass::kMatraPiece) {
if (UseMultiCode(1)) return true;
}
}
while (codes_[codes_used_].first == CharClass::kVowelModifier) {
if (UseMultiCode(1)) return true;
// Only Malayalam allows only repeated 0xd02.
if (script_ != ViramaScript::kMalayalam || output_.back() != 0xd02) break;
}
while (codes_[codes_used_].first == CharClass::kVedicMark) {
if (UseMultiCode(1)) return true;
}
if (codes_[codes_used_].first == CharClass::kVirama) {
if (!ConsumeViramaIfValid(IndicPair(CharClass::kOther, 0), true)) {
return false;
}
}
// What we have consumed so far is a valid consonant cluster.
if (output_used_ < output_.size()) MultiCodePart(1);
return true;
}
// Helper consumes/copies a vowel and optional modifiers: the vowel itself,
// then vowel modifier(s), then any number of vedic marks. Always returns true.
bool ValidateIndic::ConsumeVowelIfValid() {
  // The vowel itself.
  if (UseMultiCode(1)) return true;
  // Only Malayalam allows repeated modifiers?
  const bool repeat_modifiers = script_ == ViramaScript::kMalayalam;
  while (codes_[codes_used_].first == CharClass::kVowelModifier) {
    if (UseMultiCode(1)) return true;
    if (!repeat_modifiers) break;
  }
  for (;;) {
    if (codes_[codes_used_].first != CharClass::kVedicMark) break;
    if (UseMultiCode(1)) return true;
  }
  // What we have consumed so far is a valid vowel cluster.
  return true;
}
} // namespace tesseract
#ifndef TESSERACT_TRAINING_VALIDATE_INDIC_H_
#define TESSERACT_TRAINING_VALIDATE_INDIC_H_
#include "validator.h"
namespace tesseract {
// Subclass of Validator that validates and segments Indic scripts in the
// unicode range 0x900-0xdff (Devanagari-Sinhala).
class ValidateIndic : public Validator {
public:
// Both arguments are forwarded unchanged to the Validator base class.
ValidateIndic(ViramaScript script, bool report_errors)
: Validator(script, report_errors) {}
~ValidateIndic() {}
protected:
// Returns whether codes matches the pattern for an Indic Grapheme.
// Consumes the next Grapheme in codes_[codes_used_++...] and copies it to
// parts_ and output_. Returns true if a valid Grapheme was consumed,
// otherwise does not increment codes_used_.
bool ConsumeGraphemeIfValid() override;
// Returns the CharClass corresponding to the given Unicode ch.
Validator::CharClass UnicodeToCharClass(char32 ch) const override;
private:
// Helper consumes/copies a virama and any associated post-virama joiners.
// joiner is the pre-virama joiner (class kOther if none); post_matra is true
// when the virama comes from the consonant tail.
bool ConsumeViramaIfValid(IndicPair joiner, bool post_matra);
// Helper consumes/copies a series of consonants separated by viramas while
// valid, but not any vowel or other modifiers.
bool ConsumeConsonantHeadIfValid();
// Helper consumes/copies a tail part of a consonant, comprising optional
// matra/piece, vowel modifier, vedic mark, terminating virama.
bool ConsumeConsonantTailIfValid();
// Helper consumes/copies a vowel and optional modifiers.
bool ConsumeVowelIfValid();
// Some special unicodes used only for Indic processing.
// Both are used by the Sinhala special cases in the virama handling.
static const char32 kYayana = 0xdba; // Sinhala Ya
static const char32 kRayana = 0xdbb; // Sinhala Ra
};
} // namespace tesseract
#endif // TESSERACT_TRAINING_VALIDATE_INDIC_H_
#include "validate_khmer.h"
#include "errcode.h"
#include "tprintf.h"
namespace tesseract {
// Returns whether codes matches the pattern for a Khmer Grapheme.
// Taken from unicode standard:
// http://www.unicode.org/versions/Unicode9.0.0/ch16.pdf.
// where it gives: B {R | C} {S {R}}* {{Z} V} {O} {S}, using different notation
// to the ISCII standard http://varamozhi.sourceforge.net/iscii91.pdf.
// Translated to the codes used by the CharClass enum:
// C {R | N} {HC {R}}* {{Z|z} M{P}} {D} {HC}
// Where R is a new symbol (Robat) and N is repurposed as a consonant shifter.
// Also the Consonant class here includes independent vowels, as they are
// treated the same anyway.
// In the split grapheme mode, the only characters that get grouped are the
// HC and the {Z|z}M The unicode chapter on Khmer only mentions the joiners in
// the BNF syntax, so who knows what they do.
bool ValidateKhmer::ConsumeGraphemeIfValid() {
int num_codes = codes_.size();
if (codes_used_ == num_codes) return false;
if (codes_[codes_used_].first == CharClass::kOther) {
UseMultiCode(1);
return true;
}
if (codes_[codes_used_].first != CharClass::kConsonant) {
if (report_errors_) {
tprintf("Invalid start of Khmer syllable:0x%x\n",
codes_[codes_used_].second);
}
return false;
}
if (UseMultiCode(1)) return true;
if (codes_[codes_used_].first == CharClass::kRobat ||
codes_[codes_used_].first == CharClass::kNukta) {
if (UseMultiCode(1)) return true;
}
while (codes_used_ + 1 < num_codes &&
codes_[codes_used_].first == CharClass::kVirama &&
codes_[codes_used_ + 1].first == CharClass::kConsonant) {
ASSERT_HOST(!CodeOnlyToOutput());
if (UseMultiCode(2)) return true;
if (codes_[codes_used_].first == CharClass::kRobat) {
if (UseMultiCode(1)) return true;
}
}
int num_matra_parts = 0;
if (codes_[codes_used_].second == kZeroWidthJoiner ||
codes_[codes_used_].second == kZeroWidthNonJoiner) {
if (CodeOnlyToOutput()) {
if (report_errors_) {
tprintf("Unterminated joiner: 0x%x\n", output_.back());
}
return false;
}
++num_matra_parts;
}
// Not quite as shown by the BNF, the matra piece is allowed as a matra on its
// own or as an addition to other matras.
if (codes_[codes_used_].first == CharClass::kMatra ||
codes_[codes_used_].first == CharClass::kMatraPiece) {
++num_matra_parts;
if (UseMultiCode(num_matra_parts)) return true;
} else if (num_matra_parts) {
if (report_errors_) {
tprintf("Joiner with non-dependent vowel after it!:0x%x 0x%x\n",
output_.back(), codes_[codes_used_].second);
}
return false;
}
if (codes_[codes_used_].first == CharClass::kMatraPiece &&
codes_[codes_used_ - 1].first != CharClass::kMatraPiece) {
if (UseMultiCode(1)) return true;
}
if (codes_[codes_used_].first == CharClass::kVowelModifier) {
if (UseMultiCode(1)) return true;
}
if (codes_used_ + 1 < num_codes &&
codes_[codes_used_].first == CharClass::kVirama &&
codes_[codes_used_ + 1].first == CharClass::kConsonant) {
ASSERT_HOST(!CodeOnlyToOutput());
if (UseMultiCode(2)) return true;
}
return true;
}
// Maps the unicode ch to the CharClass used by the Khmer grammar. Vedic
// accents and the zero-width joiners are recognized first; everything else is
// classified by its offset within the script's code block.
Validator::CharClass ValidateKhmer::UnicodeToCharClass(char32 ch) const {
  if (IsVedicAccent(ch)) {
    return CharClass::kVedicMark;
  }
  if (ch == kZeroWidthNonJoiner) {
    return CharClass::kZeroWidthNonJoiner;
  }
  if (ch == kZeroWidthJoiner) {
    return CharClass::kZeroWidthJoiner;
  }
  // Position relative to the start of the script's unicode code block (aka
  // code page).
  const int offset = ch - static_cast<char32>(script_);
  // Codes from any other code block are "other".
  if (offset < 0 || offset >= kIndicCodePageSize) {
    return CharClass::kOther;
  }
  if (offset <= 0x33) {
    return CharClass::kConsonant;
  }
  if (offset <= 0x45) {
    return CharClass::kMatra;
  }
  // Classes that are assigned to individual offsets.
  switch (offset) {
    case 0x46:
      return CharClass::kMatraPiece;
    case 0x49:
    case 0x4a:
      return CharClass::kNukta;
    case 0x4c:
      return CharClass::kRobat;
    case 0x52:
      return CharClass::kVirama;
    default:
      break;
  }
  // The remaining offsets up to 0x51 are vowel modifiers.
  return offset <= 0x51 ? CharClass::kVowelModifier : CharClass::kOther;
}
} // namespace tesseract
#ifndef TESSERACT_TRAINING_VALIDATE_KHMER_H_
#define TESSERACT_TRAINING_VALIDATE_KHMER_H_
#include "validator.h"
namespace tesseract {
// Validator subclass that validates Khmer text and segments it into graphemes.
class ValidateKhmer : public Validator {
 public:
  ValidateKhmer(ViramaScript script, bool report_errors)
      : Validator(script, report_errors) {}
  ~ValidateKhmer() = default;

 protected:
  // Tests whether the codes starting at codes_[codes_used_] match the pattern
  // of a Khmer grapheme. A valid grapheme is consumed and copied to parts_
  // and output_, and true is returned. Otherwise codes_used_ is left
  // unchanged.
  bool ConsumeGraphemeIfValid() override;
  // Maps the unicode ch to its CharClass.
  CharClass UnicodeToCharClass(char32 ch) const override;
};
} // namespace tesseract
#endif // TESSERACT_TRAINING_VALIDATE_KHMER_H_
#include "validate_myanmar.h"
#include "errcode.h"
#include "icuerrorcode.h"
#include "tprintf.h"
#include "unicode/uchar.h" // From libicu
#include "unicode/uscript.h" // From libicu
namespace tesseract {
// Consumes one Myanmar grapheme starting at codes_[codes_used_], copying it to
// output_ and parts_, and returns whether the codes matched the pattern.
// The pattern is taken directly from the unicode table 16-3.
// See http://www.unicode.org/versions/Unicode9.0.0/ch16.pdf
bool ValidateMyanmar::ConsumeGraphemeIfValid() {
  const int total_codes = codes_.size();
  if (codes_used_ == total_codes) {
    return true;
  }
  // Digits, symbols and non-Myanmar codes pass through as single units.
  if (IsMyanmarOther(codes_[codes_used_].second)) {
    UseMultiCode(1);
    return true;
  }
  // Kinzi: 0x1004 + asat + virama, kept together as one three-code part.
  const bool kinzi_follows =
      codes_used_ + 2 < total_codes && codes_[codes_used_].second == 0x1004 &&
      codes_[codes_used_ + 1].second == kMyanmarAsat &&
      codes_[codes_used_ + 2].second == kMyanmarVirama;
  if (kinzi_follows) {
    ASSERT_HOST(!CodeOnlyToOutput());
    ASSERT_HOST(!CodeOnlyToOutput());
    if (UseMultiCode(3)) {
      return true;
    }
  }
  // Base consonant/vowel. NOTE that everything in Myanmar apart from the base
  // appears to be optional, so this is the only point at which invalid input
  // can be detected and false returned.
  if (!IsMyanmarLetter(codes_[codes_used_].second)) {
    if (report_errors_) {
      tprintf("Invalid start of Myanmar syllable:0x%x\n",
              codes_[codes_used_].second);
    }
    return false;  // A base letter is required.
  }
  if (UseMultiCode(1)) {
    return true;
  }
  // The optional signs are only examined if the subscript did not already
  // reach the end of the input.
  if (!ConsumeSubscriptIfPresent()) {
    ConsumeOptionalSignsIfPresent();
  }
  // Whatever has been consumed is a valid syllable.
  return true;
}
// Maps the unicode ch to a CharClass: kConsonant for any Myanmar letter and
// kOther for everything else.
// TODO(rays) Unlike the other scripts, no intermediate coding is used here,
// because the content of table 16-3 has little in common with the char
// classes of the Indic languages. (Experts may disagree and improve!)
// Unicode table 16-3 is basically a long list of optional characters, which
// is quite easy to code directly.
// Unfortunately, table 16-3 doesn't include even half the Myanmar unicodes!!
// The table also allows sequences that still result in dotted circles!!
// The remaining unicodes have therefore been added in a reasonable place,
// with a lot of guesswork.
Validator::CharClass ValidateMyanmar::UnicodeToCharClass(char32 ch) const {
  return IsMyanmarLetter(ch) ? CharClass::kConsonant : CharClass::kOther;
}
// Helper that consumes/copies a virama followed by a subscript consonant, if
// that pair is next in the input. The two codes form a single part.
// Returns true if the end of input is reached, false otherwise (including
// when no subscript is present).
bool ValidateMyanmar::ConsumeSubscriptIfPresent() {
  // Subscript consonant. It appears there can be only one.
  const int total_codes = codes_.size();
  const bool pair_available = codes_used_ + 1 < total_codes;
  if (!pair_available || codes_[codes_used_].second != kMyanmarVirama) {
    return false;
  }
  if (!IsMyanmarLetter(codes_[codes_used_ + 1].second)) {
    return false;
  }
  // The virama goes to output_ only, so it is grouped with the letter.
  ASSERT_HOST(!CodeOnlyToOutput());
  return UseMultiCode(2);
}
// Helper consumes/copies a series of optional signs.
// Returns true if the end of input is reached.
bool ValidateMyanmar::ConsumeOptionalSignsIfPresent() {
// The following characters are allowed, all optional, and in sequence.
// An exception is kMyanmarMedialYa, which can include kMyanmarAsat.
const std::vector<char32> kMedials({kMyanmarAsat, kMyanmarMedialYa, 0x103c,
0x103d, 0x103e, 0x105e, 0x105f, 0x1060,
0x1081, 0x1031});
for (char32 ch : kMedials) {
if (codes_[codes_used_].second == ch) {
if (UseMultiCode(1)) return true;
if (ch == kMyanmarMedialYa &&
codes_[codes_used_].second == kMyanmarAsat) {
if (UseMultiCode(1)) return true;
}
}
}
// Vowel sign i, ii, ai.
char32 ch = codes_[codes_used_].second;
if (ch == 0x102d || ch == 0x102e || ch == 0x1032) {
if (UseMultiCode(1)) return true;
}
// Vowel sign u, uu, and extensions.
ch = codes_[codes_used_].second;
if (ch == 0x102f || ch == 0x1030 || (0x1056 <= ch && ch <= 0x1059) ||
ch == 0x1062 || ch == 0x1067 || ch == 0x1068 ||
(0x1071 <= ch && ch <= 0x1074) || (0x1083 <= ch && ch <= 0x1086) ||
ch == 0x109c || ch == 0x109d) {
if (UseMultiCode(1)) return true;
}
// Tall aa, aa with optional asat.
if (codes_[codes_used_].second == 0x102b ||
codes_[codes_used_].second == 0x102c) {
if (UseMultiCode(1)) return true;
if (codes_[codes_used_].second == kMyanmarAsat) {
if (UseMultiCode(1)) return true;
}
}
// The following characters are allowed, all optional, and in sequence.
const std::vector<char32> kSigns({0x1036, 0x1037});
for (char32 ch : kSigns) {
if (codes_[codes_used_].second == ch) {
if (UseMultiCode(1)) return true;
}
}
// Tone mark extensions.
ch = codes_[codes_used_].second;
if (ch == 0x1038 || ch == kMyanmarAsat || ch == 0x1063 || ch == 0x1064 ||
(0x1069 <= ch && ch <= 0x106d) || (0x1087 <= ch && ch <= 0x108d) ||
ch == 0x108f || ch == 0x109a || ch == 0x109b ||
(0xaa7b <= ch && ch <= 0xaa7d)) {
if (UseMultiCode(1)) return true;
}
return false;
}
// Returns true if ch is a Myanmar "letter", meaning a consonant or an
// independent vowel. Table 16-3 separates some base consonants from vowels,
// but the extensions do not, so all of them share a single bucket here.
/* static */
bool ValidateMyanmar::IsMyanmarLetter(char32 ch) {
  // Inclusive unicode ranges that hold letters.
  struct CodeRange {
    char32 first;
    char32 last;
  };
  static const CodeRange kLetterRanges[] = {
      {0x1000, 0x102a}, {0x103f, 0x103f}, {0x1050, 0x1055}, {0x105a, 0x105d},
      {0x1061, 0x1061}, {0x1065, 0x1066}, {0x106e, 0x1070}, {0x1075, 0x1080},
      {0x108e, 0x108e}, {0xa9e0, 0xa9ef}, {0xa9fa, 0xa9ff}, {0xaa60, 0xaa73},
      {0xaa7a, 0xaa7a}, {0xaa7e, 0xaa7f}};
  for (const CodeRange& range : kLetterRanges) {
    if (range.first <= ch && ch <= range.last) {
      return true;
    }
  }
  return false;
}
// Returns true if ch takes no part in forming a syllable: either it is not
// in the Myanmar script at all (the zero-width joiners excepted), or it is a
// Myanmar digit or other symbol.
/* static */
bool ValidateMyanmar::IsMyanmarOther(char32 ch) {
  IcuErrorCode err;
  UScriptCode script_code = uscript_getScript(ch, err);
  const bool is_joiner = ch == Validator::kZeroWidthJoiner ||
                         ch == Validator::kZeroWidthNonJoiner;
  if (script_code != USCRIPT_MYANMAR && !is_joiner) {
    return true;
  }
  // Ranges of digits and symbols.
  if (0x1040 <= ch && ch <= 0x1049) {
    return true;
  }
  if (0x1090 <= ch && ch <= 0x1099) {
    return true;
  }
  if (0x109c <= ch && ch <= 0x109d) {
    return true;
  }
  if (0xa9f0 <= ch && ch <= 0xa9f9) {
    return true;
  }
  return 0xaa74 <= ch && ch <= 0xaa79;
}
} // namespace tesseract
#ifndef TESSERACT_TRAINING_VALIDATE_MYANMAR_H_
#define TESSERACT_TRAINING_VALIDATE_MYANMAR_H_
#include "validator.h"
namespace tesseract {
// Validator subclass that validates Myanmar text and segments it into
// graphemes.
class ValidateMyanmar : public Validator {
 public:
  ValidateMyanmar(ViramaScript script, bool report_errors)
      : Validator(script, report_errors) {}
  ~ValidateMyanmar() = default;

 protected:
  // Tests whether the codes starting at codes_[codes_used_] match the pattern
  // of a Myanmar grapheme. A valid grapheme is consumed and copied to parts_
  // and output_, and true is returned. Otherwise codes_used_ is left
  // unchanged.
  bool ConsumeGraphemeIfValid() override;
  // Maps the unicode ch to its CharClass.
  Validator::CharClass UnicodeToCharClass(char32 ch) const override;

 private:
  // Consumes/copies a virama and any subscript consonant.
  // Returns true if the end of input is reached.
  bool ConsumeSubscriptIfPresent();
  // Consumes/copies a series of optional signs.
  // Returns true if the end of input is reached.
  bool ConsumeOptionalSignsIfPresent();
  // Returns true if ch is a Myanmar "letter", meaning a consonant or an
  // independent vowel. Table 16-3 separates some base consonants from vowels,
  // but the extensions do not, so all of them share a single bucket.
  static bool IsMyanmarLetter(char32 ch);
  // Returns true if ch is a Myanmar digit or other symbol that takes no part
  // in forming a syllable.
  static bool IsMyanmarOther(char32 ch);

  // Special unicodes needed only for Myanmar processing.
  static const char32 kMyanmarAsat = 0x103a;
  static const char32 kMyanmarMedialYa = 0x103b;
};
} // namespace tesseract
#endif // TESSERACT_TRAINING_VALIDATE_MYANMAR_H_
#include "validator.h"
#include <algorithm>
#include <unordered_map>
#include <vector>
#include "icuerrorcode.h"
#include "unicode/uchar.h" // From libicu
#include "unicode/uscript.h" // From libicu
#include "validate_grapheme.h"
#include "validate_indic.h"
#include "validate_khmer.h"
#include "validate_myanmar.h"
namespace tesseract {
// Some specific but universally useful unicodes.
// Out-of-class definitions of the static constants declared in Validator.
// U+200B: zero-width space.
const char32 Validator::kZeroWidthSpace = 0x200B;
// U+200C: zero-width non-joiner.
const char32 Validator::kZeroWidthNonJoiner = 0x200C;
// U+200D: zero-width joiner.
const char32 Validator::kZeroWidthJoiner = 0x200D;
// U+200E: left-to-right mark.
const char32 Validator::kLeftToRightMark = 0x200E;
// U+200F: right-to-left mark.
const char32 Validator::kRightToLeftMark = 0x200F;
// U+FFFD: code used to stand for an invalid unicode.
const char32 Validator::kInvalid = 0xfffd;
// Validates and cleans the unicodes in src, appending the result to *dest in
// the form selected by g_mode:
//   kSingleString: one vector holding the whole result.
//   kCombined: one vector per grapheme.
//   kGlyphSplit: one vector per smaller, glyph-like unit.
// The script-specific validator is chosen from the most frequent virama-using
// script in src. Text with no such script gets grapheme segmentation only.
// On a validation error, returns false but still outputs as much of the input
// as possible, without discarding invalid text.
/* static */
bool Validator::ValidateCleanAndSegment(
    GraphemeNormMode g_mode, bool report_errors, const std::vector<char32>& src,
    std::vector<std::vector<char32>>* dest) {
  ValidateGrapheme grapheme_validator(ViramaScript::kNonVirama, report_errors);
  const ViramaScript script = MostFrequentViramaScript(src);
  if (script == ViramaScript::kNonVirama) {
    // The finest unit the grapheme segmenter produces by itself is the
    // grapheme, so move the mode up one step to get the requested effect.
    switch (g_mode) {
      case GraphemeNormMode::kCombined:
        g_mode = GraphemeNormMode::kGlyphSplit;
        break;
      case GraphemeNormMode::kGlyphSplit:
        g_mode = GraphemeNormMode::kIndividualUnicodes;
        break;
      default:
        break;
    }
    // Grapheme segmentation is all that is needed.
    return grapheme_validator.ValidateCleanAndSegmentInternal(g_mode, src,
                                                              dest);
  }
  // First cut the input into graphemes, then run the script-specific
  // validator over each of them in turn.
  std::vector<std::vector<char32>> graphemes;
  bool all_valid = grapheme_validator.ValidateCleanAndSegmentInternal(
      GraphemeNormMode::kGlyphSplit, src, &graphemes);
  std::unique_ptr<Validator> script_validator(
      ScriptValidator(script, report_errors));
  for (const auto& grapheme : graphemes) {
    // The call comes first so that every grapheme is processed, even after a
    // failure.
    all_valid = script_validator->ValidateCleanAndSegmentInternal(
                    g_mode, grapheme, dest) &&
                all_valid;
  }
  return all_valid;
}
// Factory method: creates the Validator subclass that handles script.
// kNonVirama maps to ValidateGrapheme, kMyanmar to ValidateMyanmar, kKhmer to
// ValidateKhmer, and every other script to ValidateIndic.
std::unique_ptr<Validator> Validator::ScriptValidator(ViramaScript script,
                                                      bool report_errors) {
  std::unique_ptr<Validator> validator;
  if (script == ViramaScript::kNonVirama) {
    validator.reset(new ValidateGrapheme(script, report_errors));
  } else if (script == ViramaScript::kMyanmar) {
    validator.reset(new ValidateMyanmar(script, report_errors));
  } else if (script == ViramaScript::kKhmer) {
    validator.reset(new ValidateKhmer(script, report_errors));
  } else {
    validator.reset(new ValidateIndic(script, report_errors));
  }
  return validator;
}
// Instance-level worker behind the public static ValidateCleanAndSegment.
// Validates and cleans the unicodes in src according to this validator's type,
// and appends the result to *dest in the form selected by g_mode.
// On a validation error, returns false but still outputs as much of the input
// as possible.
bool Validator::ValidateCleanAndSegmentInternal(
    GraphemeNormMode g_mode, const std::vector<char32>& src,
    std::vector<std::vector<char32>>* dest) {
  Clear();
  ComputeClassCodes(src);
  bool all_valid = true;
  codes_used_ = 0;
  while (codes_used_ < codes_.size()) {
    if (ConsumeGraphemeIfValid()) {
      continue;
    }
    // Nothing valid starts here: record the failure and step past this code.
    all_valid = false;
    ++codes_used_;
  }
  MoveResultsToDest(g_mode, dest);
  return all_valid;
}
// Transfers the results held in parts_ or output_ to *dest, in the form
// selected by g_mode:
//   kIndividualUnicodes: each unicode of output_ becomes its own new vector.
//   kGlyphSplit: each element of parts_ is moved onto the end of dest.
//   kCombined: output_ becomes one new vector.
//   kSingleString: output_ is appended to the last vector of dest, or becomes
//     the first vector if dest is empty.
void Validator::MoveResultsToDest(GraphemeNormMode g_mode,
                                  std::vector<std::vector<char32>>* dest) {
  switch (g_mode) {
    case GraphemeNormMode::kIndividualUnicodes: {
      // One single-element vector per unicode of the combined output_.
      dest->reserve(dest->size() + output_.size());
      for (auto it = output_.begin(); it != output_.end(); ++it) {
        dest->push_back({*it});
      }
      return;
    }
    case GraphemeNormMode::kGlyphSplit: {
      // Hand over all the parts_ that were built.
      for (auto& part : parts_) {
        dest->push_back(std::move(part));
      }
      return;
    }
    default:
      break;
  }
  if (g_mode == GraphemeNormMode::kCombined || dest->empty()) {
    // The combined output_ becomes a new vector at the end of dest.
    dest->push_back(std::vector<char32>());
    dest->back().swap(output_);
  } else {
    // kSingleString with existing content: extend the last vector of dest
    // with the combined output_.
    std::vector<char32>& last = dest->back();
    last.insert(last.end(), output_.begin(), output_.end());
  }
}
// Ordering predicate that compares two pairs by their second member only.
// Returns true if p1's second member is strictly less than p2's.
bool CmpPairSecond(const std::pair<int, int>& p1,
                   const std::pair<int, int>& p2) {
  return p2.second > p1.second;
}
// Finds the virama-using script that occurs most often in utf32 and returns
// the matching ViramaScript, or kNonVirama if no such script is present.
/* static */
ViramaScript Validator::MostFrequentViramaScript(
    const std::vector<char32>& utf32) {
  // Counts of unicodes, keyed by codepage base (unicode / kIndicCodePageSize).
  std::unordered_map<int, int> histogram;
  // Myanmar is all over the unicode code space, so all of its unicodes are
  // counted under the base of the main Myanmar block, using the script id.
  const int myanmar_base =
      static_cast<char32>(ViramaScript::kMyanmar) / kIndicCodePageSize;
  for (const char32 ch : utf32) {
    IcuErrorCode err;
    const UScriptCode script_code = uscript_getScript(ch, err);
    if (script_code == USCRIPT_MYANMAR) {
      ++histogram[myanmar_base];
      continue;
    }
    // For the Indic scripts and Khmer, dividing by kIndicCodePageSize is
    // enough to identify the codepage.
    const bool in_virama_range =
        kMinIndicUnicode <= ch && ch <= kMaxViramaScriptUnicode;
    if (in_virama_range && script_code != USCRIPT_COMMON) {
      ++histogram[ch / kIndicCodePageSize];
    }
  }
  if (histogram.empty()) {
    return ViramaScript::kNonVirama;
  }
  const auto most_frequent =
      std::max_element(histogram.begin(), histogram.end(), CmpPairSecond);
  const char32 codebase =
      static_cast<char32>(most_frequent->first * kIndicCodePageSize);
  // Accept only a codebase that is the start of a known script.
  const bool is_known_script =
      codebase == static_cast<char32>(ViramaScript::kMyanmar) ||
      codebase == static_cast<char32>(ViramaScript::kKhmer) ||
      (static_cast<char32>(ViramaScript::kDevanagari) <= codebase &&
       codebase <= static_cast<char32>(ViramaScript::kSinhala));
  return is_known_script ? static_cast<ViramaScript>(codebase)
                         : ViramaScript::kNonVirama;
}
// Returns true if the given UTF-32 unicode is a "virama" character: one of
// the Sinhala, Myanmar or Khmer viramas, or the code at offset 0x4d of a
// 128-code page within the range kMinIndicUnicode..kMaxSinhalaUnicode.
/* static */
bool Validator::IsVirama(char32 unicode) {
  if (unicode == kSinhalaVirama || unicode == kMyanmarVirama ||
      unicode == kKhmerVirama) {
    return true;
  }
  const bool in_indic_range =
      kMinIndicUnicode <= unicode && unicode <= kMaxSinhalaUnicode;
  return in_indic_range && (unicode & 0x7f) == 0x4d;
}
// Returns true if the given UTF-32 unicode is a vedic accent, i.e. it lies in
// the range 0x1cd0..0x1cff inclusive.
/* static */
bool Validator::IsVedicAccent(char32 unicode) {
  return unicode >= 0x1cd0 && unicode <= 0x1cff;
}
// Returns true if script_ is one that uses subscripts for conjuncts:
// Telugu, Kannada, Myanmar or Khmer.
bool Validator::IsSubscriptScript() const {
  switch (script_) {
    case ViramaScript::kTelugu:
    case ViramaScript::kKannada:
    case ViramaScript::kMyanmar:
    case ViramaScript::kKhmer:
      return true;
    default:
      return false;
  }
}
void Validator::ComputeClassCodes(const std::vector<char32>& text) {
codes_.reserve(text.size());
for (char32 c : text) {
codes_.push_back(std::make_pair(UnicodeToCharClass(c), c));
}
}
// Returns the validator to its initial state: empties the input codes and
// both result containers, and zeroes the progress counters.
void Validator::Clear() {
  codes_used_ = 0;
  output_used_ = 0;
  output_.clear();
  parts_.clear();
  codes_.clear();
}
} // namespace tesseract
/**********************************************************************
* File: validator.h
* Description: Base class for various text validators. Intended mainly for
* scripts that use a virama character.
* Author: Ray Smith
* Created: Tue May 23 2017
*
* (C) Copyright 2017, Google Inc.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
**********************************************************************/
#ifndef TESSERACT_TRAINING_VALIDATOR_H_
#define TESSERACT_TRAINING_VALIDATOR_H_
#include <memory>
#include <vector>
#include "unichar.h"
namespace tesseract {
// Different kinds of grapheme normalization - not just for Indic!
// A grapheme is a syllable unit in Indic and can be several unicodes.
// In other scripts, a grapheme is a base character and accent/diacritic
// combination, as not all accented characters have a single composed form.
// The mode selects how finely the validated output is divided into vectors.
enum class GraphemeNormMode {
// Validation result is a single string, even if input is multi-word.
// (The output is appended to the last existing vector of the destination, or
// becomes its first vector if the destination is empty.)
kSingleString,
// Standard unicode graphemes are validated and output as grapheme units.
kCombined,
// Graphemes are validated and sub-divided. For virama-using scripts, units
// that correspond to repeatable glyphs are generated. (Mostly single unicodes
// but viramas and joiners are paired with the most sensible neighbor.)
// For non-virama scripts, this means that base/accent pairs are separated,
// ie the output is individual unicodes.
kGlyphSplit,
// The output is always single unicodes, regardless of the script.
kIndividualUnicodes,
};
// An enum representing the scripts that use a virama character. It is
// guaranteed that the value of any element, (except kNonVirama) can be cast
// to a unicode (char32) value that represents the start of the unicode range
// of the corresponding script.
enum class ViramaScript : char32 {
// Text that contains no virama-using script.
kNonVirama = 0,
// The Indic scripts, in 128-code steps from 0x900 to 0xd80.
kDevanagari = 0x900,
kBengali = 0x980,
kGurmukhi = 0xa00,
kGujarati = 0xa80,
kOriya = 0xb00,
kTamil = 0xb80,
kTelugu = 0xc00,
kKannada = 0xc80,
kMalayalam = 0xd00,
kSinhala = 0xd80,
// Scripts outside the 0x900-0xdff range, each with its own validator.
kMyanmar = 0x1000,
kKhmer = 0x1780,
};
// Base class offers a validation API and protected methods to allow subclasses
// to easily build the validated/segmented output.
// Subclasses provide the script-specific grammar by implementing
// ConsumeGraphemeIfValid and UnicodeToCharClass. The base class owns the
// shared working state (codes_, output_, parts_) and the helpers that fill it.
class Validator {
public:
// Validates and cleans the src vector of unicodes to the *dest, according to
// g_mode. In the case of kSingleString, a single vector containing the whole
// result is added to *dest. With kCombined, multiple vectors are added to
// *dest with one grapheme in each. With kGlyphSplit, multiple vectors are
// added to *dest with a smaller unit representing a glyph in each.
// In case of validation error, returns false and as much as possible of the
// input, without discarding invalid text.
static bool ValidateCleanAndSegment(GraphemeNormMode g_mode,
bool report_errors,
const std::vector<char32>& src,
std::vector<std::vector<char32>>* dest);
// Returns true if the unicode ch is a non-printing zero-width mark of no
// significance to OCR training or evaluation.
// Note that kInvalid is included, while the zero-width joiner and non-joiner
// are not.
static bool IsZeroWidthMark(char32 ch) {
return ch == kZeroWidthSpace || ch == kLeftToRightMark ||
ch == kRightToLeftMark || ch == kInvalid;
}
// Virtual so that subclasses are destroyed correctly through a Validator
// pointer, such as the one returned by ScriptValidator.
virtual ~Validator() {}
// Some specific but universally useful unicodes.
static const char32 kZeroWidthSpace;
static const char32 kZeroWidthNonJoiner;
static const char32 kZeroWidthJoiner;
static const char32 kLeftToRightMark;
static const char32 kRightToLeftMark;
static const char32 kInvalid;
protected:
// These are more or less the character class identifiers in the ISCII
// standard, section 8. They have been augmented with the Unicode meta
// characters Zero Width Joiner and Zero Width Non Joiner, and the
// Unicode Vedic Marks.
// The best sources of information on Unicode and Indic scripts are:
// http://varamozhi.sourceforge.net/iscii91.pdf
// http://www.unicode.org/versions/Unicode9.0.0/ch12.pdf
// http://unicode.org/faq/indic.html
// http://www.microsoft.com/typography/otfntdev/teluguot/shaping.aspx
enum class CharClass {
// NOTE: The values of the enum members are meaningless and arbitrary, ie
// they are not used for sorting, or any other risky application.
// The reason they are what they are is they are a single character
// abbreviation that can be used in a regexp/BNF definition of a grammar,
// IN A COMMENT, and still not relied upon in the code.
kConsonant = 'C',
kVowel = 'V',
kVirama = 'H', // (aka Halant)
kMatra = 'M', // (aka Dependent Vowel)
kMatraPiece = 'P', // unicode provides pieces of Matras.
kVowelModifier = 'D', // (candrabindu, anusvara, visarga, other marks)
kZeroWidthNonJoiner = 'z', // Unicode Zero Width Non-Joiner U+200C
kZeroWidthJoiner = 'Z', // Unicode Zero Width Joiner U+200D
kVedicMark = 'v', // Modifiers can come modify any indic syllable.
kNukta = 'N', // Occurs only immediately after consonants.
kRobat = 'R', // Khmer only.
kOther = 'O', // (digits, measures, non-Indic, etc)
// Additional classes used only by ValidateGrapheme.
kWhitespace = ' ',
kCombiner = 'c', // Combiners other than virama.
};
// A unicode together with the CharClass assigned to it: .first is the class,
// .second is the unicode.
typedef std::pair<CharClass, char32> IndicPair;
// Stores the script and error-reporting flag, and zeroes both progress
// counters. The containers start out empty.
Validator(ViramaScript script, bool report_errors)
: script_(script),
codes_used_(0),
output_used_(0),
report_errors_(report_errors) {}
// Factory method that understands how to map script to the right subclass.
static std::unique_ptr<Validator> ScriptValidator(ViramaScript script,
bool report_errors);
// Internal version of the public static ValidateCleanAndSegment.
// Validates and cleans the src vector of unicodes to the *dest, according to
// its type and the given g_mode.
// In case of validation error, returns false and returns as much as possible
// of the input, without discarding invalid text.
bool ValidateCleanAndSegmentInternal(GraphemeNormMode g_mode,
const std::vector<char32>& src,
std::vector<std::vector<char32>>* dest);
// Moves the results from parts_ or output_ to dest according to g_mode.
void MoveResultsToDest(GraphemeNormMode g_mode,
std::vector<std::vector<char32>>* dest);
// Computes and returns the ViramaScript corresponding to the most frequent
// virama-using script in the input, or kNonVirama if none are present.
static ViramaScript MostFrequentViramaScript(
const std::vector<char32>& utf32);
// Returns true if the given UTF-32 unicode is a "virama" character.
static bool IsVirama(char32 unicode);
// Returns true if the given UTF-32 unicode is a vedic accent.
static bool IsVedicAccent(char32 unicode);
// Returns true if the script is one that uses subscripts for conjuncts.
bool IsSubscriptScript() const;
// Helper function appends the next element of codes_ only to output_,
// without touching parts_
// Returns true at the end of codes_.
// The appended unicode stays "unused" (output_used_ is not advanced), so a
// later MultiCodePart/UseMultiCode call can group it with what follows.
bool CodeOnlyToOutput() {
output_.push_back(codes_[codes_used_].second);
return ++codes_used_ == codes_.size();
}
// Helper function adds a length-element vector to parts_ from the last length
// elements of output_. If there are more than length unused elements in
// output_, adds unicodes as single-element vectors to parts_ to catch
// output_used_ up to output->size() - length before adding the length-element
// vector.
// On return output_used_ equals output_.size(). The caller must ensure that
// output_ holds at least one unused element.
void MultiCodePart(int length) {
// Flush any surplus unused unicodes as single-element parts.
while (output_used_ + length < output_.size()) {
parts_.emplace_back(
std::initializer_list<char32>{output_[output_used_++]});
}
// Start the final part with the next unused unicode...
parts_.emplace_back(std::initializer_list<char32>{output_[output_used_]});
// ...and append the rest of output_ to that same part.
while (++output_used_ < output_.size()) {
parts_.back().push_back(output_[output_used_]);
}
}
// Helper function appends the next element of codes_ to output_, and then
// calls MultiCodePart to add the appropriate components to parts_.
// Returns true at the end of codes_.
bool UseMultiCode(int length) {
output_.push_back(codes_[codes_used_].second);
MultiCodePart(length);
return ++codes_used_ == codes_.size();
}
// Consumes the next Grapheme in codes_[codes_used_++...] and copies it to
// parts_ and output_. Returns true if a valid Grapheme was consumed,
// otherwise does not increment codes_used_.
virtual bool ConsumeGraphemeIfValid() = 0;
// Sets codes_ to the class codes for the given unicode text.
void ComputeClassCodes(const std::vector<char32>& text);
// Returns the CharClass corresponding to the given Unicode ch.
virtual CharClass UnicodeToCharClass(char32 ch) const = 0;
// Resets to the initial state.
void Clear();
// Number of unicodes in each Indic codepage.
static const int kIndicCodePageSize = 128;
// Lowest unicode value of any Indic script. (Devanagari).
static const char32 kMinIndicUnicode = 0x900;
// Highest unicode value of any consistent (ISCII-based) Indic script.
static const char32 kMaxSinhalaUnicode = 0xdff;
// Highest unicode value of any virama-using script. (Khmer).
static const char32 kMaxViramaScriptUnicode = 0x17ff;
// Some special unicodes.
static const char32 kSinhalaVirama = 0xdca;
static const char32 kMyanmarVirama = 0x1039;
static const char32 kKhmerVirama = 0x17d2;
// Script we are operating on.
ViramaScript script_;
// Input unicodes with assigned CharClass is the data to be validated.
std::vector<IndicPair> codes_;
// Glyph-like components of the input.
std::vector<std::vector<char32>> parts_;
// Copied validated unicodes from codes_ that are OK to output.
std::vector<char32> output_;
// The number of elements of codes_ that have been processed so far.
int codes_used_;
// The number of elements of output_ that have already been added to parts_.
int output_used_;
// Log error messages for reasons why text is invalid.
bool report_errors_;
};
} // namespace tesseract
#endif // TESSERACT_TRAINING_VALIDATOR_H_
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册