From b0ead95d64a3667339775b2f99ac37e97e90c2a0 Mon Sep 17 00:00:00 2001 From: Ray Smith Date: Mon, 24 Jul 2017 11:45:57 -0700 Subject: [PATCH] =?UTF-8?q?Changed=20the=20way=20unicharsets=20are=20handl?= =?UTF-8?q?ed=20to=20allow=20support=20for=20the=20=E2=84=A2=20character.?= =?UTF-8?q?=20Can=20find=20the=20issue=20where=20it=20was=20requested.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ccstruct/ratngs.cpp | 7 ++- ccutil/ambigs.cpp | 4 +- ccutil/unicharcompress.cpp | 5 +- ccutil/unicharmap.cpp | 72 ++++++----------------- ccutil/unicharmap.h | 9 --- ccutil/unicharset.cpp | 113 +++++++++++++++++++++++++++---------- ccutil/unicharset.h | 54 ++++++++++++++++-- lstm/lstmtrainer.cpp | 13 +++-- training/text2image.cpp | 12 ++-- 9 files changed, 177 insertions(+), 112 deletions(-) diff --git a/ccstruct/ratngs.cpp b/ccstruct/ratngs.cpp index 03ed873c..888c026c 100644 --- a/ccstruct/ratngs.cpp +++ b/ccstruct/ratngs.cpp @@ -24,6 +24,7 @@ #include "ratngs.h" +#include #include "blobs.h" #include "callcpp.h" #include "genericvector.h" @@ -200,10 +201,12 @@ WERD_CHOICE::WERD_CHOICE(const char *src_string, : unicharset_(&unicharset){ GenericVector encoding; GenericVector lengths; - if (unicharset.encode_string(src_string, true, &encoding, &lengths, NULL)) { + string cleaned = unicharset.CleanupString(src_string); + if (unicharset.encode_string(cleaned.c_str(), true, &encoding, &lengths, + NULL)) { lengths.push_back('\0'); STRING src_lengths = &lengths[0]; - this->init(src_string, src_lengths.string(), 0.0, 0.0, NO_PERM); + this->init(cleaned.c_str(), src_lengths.string(), 0.0, 0.0, NO_PERM); } else { // There must have been an invalid unichar in the string. this->init(8); this->make_bad(); diff --git a/ccutil/ambigs.cpp b/ccutil/ambigs.cpp index b940dea0..2db2d820 100644 --- a/ccutil/ambigs.cpp +++ b/ccutil/ambigs.cpp @@ -357,7 +357,7 @@ bool UnicharAmbigs::InsertIntoTable( // Insert the corresponding correct ngram into the unicharset. // Unicharset code assumes that the "base" ngram is inserted into // the unicharset before fragments of this ngram are inserted. - unicharset->unichar_insert(replacement_string); + unicharset->unichar_insert(replacement_string, OldUncleanUnichars::kTrue); ambig_spec->correct_ngram_id = unicharset->unichar_to_id(replacement_string); if (replacement_ambig_part_size > 1) { @@ -372,7 +372,7 @@ bool UnicharAmbigs::InsertIntoTable( } else { STRING frag_str = CHAR_FRAGMENT::to_string( replacement_string, i, test_ambig_part_size, false); - unicharset->unichar_insert(frag_str.string()); + unicharset->unichar_insert(frag_str.string(), OldUncleanUnichars::kTrue); unichar_id = unicharset->unichar_to_id(frag_str.string()); } ambig_spec->correct_fragments[i] = unichar_id; diff --git a/ccutil/unicharcompress.cpp b/ccutil/unicharcompress.cpp index 64b8f578..c030d566 100644 --- a/ccutil/unicharcompress.cpp +++ b/ccutil/unicharcompress.cpp @@ -117,7 +117,7 @@ bool UnicharCompress::ComputeEncoding(const UNICHARSET& unicharset, int null_id, direct_set.clear(); radicals.clear(); // Always keep space as 0; - direct_set.unichar_insert(" "); + direct_set.unichar_insert(" ", OldUncleanUnichars::kTrue); // Null char is next if we have one. if (null_id >= 0) { direct_set.unichar_insert(kNullChar); @@ -160,7 +160,8 @@ bool UnicharCompress::ComputeEncoding(const UNICHARSET& unicharset, int null_id, if (it != radical_map.end()) { // This is Han. Convert to radical, stroke, index. if (!radicals.contains_unichar(it->second.radical.string())) { - radicals.unichar_insert(it->second.radical.string()); + radicals.unichar_insert(it->second.radical.string(), + OldUncleanUnichars::kTrue); } int radical = radicals.unichar_to_id(it->second.radical.string()); int num_strokes = it->second.num_strokes; diff --git a/ccutil/unicharmap.cpp b/ccutil/unicharmap.cpp index 6b1bb1d6..b13acdc3 100644 --- a/ccutil/unicharmap.cpp +++ b/ccutil/unicharmap.cpp @@ -31,41 +31,24 @@ UNICHARMAP::~UNICHARMAP() { delete[] nodes; } -// Search the given unichar representation in the tree. Each character in the -// string is interpreted as an index in an array of nodes. -UNICHAR_ID UNICHARMAP::unichar_to_id(const char* const unichar_repr) const { - const char* current_char = unichar_repr; - UNICHARMAP_NODE* current_nodes = nodes; - - assert(*unichar_repr != '\0'); - - do { - if (*(current_char + 1) == '\0') - return current_nodes[static_cast(*current_char)].id; - current_nodes = - current_nodes[static_cast(*current_char)].children; - ++current_char; - } while (true); -} - // Search the given unichar representation in the tree, using length characters // from it maximum. Each character in the string is interpreted as an index in // an array of nodes. UNICHAR_ID UNICHARMAP::unichar_to_id(const char* const unichar_repr, int length) const { - const char* current_char = unichar_repr; UNICHARMAP_NODE* current_nodes = nodes; assert(*unichar_repr != '\0'); assert(length > 0 && length <= UNICHAR_LEN); + int index = 0; + if (index >= length || unichar_repr[index] == '\0') return INVALID_UNICHAR_ID; do { - if (length == 1 || *(current_char + 1) == '\0') - return current_nodes[static_cast(*current_char)].id; + if (index + 1 >= length || unichar_repr[index + 1] == '\0') + return current_nodes[static_cast(unichar_repr[index])].id; current_nodes = - current_nodes[static_cast(*current_char)].children; - ++current_char; - --length; + current_nodes[static_cast(unichar_repr[index])].children; + ++index; } while (true); } @@ -75,15 +58,12 @@ UNICHAR_ID UNICHARMAP::unichar_to_id(const char* const unichar_repr, // string is interpreted as an index in an array of nodes. void UNICHARMAP::insert(const char* const unichar_repr, UNICHAR_ID id) { const char* current_char = unichar_repr; + if (*current_char == '\0') return; UNICHARMAP_NODE** current_nodes_pointer = &nodes; - - assert(*unichar_repr != '\0'); - assert(id >= 0); - do { if (*current_nodes_pointer == 0) *current_nodes_pointer = new UNICHARMAP_NODE[256]; - if (*(current_char + 1) == '\0') { + if (current_char[1] == '\0') { (*current_nodes_pointer) [static_cast(*current_char)].id = id; return; @@ -95,24 +75,6 @@ void UNICHARMAP::insert(const char* const unichar_repr, UNICHAR_ID id) { } while (true); } -// Search the given unichar representation in the tree. Each character in the -// string is interpreted as an index in an array of nodes. Stop once the tree -// does not have anymore nodes or once we found the right unichar_repr. -bool UNICHARMAP::contains(const char* const unichar_repr) const { - if (unichar_repr == NULL || *unichar_repr == '\0') return false; - - const char* current_char = unichar_repr; - UNICHARMAP_NODE* current_nodes = nodes; - - while (current_nodes != 0 && *(current_char + 1) != '\0') { - current_nodes = - current_nodes[static_cast(*current_char)].children; - ++current_char; - } - return current_nodes != 0 && *(current_char + 1) == '\0' && - current_nodes[static_cast(*current_char)].id >= 0; -} - // Search the given unichar representation in the tree, using length characters // from it maximum. Each character in the string is interpreted as an index in // an array of nodes. Stop once the tree does not have anymore nodes or once we @@ -121,24 +83,26 @@ bool UNICHARMAP::contains(const char* const unichar_repr, int length) const { if (unichar_repr == NULL || *unichar_repr == '\0') return false; if (length <= 0 || length > UNICHAR_LEN) return false; - - const char* current_char = unichar_repr; + int index = 0; + if (index >= length || unichar_repr[index] == '\0') return false; UNICHARMAP_NODE* current_nodes = nodes; - while (current_nodes != 0 && (length > 1 && *(current_char + 1) != '\0')) { + while (current_nodes != 0 && index + 1 < length && + unichar_repr[index + 1] != '\0') { current_nodes = - current_nodes[static_cast(*current_char)].children; - --length; - ++current_char; + current_nodes[static_cast(unichar_repr[index])].children; + ++index; } - return current_nodes != 0 && (length == 1 || *(current_char + 1) == '\0') && - current_nodes[static_cast(*current_char)].id >= 0; + return current_nodes != 0 && + (index + 1 >= length || unichar_repr[index + 1] == '\0') && + current_nodes[static_cast(unichar_repr[index])].id >= 0; } // Return the minimum number of characters that must be used from this string // to obtain a match in the UNICHARMAP. int UNICHARMAP::minmatch(const char* const unichar_repr) const { const char* current_char = unichar_repr; + if (*current_char == '\0') return 0; UNICHARMAP_NODE* current_nodes = nodes; while (current_nodes != NULL && *current_char != '\0') { diff --git a/ccutil/unicharmap.h b/ccutil/unicharmap.h index ecc4065e..45170c4f 100644 --- a/ccutil/unicharmap.h +++ b/ccutil/unicharmap.h @@ -36,21 +36,12 @@ class UNICHARMAP { // with the given id. The length of the representation MUST be non-zero. void insert(const char* const unichar_repr, UNICHAR_ID id); - // Return the id associated with the given unichar representation, - // this representation MUST exist within the UNICHARMAP. - // The length of the representation MUST be non-zero. - UNICHAR_ID unichar_to_id(const char* const unichar_repr) const; - // Return the id associated with the given unichar representation, // this representation MUST exist within the UNICHARMAP. The first // length characters (maximum) from unichar_repr are used. The length // MUST be non-zero. UNICHAR_ID unichar_to_id(const char* const unichar_repr, int length) const; - // Return true if the given unichar representation is already present in the - // UNICHARMAP. The length of the representation MUST be non-zero. - bool contains(const char* const unichar_repr) const; - // Return true if the given unichar representation is already present in the // UNICHARMAP. The first length characters (maximum) from unichar_repr are // used. The length MUST be non-zero. diff --git a/ccutil/unicharset.cpp b/ccutil/unicharset.cpp index aa87c127..bd904b3f 100644 --- a/ccutil/unicharset.cpp +++ b/ccutil/unicharset.cpp @@ -67,6 +67,15 @@ const char* UNICHARSET::kCustomLigatures[][2] = { {NULL, NULL} }; +// List of mappings to make when ingesting strings from the outside. +// The substitutions clean up text that should exist for rendering of +// synthetic data, but not in the recognition set. +const char* UNICHARSET::kCleanupMaps[][2] = { + {"\u0640", ""}, // TATWEEL is deleted. + {"\ufb01", "fi"}, // fi ligature->fi pair. + {"\ufb02", "fl"}, // fl ligature->fl pair. + {nullptr, nullptr}}; + // List of strings for the SpecialUnicharCodes. Keep in sync with the enum. const char* UNICHARSET::kSpecialUnicharCodes[SPECIAL_UNICHAR_CODES_COUNT] = { " ", @@ -196,15 +205,21 @@ void UNICHARSET::reserve(int unichars_number) { UNICHAR_ID UNICHARSET::unichar_to_id(const char* const unichar_repr) const { - return ids.contains(unichar_repr) ? - ids.unichar_to_id(unichar_repr) : INVALID_UNICHAR_ID; + string cleaned = + old_style_included_ ? unichar_repr : CleanupString(unichar_repr); + return ids.contains(cleaned.data(), cleaned.size()) + ? ids.unichar_to_id(cleaned.data(), cleaned.size()) + : INVALID_UNICHAR_ID; } UNICHAR_ID UNICHARSET::unichar_to_id(const char* const unichar_repr, int length) const { assert(length > 0 && length <= UNICHAR_LEN); - return ids.contains(unichar_repr, length) ? - ids.unichar_to_id(unichar_repr, length) : INVALID_UNICHAR_ID; + string cleaned(unichar_repr, length); + if (!old_style_included_) cleaned = CleanupString(unichar_repr, length); + return ids.contains(cleaned.data(), cleaned.size()) + ? ids.unichar_to_id(cleaned.data(), cleaned.size()) + : INVALID_UNICHAR_ID; } // Return the minimum number of bytes that matches a legal UNICHAR_ID, @@ -235,6 +250,9 @@ bool UNICHARSET::encodable_string(const char *str, // the rest of the string is still encoded. // If lengths is not NULL, then it is filled with the corresponding // byte length of each encoded UNICHAR_ID. +// WARNING: Caller must guarantee that str has already been cleaned of codes +// that do not belong in the unicharset, or encoding may fail. +// Use CleanupString to perform the cleaning. bool UNICHARSET::encode_string(const char* str, bool give_up_on_failure, GenericVector* encoding, GenericVector* lengths, @@ -429,7 +447,7 @@ void UNICHARSET::CopyFrom(const UNICHARSET& src) { for (int ch = 0; ch < src.size_used; ++ch) { const UNICHAR_PROPERTIES& src_props = src.unichars[ch].properties; const char* utf8 = src.id_to_unichar(ch); - unichar_insert(utf8); + unichar_insert_backwards_compatible(utf8); unichars[ch].properties.ExpandRangesFrom(src_props); } // Set properties, including mirror and other_case, WITHOUT reordering @@ -445,24 +463,13 @@ void UNICHARSET::AppendOtherUnicharset(const UNICHARSET& src) { for (int ch = 0; ch < src.size_used; ++ch) { const UNICHAR_PROPERTIES& src_props = src.unichars[ch].properties; const char* utf8 = src.id_to_unichar(ch); - if (ch >= SPECIAL_UNICHAR_CODES_COUNT && src_props.AnyRangeEmpty()) { - // Only use fully valid entries. - tprintf("Bad properties for index %d, char %s: " - "%d,%d %d,%d %g,%g %g,%g %g,%g\n", - ch, utf8, src_props.min_bottom, src_props.max_bottom, - src_props.min_top, src_props.max_top, - src_props.width, src_props.width_sd, - src_props.bearing, src_props.bearing_sd, - src_props.advance, src_props.advance_sd); - continue; - } int id = size_used; if (contains_unichar(utf8)) { id = unichar_to_id(utf8); // Just expand current ranges. unichars[id].properties.ExpandRangesFrom(src_props); } else { - unichar_insert(utf8); + unichar_insert_backwards_compatible(utf8); unichars[id].properties.SetRangesEmpty(); } } @@ -613,40 +620,55 @@ char UNICHARSET::get_chartype(UNICHAR_ID id) const { return 0; } -void UNICHARSET::unichar_insert(const char* const unichar_repr) { - if (!ids.contains(unichar_repr)) { - if (strlen(unichar_repr) > UNICHAR_LEN) { - fprintf(stderr, "Utf8 buffer too big, size=%d for %s\n", - int(strlen(unichar_repr)), unichar_repr); +void UNICHARSET::unichar_insert(const char* const unichar_repr, + OldUncleanUnichars old_style) { + if (old_style == OldUncleanUnichars::kTrue) old_style_included_ = true; + string cleaned = + old_style_included_ ? unichar_repr : CleanupString(unichar_repr); + if (!cleaned.empty() && !ids.contains(cleaned.data(), cleaned.size())) { + const char* str = cleaned.c_str(); + GenericVector encoding; + if (!old_style_included_ && + encode_string(str, true, &encoding, nullptr, nullptr)) return; - } if (size_used == size_reserved) { if (size_used == 0) reserve(8); else reserve(2 * size_used); } - - strcpy(unichars[size_used].representation, unichar_repr); + int index = 0; + do { + if (index > UNICHAR_LEN) { + fprintf(stderr, "Utf8 buffer too big, size>%d for %s\n", UNICHAR_LEN, + unichar_repr); + return; + } + unichars[size_used].representation[index++] = *str++; + } while (*str != '\0'); + unichars[size_used].representation[index] = '\0'; this->set_script(size_used, null_script); // If the given unichar_repr represents a fragmented character, set // fragment property to a pointer to CHAR_FRAGMENT class instance with // information parsed from the unichar representation. Use the script // of the base unichar for the fragmented character if possible. - CHAR_FRAGMENT *frag = CHAR_FRAGMENT::parse_from_string(unichar_repr); + CHAR_FRAGMENT* frag = + CHAR_FRAGMENT::parse_from_string(unichars[size_used].representation); this->unichars[size_used].properties.fragment = frag; if (frag != NULL && this->contains_unichar(frag->get_unichar())) { this->unichars[size_used].properties.script_id = this->get_script(frag->get_unichar()); } this->unichars[size_used].properties.enabled = true; - ids.insert(unichar_repr, size_used); + ids.insert(unichars[size_used].representation, size_used); ++size_used; } } bool UNICHARSET::contains_unichar(const char* const unichar_repr) const { - return ids.contains(unichar_repr); + string cleaned = + old_style_included_ ? unichar_repr : CleanupString(unichar_repr); + return ids.contains(cleaned.data(), cleaned.size()); } bool UNICHARSET::contains_unichar(const char* const unichar_repr, @@ -654,7 +676,9 @@ bool UNICHARSET::contains_unichar(const char* const unichar_repr, if (length == 0) { return false; } - return ids.contains(unichar_repr, length); + string cleaned(unichar_repr, length); + if (!old_style_included_) cleaned = CleanupString(unichar_repr, length); + return ids.contains(cleaned.data(), cleaned.size()); } bool UNICHARSET::eq(UNICHAR_ID unichar_id, @@ -840,7 +864,7 @@ bool UNICHARSET::load_via_fgets( if (strcmp(unichar, "NULL") == 0) this->unichar_insert(" "); else - this->unichar_insert(unichar); + this->unichar_insert_backwards_compatible(unichar); this->set_isalpha(id, properties & ISALPHA_MASK); this->set_islower(id, properties & ISLOWER_MASK); @@ -1088,3 +1112,32 @@ int UNICHARSET::get_script_id_from_name(const char* script_name) const { } return 0; // 0 is always the null_script } + +// Removes/replaces content that belongs in rendered text, but not in the +// unicharset. +/* static */ +string UNICHARSET::CleanupString(const char* utf8_str, int length) { + string result; + result.reserve(length); + char ch; + while ((ch = *utf8_str) != '\0' && --length >= 0) { + int key_index = 0; + const char* key; + while ((key = kCleanupMaps[key_index][0]) != nullptr) { + int match = 0; + while (key[match] != '\0' && key[match] == utf8_str[match]) ++match; + if (key[match] == '\0') { + utf8_str += match; + break; + } + ++key_index; + } + if (key == nullptr) { + result.push_back(ch); + ++utf8_str; + } else { + result.append(kCleanupMaps[key_index][1]); + } + } + return result; +} diff --git a/ccutil/unicharset.h b/ccutil/unicharset.h index a2e4e3b7..767c0de8 100644 --- a/ccutil/unicharset.h +++ b/ccutil/unicharset.h @@ -39,6 +39,13 @@ enum SpecialUnicharCodes { SPECIAL_UNICHAR_CODES_COUNT }; +// Boolean flag for unichar_insert. It's a bit of a double negative to allow +// the default value to be false. +enum class OldUncleanUnichars { + kFalse, + kTrue, +}; + class CHAR_FRAGMENT { public: // Minimum number of characters used for fragment representation. @@ -190,7 +197,7 @@ class UNICHARSET { // Use encode_string in preference to repeatedly calling step. int step(const char* str) const; - // Return whether the given UTF-8 string is encodable with this UNICHARSET. + // Returns true if the given UTF-8 string is encodable with this UNICHARSET. // If not encodable, write the first byte offset which cannot be converted // into the second (return) argument. bool encodable_string(const char *str, int *first_bad_position) const; @@ -207,6 +214,9 @@ class UNICHARSET { // If encoded_length is not NULL then on return it contains the length of // str that was encoded. (if give_up_on_failure the location of the first // failure, otherwise strlen(str).) + // WARNING: Caller must guarantee that str has already been cleaned of codes + // that do not belong in the unicharset, or encoding may fail. + // Use CleanupString to perform the cleaning. bool encode_string(const char* str, bool give_up_on_failure, GenericVector* encoding, GenericVector* lengths, @@ -226,6 +236,13 @@ class UNICHARSET { // by its hex unicodes. static STRING debug_utf8_str(const char* str); + // Removes/replaces content that belongs in rendered text, but not in the + // unicharset. + static string CleanupString(const char* utf8_str) { + return CleanupString(utf8_str, strlen(utf8_str)); + } + static string CleanupString(const char* utf8_str, int length); + // Return a STRING containing debug information on the unichar, including // the id_to_unichar, its hex unicodes and the properties. STRING debug_str(UNICHAR_ID id) const; @@ -233,8 +250,29 @@ class UNICHARSET { return debug_str(unichar_to_id(unichar_repr)); } - // Add a unichar representation to the set. - void unichar_insert(const char* const unichar_repr); + // Adds a unichar representation to the set. If old_style is true, then + // TATWEEL characters are kept and n-grams are allowed. Otherwise TATWEEL + // characters are ignored/skipped as if they don't exist and n-grams that + // can already be encoded are not added. + void unichar_insert(const char* const unichar_repr, + OldUncleanUnichars old_style); + void unichar_insert(const char* const unichar_repr) { + unichar_insert(unichar_repr, OldUncleanUnichars::kFalse); + } + // Adds a unichar representation to the set. Avoids setting old_style to true, + // unless it is necessary to make the new unichar get added. + void unichar_insert_backwards_compatible(const char* const unichar_repr) { + string cleaned = CleanupString(unichar_repr); + if (cleaned != unichar_repr) { + unichar_insert(unichar_repr, OldUncleanUnichars::kTrue); + } else { + int old_size = size(); + unichar_insert(unichar_repr, OldUncleanUnichars::kFalse); + if (size() == old_size) { + unichar_insert(unichar_repr, OldUncleanUnichars::kTrue); + } + } + } // Return true if the given unichar id exists within the set. // Relies on the fact that unichar ids are contiguous in the unicharset. @@ -282,6 +320,7 @@ class UNICHARSET { top_bottom_set_ = false; script_has_upper_lower_ = false; script_has_xheight_ = false; + old_style_included_ = false; null_sid_ = 0; common_sid_ = 0; latin_sid_ = 0; @@ -743,7 +782,7 @@ class UNICHARSET { // unichar representation represents a character fragment. const CHAR_FRAGMENT *get_fragment(const char* const unichar_repr) const { if (unichar_repr == NULL || unichar_repr[0] == '\0' || - !ids.contains(unichar_repr)) { + !ids.contains(unichar_repr, false)) { return NULL; } return get_fragment(unichar_to_id(unichar_repr)); @@ -965,6 +1004,11 @@ class UNICHARSET { bool load_via_fgets(TessResultCallback2 *fgets_cb, bool skip_fragments); + // List of mappings to make when ingesting strings from the outside. + // The substitutions clean up text that should exists for rendering of + // synthetic data, but not in the recognition set. + static const char* kCleanupMaps[][2]; + UNICHAR_SLOT* unichars; UNICHARMAP ids; int size_used; @@ -980,6 +1024,8 @@ class UNICHARSET { // True if the unicharset has a significant mean-line with significant // ascenders above that. bool script_has_xheight_; + // True if the set contains chars that would be changed by the cleanup. + bool old_style_included_; // A few convenient script name-to-id mapping without using hash. // These are initialized when unicharset file is loaded. Anything diff --git a/lstm/lstmtrainer.cpp b/lstm/lstmtrainer.cpp index e9722d64..f13b278a 100644 --- a/lstm/lstmtrainer.cpp +++ b/lstm/lstmtrainer.cpp @@ -170,6 +170,7 @@ bool LSTMTrainer::InitNetwork(const STRING& network_spec, int append_index, tprintf("Training parameters:\n Debug interval = %d," " weights = %g, learning rate = %g, momentum=%g\n", debug_interval_, weight_range_, learning_rate_, momentum_); + tprintf("null char=%d\n", null_char_); return true; } @@ -733,7 +734,8 @@ bool LSTMTrainer::EncodeString(const STRING& str, const UNICHARSET& unicharset, GenericVector internal_labels; labels->truncate(0); if (!simple_text) labels->push_back(null_char); - if (unicharset.encode_string(str.string(), true, &internal_labels, NULL, + string cleaned = unicharset.CleanupString(str.string()); + if (unicharset.encode_string(cleaned.c_str(), true, &internal_labels, NULL, &err_index)) { bool success = true; for (int i = 0; i < internal_labels.size(); ++i) { @@ -759,8 +761,8 @@ bool LSTMTrainer::EncodeString(const STRING& str, const UNICHARSET& unicharset, if (success) return true; } tprintf("Encoding of string failed! Failure bytes:"); - while (err_index < str.length()) { - tprintf(" %x", str[err_index++]); + while (err_index < cleaned.size()) { + tprintf(" %x", cleaned[err_index++]); } tprintf("\n"); return false; @@ -813,8 +815,9 @@ Trainability LSTMTrainer::PrepareForBackward(const ImageData* trainingdata, training_iteration() % debug_interval_ == 0; GenericVector truth_labels; if (!EncodeString(trainingdata->transcription(), &truth_labels)) { - tprintf("Can't encode transcription: %s\n", - trainingdata->transcription().string()); + tprintf("Can't encode transcription: '%s' in language '%s'\n", + trainingdata->transcription().string(), + trainingdata->language().string()); return UNENCODABLE; } int w = 0; diff --git a/training/text2image.cpp b/training/text2image.cpp index c3438060..0858d480 100644 --- a/training/text2image.cpp +++ b/training/text2image.cpp @@ -409,9 +409,7 @@ using tesseract::SpanUTF8NotWhitespace; using tesseract::SpanUTF8Whitespace; using tesseract::StringRenderer; -int main(int argc, char** argv) { - tesseract::ParseCommandLineFlags(argv[0], &argc, &argv, true); - +int Main() { if (FLAGS_list_available_fonts) { const std::vector& all_fonts = FontUtils::ListAvailableFonts(); for (unsigned int i = 0; i < all_fonts.size(); ++i) { @@ -543,8 +541,9 @@ int main(int argc, char** argv) { const char *curr_pos = str8 + offsets[i].first; int ngram_len = offsets[i].second; // Skip words that contain characters not in found in unicharset. + string cleaned = UNICHARSET::CleanupString(curr_pos, ngram_len); if (!FLAGS_unicharset_file.empty() && - !unicharset.encodable_string(curr_pos, nullptr)) { + !unicharset.encodable_string(cleaned.c_str(), nullptr)) { continue; } rand_utf8.append(curr_pos, ngram_len); @@ -665,3 +664,8 @@ int main(int argc, char** argv) { return 0; } + +int main(int argc, char** argv) { + tesseract::ParseCommandLineFlags(argv[0], &argc, &argv, true); + Main(); +} -- GitLab