提交 b0ead95d 编写于 作者: R Ray Smith

Changed the way unicharsets are handled to allow support for the ™ character....

Changed the way unicharsets are handled to allow support for the ™ character. Can't find the issue where it was requested.
上级 4efc539f
......@@ -24,6 +24,7 @@
#include "ratngs.h"
#include <string>
#include "blobs.h"
#include "callcpp.h"
#include "genericvector.h"
......@@ -200,10 +201,12 @@ WERD_CHOICE::WERD_CHOICE(const char *src_string,
: unicharset_(&unicharset){
GenericVector<UNICHAR_ID> encoding;
GenericVector<char> lengths;
if (unicharset.encode_string(src_string, true, &encoding, &lengths, NULL)) {
string cleaned = unicharset.CleanupString(src_string);
if (unicharset.encode_string(cleaned.c_str(), true, &encoding, &lengths,
NULL)) {
lengths.push_back('\0');
STRING src_lengths = &lengths[0];
this->init(src_string, src_lengths.string(), 0.0, 0.0, NO_PERM);
this->init(cleaned.c_str(), src_lengths.string(), 0.0, 0.0, NO_PERM);
} else { // There must have been an invalid unichar in the string.
this->init(8);
this->make_bad();
......
......@@ -357,7 +357,7 @@ bool UnicharAmbigs::InsertIntoTable(
// Insert the corresponding correct ngram into the unicharset.
// Unicharset code assumes that the "base" ngram is inserted into
// the unicharset before fragments of this ngram are inserted.
unicharset->unichar_insert(replacement_string);
unicharset->unichar_insert(replacement_string, OldUncleanUnichars::kTrue);
ambig_spec->correct_ngram_id =
unicharset->unichar_to_id(replacement_string);
if (replacement_ambig_part_size > 1) {
......@@ -372,7 +372,7 @@ bool UnicharAmbigs::InsertIntoTable(
} else {
STRING frag_str = CHAR_FRAGMENT::to_string(
replacement_string, i, test_ambig_part_size, false);
unicharset->unichar_insert(frag_str.string());
unicharset->unichar_insert(frag_str.string(), OldUncleanUnichars::kTrue);
unichar_id = unicharset->unichar_to_id(frag_str.string());
}
ambig_spec->correct_fragments[i] = unichar_id;
......
......@@ -117,7 +117,7 @@ bool UnicharCompress::ComputeEncoding(const UNICHARSET& unicharset, int null_id,
direct_set.clear();
radicals.clear();
// Always keep space as 0;
direct_set.unichar_insert(" ");
direct_set.unichar_insert(" ", OldUncleanUnichars::kTrue);
// Null char is next if we have one.
if (null_id >= 0) {
direct_set.unichar_insert(kNullChar);
......@@ -160,7 +160,8 @@ bool UnicharCompress::ComputeEncoding(const UNICHARSET& unicharset, int null_id,
if (it != radical_map.end()) {
// This is Han. Convert to radical, stroke, index.
if (!radicals.contains_unichar(it->second.radical.string())) {
radicals.unichar_insert(it->second.radical.string());
radicals.unichar_insert(it->second.radical.string(),
OldUncleanUnichars::kTrue);
}
int radical = radicals.unichar_to_id(it->second.radical.string());
int num_strokes = it->second.num_strokes;
......
......@@ -31,41 +31,24 @@ UNICHARMAP::~UNICHARMAP() {
delete[] nodes;
}
// Looks up the id stored for unichar_repr by walking the node tree, using each
// byte of the string as an index into the node array of the current level.
// The string must be non-empty (asserted); no check is made here that the
// intermediate levels exist, so the representation has to be present.
UNICHAR_ID UNICHARMAP::unichar_to_id(const char* const unichar_repr) const {
  assert(*unichar_repr != '\0');
  UNICHARMAP_NODE* level = nodes;
  const char* pos = unichar_repr;
  // Descend one level per byte until pos rests on the final byte.
  for (; pos[1] != '\0'; ++pos) {
    level = level[static_cast<unsigned char>(*pos)].children;
  }
  // The id lives in the slot selected by the last byte.
  return level[static_cast<unsigned char>(*pos)].id;
}
// Search the given unichar representation in the tree, using at most length
// characters from it. Each character in the string is interpreted as an index
// in an array of nodes.
UNICHAR_ID UNICHARMAP::unichar_to_id(const char* const unichar_repr,
int length) const {
const char* current_char = unichar_repr;
UNICHARMAP_NODE* current_nodes = nodes;
assert(*unichar_repr != '\0');
assert(length > 0 && length <= UNICHAR_LEN);
int index = 0;
if (index >= length || unichar_repr[index] == '\0') return INVALID_UNICHAR_ID;
do {
if (length == 1 || *(current_char + 1) == '\0')
return current_nodes[static_cast<unsigned char>(*current_char)].id;
if (index + 1 >= length || unichar_repr[index + 1] == '\0')
return current_nodes[static_cast<unsigned char>(unichar_repr[index])].id;
current_nodes =
current_nodes[static_cast<unsigned char>(*current_char)].children;
++current_char;
--length;
current_nodes[static_cast<unsigned char>(unichar_repr[index])].children;
++index;
} while (true);
}
......@@ -75,15 +58,12 @@ UNICHAR_ID UNICHARMAP::unichar_to_id(const char* const unichar_repr,
// string is interpreted as an index in an array of nodes.
void UNICHARMAP::insert(const char* const unichar_repr, UNICHAR_ID id) {
const char* current_char = unichar_repr;
if (*current_char == '\0') return;
UNICHARMAP_NODE** current_nodes_pointer = &nodes;
assert(*unichar_repr != '\0');
assert(id >= 0);
do {
if (*current_nodes_pointer == 0)
*current_nodes_pointer = new UNICHARMAP_NODE[256];
if (*(current_char + 1) == '\0') {
if (current_char[1] == '\0') {
(*current_nodes_pointer)
[static_cast<unsigned char>(*current_char)].id = id;
return;
......@@ -95,24 +75,6 @@ void UNICHARMAP::insert(const char* const unichar_repr, UNICHAR_ID id) {
} while (true);
}
// Search the given unichar representation in the tree. Each character in the
// string is interpreted as an index in an array of nodes. Stop once the tree
// does not have anymore nodes or once we found the right unichar_repr.
bool UNICHARMAP::contains(const char* const unichar_repr) const {
if (unichar_repr == NULL || *unichar_repr == '\0') return false;
const char* current_char = unichar_repr;
UNICHARMAP_NODE* current_nodes = nodes;
while (current_nodes != 0 && *(current_char + 1) != '\0') {
current_nodes =
current_nodes[static_cast<unsigned char>(*current_char)].children;
++current_char;
}
return current_nodes != 0 && *(current_char + 1) == '\0' &&
current_nodes[static_cast<unsigned char>(*current_char)].id >= 0;
}
// Search the given unichar representation in the tree, using length characters
// from it maximum. Each character in the string is interpreted as an index in
// an array of nodes. Stop once the tree does not have anymore nodes or once we
......@@ -121,24 +83,26 @@ bool UNICHARMAP::contains(const char* const unichar_repr,
int length) const {
if (unichar_repr == NULL || *unichar_repr == '\0') return false;
if (length <= 0 || length > UNICHAR_LEN) return false;
const char* current_char = unichar_repr;
int index = 0;
if (index >= length || unichar_repr[index] == '\0') return false;
UNICHARMAP_NODE* current_nodes = nodes;
while (current_nodes != 0 && (length > 1 && *(current_char + 1) != '\0')) {
while (current_nodes != 0 && index + 1 < length &&
unichar_repr[index + 1] != '\0') {
current_nodes =
current_nodes[static_cast<unsigned char>(*current_char)].children;
--length;
++current_char;
current_nodes[static_cast<unsigned char>(unichar_repr[index])].children;
++index;
}
return current_nodes != 0 && (length == 1 || *(current_char + 1) == '\0') &&
current_nodes[static_cast<unsigned char>(*current_char)].id >= 0;
return current_nodes != 0 &&
(index + 1 >= length || unichar_repr[index + 1] == '\0') &&
current_nodes[static_cast<unsigned char>(unichar_repr[index])].id >= 0;
}
// Return the minimum number of characters that must be used from this string
// to obtain a match in the UNICHARMAP.
int UNICHARMAP::minmatch(const char* const unichar_repr) const {
const char* current_char = unichar_repr;
if (*current_char == '\0') return 0;
UNICHARMAP_NODE* current_nodes = nodes;
while (current_nodes != NULL && *current_char != '\0') {
......
......@@ -36,21 +36,12 @@ class UNICHARMAP {
// with the given id. The length of the representation MUST be non-zero.
void insert(const char* const unichar_repr, UNICHAR_ID id);
// Return the id associated with the given unichar representation,
// this representation MUST exist within the UNICHARMAP.
// The length of the representation MUST be non-zero.
UNICHAR_ID unichar_to_id(const char* const unichar_repr) const;
// Return the id associated with the given unichar representation,
// this representation MUST exist within the UNICHARMAP. The first
// length characters (maximum) from unichar_repr are used. The length
// MUST be non-zero.
UNICHAR_ID unichar_to_id(const char* const unichar_repr, int length) const;
// Return true if the given unichar representation is already present in the
// UNICHARMAP. The length of the representation MUST be non-zero.
bool contains(const char* const unichar_repr) const;
// Return true if the given unichar representation is already present in the
// UNICHARMAP. The first length characters (maximum) from unichar_repr are
// used. The length MUST be non-zero.
......
......@@ -67,6 +67,15 @@ const char* UNICHARSET::kCustomLigatures[][2] = {
{NULL, NULL}
};
// List of mappings to make when ingesting strings from the outside.
// The substitutions clean up text that should exist for rendering of
// synthetic data, but not in the recognition set.
// Each row is {key, replacement}; the table ends with a {nullptr, nullptr} row.
// The keys are written as explicit UTF-8 byte sequences rather than \uXXXX
// escapes: a universal character name in a narrow string literal is encoded
// in the compiler's execution character set, which is not UTF-8 everywhere,
// whereas the matching code compares raw UTF-8 bytes.
const char* UNICHARSET::kCleanupMaps[][2] = {
    {"\xd9\x80", ""},        // U+0640 TATWEEL is deleted.
    {"\xef\xac\x81", "fi"},  // U+FB01 fi ligature->fi pair.
    {"\xef\xac\x82", "fl"},  // U+FB02 fl ligature->fl pair.
    {nullptr, nullptr}};
// List of strings for the SpecialUnicharCodes. Keep in sync with the enum.
const char* UNICHARSET::kSpecialUnicharCodes[SPECIAL_UNICHAR_CODES_COUNT] = {
" ",
......@@ -196,15 +205,21 @@ void UNICHARSET::reserve(int unichars_number) {
UNICHAR_ID
UNICHARSET::unichar_to_id(const char* const unichar_repr) const {
return ids.contains(unichar_repr) ?
ids.unichar_to_id(unichar_repr) : INVALID_UNICHAR_ID;
string cleaned =
old_style_included_ ? unichar_repr : CleanupString(unichar_repr);
return ids.contains(cleaned.data(), cleaned.size())
? ids.unichar_to_id(cleaned.data(), cleaned.size())
: INVALID_UNICHAR_ID;
}
UNICHAR_ID UNICHARSET::unichar_to_id(const char* const unichar_repr,
int length) const {
assert(length > 0 && length <= UNICHAR_LEN);
return ids.contains(unichar_repr, length) ?
ids.unichar_to_id(unichar_repr, length) : INVALID_UNICHAR_ID;
string cleaned(unichar_repr, length);
if (!old_style_included_) cleaned = CleanupString(unichar_repr, length);
return ids.contains(cleaned.data(), cleaned.size())
? ids.unichar_to_id(cleaned.data(), cleaned.size())
: INVALID_UNICHAR_ID;
}
// Return the minimum number of bytes that matches a legal UNICHAR_ID,
......@@ -235,6 +250,9 @@ bool UNICHARSET::encodable_string(const char *str,
// the rest of the string is still encoded.
// If lengths is not NULL, then it is filled with the corresponding
// byte length of each encoded UNICHAR_ID.
// WARNING: Caller must guarantee that str has already been cleaned of codes
// that do not belong in the unicharset, or encoding may fail.
// Use CleanupString to perform the cleaning.
bool UNICHARSET::encode_string(const char* str, bool give_up_on_failure,
GenericVector<UNICHAR_ID>* encoding,
GenericVector<char>* lengths,
......@@ -429,7 +447,7 @@ void UNICHARSET::CopyFrom(const UNICHARSET& src) {
for (int ch = 0; ch < src.size_used; ++ch) {
const UNICHAR_PROPERTIES& src_props = src.unichars[ch].properties;
const char* utf8 = src.id_to_unichar(ch);
unichar_insert(utf8);
unichar_insert_backwards_compatible(utf8);
unichars[ch].properties.ExpandRangesFrom(src_props);
}
// Set properties, including mirror and other_case, WITHOUT reordering
......@@ -445,24 +463,13 @@ void UNICHARSET::AppendOtherUnicharset(const UNICHARSET& src) {
for (int ch = 0; ch < src.size_used; ++ch) {
const UNICHAR_PROPERTIES& src_props = src.unichars[ch].properties;
const char* utf8 = src.id_to_unichar(ch);
if (ch >= SPECIAL_UNICHAR_CODES_COUNT && src_props.AnyRangeEmpty()) {
// Only use fully valid entries.
tprintf("Bad properties for index %d, char %s: "
"%d,%d %d,%d %g,%g %g,%g %g,%g\n",
ch, utf8, src_props.min_bottom, src_props.max_bottom,
src_props.min_top, src_props.max_top,
src_props.width, src_props.width_sd,
src_props.bearing, src_props.bearing_sd,
src_props.advance, src_props.advance_sd);
continue;
}
int id = size_used;
if (contains_unichar(utf8)) {
id = unichar_to_id(utf8);
// Just expand current ranges.
unichars[id].properties.ExpandRangesFrom(src_props);
} else {
unichar_insert(utf8);
unichar_insert_backwards_compatible(utf8);
unichars[id].properties.SetRangesEmpty();
}
}
......@@ -613,40 +620,55 @@ char UNICHARSET::get_chartype(UNICHAR_ID id) const {
return 0;
}
void UNICHARSET::unichar_insert(const char* const unichar_repr) {
if (!ids.contains(unichar_repr)) {
if (strlen(unichar_repr) > UNICHAR_LEN) {
fprintf(stderr, "Utf8 buffer too big, size=%d for %s\n",
int(strlen(unichar_repr)), unichar_repr);
void UNICHARSET::unichar_insert(const char* const unichar_repr,
OldUncleanUnichars old_style) {
if (old_style == OldUncleanUnichars::kTrue) old_style_included_ = true;
string cleaned =
old_style_included_ ? unichar_repr : CleanupString(unichar_repr);
if (!cleaned.empty() && !ids.contains(cleaned.data(), cleaned.size())) {
const char* str = cleaned.c_str();
GenericVector<int> encoding;
if (!old_style_included_ &&
encode_string(str, true, &encoding, nullptr, nullptr))
return;
}
if (size_used == size_reserved) {
if (size_used == 0)
reserve(8);
else
reserve(2 * size_used);
}
strcpy(unichars[size_used].representation, unichar_repr);
int index = 0;
do {
if (index > UNICHAR_LEN) {
fprintf(stderr, "Utf8 buffer too big, size>%d for %s\n", UNICHAR_LEN,
unichar_repr);
return;
}
unichars[size_used].representation[index++] = *str++;
} while (*str != '\0');
unichars[size_used].representation[index] = '\0';
this->set_script(size_used, null_script);
// If the given unichar_repr represents a fragmented character, set
// fragment property to a pointer to CHAR_FRAGMENT class instance with
// information parsed from the unichar representation. Use the script
// of the base unichar for the fragmented character if possible.
CHAR_FRAGMENT *frag = CHAR_FRAGMENT::parse_from_string(unichar_repr);
CHAR_FRAGMENT* frag =
CHAR_FRAGMENT::parse_from_string(unichars[size_used].representation);
this->unichars[size_used].properties.fragment = frag;
if (frag != NULL && this->contains_unichar(frag->get_unichar())) {
this->unichars[size_used].properties.script_id =
this->get_script(frag->get_unichar());
}
this->unichars[size_used].properties.enabled = true;
ids.insert(unichar_repr, size_used);
ids.insert(unichars[size_used].representation, size_used);
++size_used;
}
}
bool UNICHARSET::contains_unichar(const char* const unichar_repr) const {
return ids.contains(unichar_repr);
string cleaned =
old_style_included_ ? unichar_repr : CleanupString(unichar_repr);
return ids.contains(cleaned.data(), cleaned.size());
}
bool UNICHARSET::contains_unichar(const char* const unichar_repr,
......@@ -654,7 +676,9 @@ bool UNICHARSET::contains_unichar(const char* const unichar_repr,
if (length == 0) {
return false;
}
return ids.contains(unichar_repr, length);
string cleaned(unichar_repr, length);
if (!old_style_included_) cleaned = CleanupString(unichar_repr, length);
return ids.contains(cleaned.data(), cleaned.size());
}
bool UNICHARSET::eq(UNICHAR_ID unichar_id,
......@@ -840,7 +864,7 @@ bool UNICHARSET::load_via_fgets(
if (strcmp(unichar, "NULL") == 0)
this->unichar_insert(" ");
else
this->unichar_insert(unichar);
this->unichar_insert_backwards_compatible(unichar);
this->set_isalpha(id, properties & ISALPHA_MASK);
this->set_islower(id, properties & ISLOWER_MASK);
......@@ -1088,3 +1112,32 @@ int UNICHARSET::get_script_id_from_name(const char* script_name) const {
}
return 0; // 0 is always the null_script
}
// Removes/replaces content that belongs in rendered text, but not in the
// unicharset.
// Scans at most length bytes of utf8_str, stopping early at a NUL byte, and
// returns a copy in which every occurrence of a kCleanupMaps key is replaced
// by the matching replacement string. Bytes that do not start a key are copied
// through unchanged. A null pointer or non-positive length yields an empty
// string.
/* static */
string UNICHARSET::CleanupString(const char* utf8_str, int length) {
  string result;
  if (utf8_str == nullptr || length <= 0) return result;
  result.reserve(length);
  int pos = 0;
  while (pos < length && utf8_str[pos] != '\0') {
    const char* replacement = nullptr;
    int consumed = 0;
    for (int key_index = 0; kCleanupMaps[key_index][0] != nullptr;
         ++key_index) {
      const char* key = kCleanupMaps[key_index][0];
      int match = 0;
      // Never compare beyond the caller's bound: a key that straddles the end
      // of the range is not a match.
      while (key[match] != '\0' && pos + match < length &&
             key[match] == utf8_str[pos + match]) {
        ++match;
      }
      // match > 0 guards against an empty key, which would never advance.
      if (match > 0 && key[match] == '\0') {
        replacement = kCleanupMaps[key_index][1];
        consumed = match;
        break;
      }
    }
    if (replacement == nullptr) {
      result.push_back(utf8_str[pos]);
      ++pos;
    } else {
      result.append(replacement);
      // Account for every byte of the key, not just one, so that the scan
      // stops exactly at length.
      pos += consumed;
    }
  }
  return result;
}
......@@ -39,6 +39,13 @@ enum SpecialUnicharCodes {
SPECIAL_UNICHAR_CODES_COUNT
};
// Boolean flag for unichar_insert. It's a bit of a double negative to allow
// the default value to be false.
// It is a scoped enum, so call sites have to spell out
// OldUncleanUnichars::kTrue / OldUncleanUnichars::kFalse.
enum class OldUncleanUnichars {
// Insert using the cleaned form of the string.
kFalse,
// Insert the string as given; unichar_insert also marks the whole set as
// containing old-style entries when it receives this value.
kTrue,
};
class CHAR_FRAGMENT {
public:
// Minimum number of characters used for fragment representation.
......@@ -190,7 +197,7 @@ class UNICHARSET {
// Use encode_string in preference to repeatedly calling step.
int step(const char* str) const;
// Return whether the given UTF-8 string is encodable with this UNICHARSET.
// Returns true if the given UTF-8 string is encodable with this UNICHARSET.
// If not encodable, write the first byte offset which cannot be converted
// into the second (return) argument.
bool encodable_string(const char *str, int *first_bad_position) const;
......@@ -207,6 +214,9 @@ class UNICHARSET {
// If encoded_length is not NULL then on return it contains the length of
// str that was encoded. (if give_up_on_failure the location of the first
// failure, otherwise strlen(str).)
// WARNING: Caller must guarantee that str has already been cleaned of codes
// that do not belong in the unicharset, or encoding may fail.
// Use CleanupString to perform the cleaning.
bool encode_string(const char* str, bool give_up_on_failure,
GenericVector<UNICHAR_ID>* encoding,
GenericVector<char>* lengths,
......@@ -226,6 +236,13 @@ class UNICHARSET {
// by its hex unicodes.
static STRING debug_utf8_str(const char* str);
// Returns a copy of the NUL-terminated utf8_str with the content that belongs
// in rendered text, but not in the unicharset, removed or replaced.
static string CleanupString(const char* utf8_str) {
  const int num_bytes = static_cast<int>(strlen(utf8_str));
  return CleanupString(utf8_str, num_bytes);
}
// Same cleanup, with length giving the number of bytes of utf8_str to use.
static string CleanupString(const char* utf8_str, int length);
// Return a STRING containing debug information on the unichar, including
// the id_to_unichar, its hex unicodes and the properties.
STRING debug_str(UNICHAR_ID id) const;
......@@ -233,8 +250,29 @@ class UNICHARSET {
return debug_str(unichar_to_id(unichar_repr));
}
// Add a unichar representation to the set.
void unichar_insert(const char* const unichar_repr);
// Adds a unichar representation to the set. If old_style is true, then
// TATWEEL characters are kept and n-grams are allowed. Otherwise TATWEEL
// characters are ignored/skipped as if they don't exist and n-grams that
// can already be encoded are not added.
void unichar_insert(const char* const unichar_repr,
OldUncleanUnichars old_style);
// Convenience overload: inserts unichar_repr with old_style set to
// OldUncleanUnichars::kFalse.
void unichar_insert(const char* const unichar_repr) {
unichar_insert(unichar_repr, OldUncleanUnichars::kFalse);
}
// Adds a unichar representation to the set. Avoids setting old_style to true,
// unless it is necessary to make the new unichar get added.
void unichar_insert_backwards_compatible(const char* const unichar_repr) {
string cleaned = CleanupString(unichar_repr);
if (cleaned != unichar_repr) {
unichar_insert(unichar_repr, OldUncleanUnichars::kTrue);
} else {
int old_size = size();
unichar_insert(unichar_repr, OldUncleanUnichars::kFalse);
if (size() == old_size) {
unichar_insert(unichar_repr, OldUncleanUnichars::kTrue);
}
}
}
// Return true if the given unichar id exists within the set.
// Relies on the fact that unichar ids are contiguous in the unicharset.
......@@ -282,6 +320,7 @@ class UNICHARSET {
top_bottom_set_ = false;
script_has_upper_lower_ = false;
script_has_xheight_ = false;
old_style_included_ = false;
null_sid_ = 0;
common_sid_ = 0;
latin_sid_ = 0;
......@@ -743,7 +782,7 @@ class UNICHARSET {
// unichar representation represents a character fragment.
const CHAR_FRAGMENT *get_fragment(const char* const unichar_repr) const {
if (unichar_repr == NULL || unichar_repr[0] == '\0' ||
!ids.contains(unichar_repr)) {
!ids.contains(unichar_repr, false)) {
return NULL;
}
return get_fragment(unichar_to_id(unichar_repr));
......@@ -965,6 +1004,11 @@ class UNICHARSET {
bool load_via_fgets(TessResultCallback2<char *, char *, int> *fgets_cb,
bool skip_fragments);
// List of mappings to make when ingesting strings from the outside.
// The substitutions clean up text that should exist for rendering of
// synthetic data, but not in the recognition set.
static const char* kCleanupMaps[][2];
UNICHAR_SLOT* unichars;
UNICHARMAP ids;
int size_used;
......@@ -980,6 +1024,8 @@ class UNICHARSET {
// True if the unicharset has a significant mean-line with significant
// ascenders above that.
bool script_has_xheight_;
// True if the set contains chars that would be changed by the cleanup.
bool old_style_included_;
// A few convenient script name-to-id mapping without using hash.
// These are initialized when unicharset file is loaded. Anything
......
......@@ -170,6 +170,7 @@ bool LSTMTrainer::InitNetwork(const STRING& network_spec, int append_index,
tprintf("Training parameters:\n Debug interval = %d,"
" weights = %g, learning rate = %g, momentum=%g\n",
debug_interval_, weight_range_, learning_rate_, momentum_);
tprintf("null char=%d\n", null_char_);
return true;
}
......@@ -733,7 +734,8 @@ bool LSTMTrainer::EncodeString(const STRING& str, const UNICHARSET& unicharset,
GenericVector<int> internal_labels;
labels->truncate(0);
if (!simple_text) labels->push_back(null_char);
if (unicharset.encode_string(str.string(), true, &internal_labels, NULL,
string cleaned = unicharset.CleanupString(str.string());
if (unicharset.encode_string(cleaned.c_str(), true, &internal_labels, NULL,
&err_index)) {
bool success = true;
for (int i = 0; i < internal_labels.size(); ++i) {
......@@ -759,8 +761,8 @@ bool LSTMTrainer::EncodeString(const STRING& str, const UNICHARSET& unicharset,
if (success) return true;
}
tprintf("Encoding of string failed! Failure bytes:");
while (err_index < str.length()) {
tprintf(" %x", str[err_index++]);
while (err_index < cleaned.size()) {
tprintf(" %x", cleaned[err_index++]);
}
tprintf("\n");
return false;
......@@ -813,8 +815,9 @@ Trainability LSTMTrainer::PrepareForBackward(const ImageData* trainingdata,
training_iteration() % debug_interval_ == 0;
GenericVector<int> truth_labels;
if (!EncodeString(trainingdata->transcription(), &truth_labels)) {
tprintf("Can't encode transcription: %s\n",
trainingdata->transcription().string());
tprintf("Can't encode transcription: '%s' in language '%s'\n",
trainingdata->transcription().string(),
trainingdata->language().string());
return UNENCODABLE;
}
int w = 0;
......
......@@ -409,9 +409,7 @@ using tesseract::SpanUTF8NotWhitespace;
using tesseract::SpanUTF8Whitespace;
using tesseract::StringRenderer;
int main(int argc, char** argv) {
tesseract::ParseCommandLineFlags(argv[0], &argc, &argv, true);
int Main() {
if (FLAGS_list_available_fonts) {
const std::vector<string>& all_fonts = FontUtils::ListAvailableFonts();
for (unsigned int i = 0; i < all_fonts.size(); ++i) {
......@@ -543,8 +541,9 @@ int main(int argc, char** argv) {
const char *curr_pos = str8 + offsets[i].first;
int ngram_len = offsets[i].second;
// Skip words that contain characters not found in the unicharset.
string cleaned = UNICHARSET::CleanupString(curr_pos, ngram_len);
if (!FLAGS_unicharset_file.empty() &&
!unicharset.encodable_string(curr_pos, nullptr)) {
!unicharset.encodable_string(cleaned.c_str(), nullptr)) {
continue;
}
rand_utf8.append(curr_pos, ngram_len);
......@@ -665,3 +664,8 @@ int main(int argc, char** argv) {
return 0;
}
// Program entry point: parses the command-line flags, then runs Main().
// The value returned by Main() becomes the process exit status, so a non-zero
// result from Main() is no longer discarded in favour of an implicit 0.
int main(int argc, char** argv) {
  tesseract::ParseCommandLineFlags(argv[0], &argc, &argv, true);
  return Main();
}
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册