From b0ead95d64a3667339775b2f99ac37e97e90c2a0 Mon Sep 17 00:00:00 2001
From: Ray Smith <rays@google.com>
Date: Mon, 24 Jul 2017 11:45:57 -0700
Subject: [PATCH] =?UTF-8?q?Changed=20the=20way=20unicharsets=20are=20handl?=
 =?UTF-8?q?ed=20to=20allow=20support=20for=20the=20=E2=84=A2=20character.?=
 =?UTF-8?q?=20Can=20find=20the=20issue=20where=20it=20was=20requested.?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 ccstruct/ratngs.cpp        |   7 ++-
 ccutil/ambigs.cpp          |   4 +-
 ccutil/unicharcompress.cpp |   5 +-
 ccutil/unicharmap.cpp      |  72 ++++++-----------------
 ccutil/unicharmap.h        |   9 ---
 ccutil/unicharset.cpp      | 113 +++++++++++++++++++++++++++----------
 ccutil/unicharset.h        |  54 ++++++++++++++++--
 lstm/lstmtrainer.cpp       |  13 +++--
 training/text2image.cpp    |  12 ++--
 9 files changed, 177 insertions(+), 112 deletions(-)
diff --git a/ccstruct/ratngs.cpp b/ccstruct/ratngs.cpp
index 03ed873c..888c026c 100644
--- a/ccstruct/ratngs.cpp
+++ b/ccstruct/ratngs.cpp
@@ -24,6 +24,7 @@
 
 #include "ratngs.h"
 
+#include <string>
 #include "blobs.h"
 #include "callcpp.h"
 #include "genericvector.h"
@@ -200,10 +201,12 @@ WERD_CHOICE::WERD_CHOICE(const char *src_string,
     : unicharset_(&unicharset){
   GenericVector<UNICHAR_ID> encoding;
   GenericVector<char> lengths;
-  if (unicharset.encode_string(src_string, true, &encoding, &lengths, NULL)) {
+  string cleaned = unicharset.CleanupString(src_string);
+  if (unicharset.encode_string(cleaned.c_str(), true, &encoding, &lengths,
+                               NULL)) {
     lengths.push_back('\0');
     STRING src_lengths = &lengths[0];
-    this->init(src_string, src_lengths.string(), 0.0, 0.0, NO_PERM);
+    this->init(cleaned.c_str(), src_lengths.string(), 0.0, 0.0, NO_PERM);
   } else {  // There must have been an invalid unichar in the string.
     this->init(8);
     this->make_bad();
diff --git a/ccutil/ambigs.cpp b/ccutil/ambigs.cpp
index b940dea0..2db2d820 100644
--- a/ccutil/ambigs.cpp
+++ b/ccutil/ambigs.cpp
@@ -357,7 +357,7 @@ bool UnicharAmbigs::InsertIntoTable(
   // Insert the corresponding correct ngram into the unicharset.
   // Unicharset code assumes that the "base" ngram is inserted into
   // the unicharset before fragments of this ngram are inserted.
-  unicharset->unichar_insert(replacement_string);
+  unicharset->unichar_insert(replacement_string, OldUncleanUnichars::kTrue);
   ambig_spec->correct_ngram_id =
     unicharset->unichar_to_id(replacement_string);
   if (replacement_ambig_part_size > 1) {
@@ -372,7 +372,7 @@ bool UnicharAmbigs::InsertIntoTable(
     } else {
       STRING frag_str = CHAR_FRAGMENT::to_string(
           replacement_string, i, test_ambig_part_size, false);
-      unicharset->unichar_insert(frag_str.string());
+      unicharset->unichar_insert(frag_str.string(), OldUncleanUnichars::kTrue);
       unichar_id = unicharset->unichar_to_id(frag_str.string());
     }
     ambig_spec->correct_fragments[i] = unichar_id;
diff --git a/ccutil/unicharcompress.cpp b/ccutil/unicharcompress.cpp
index 64b8f578..c030d566 100644
--- a/ccutil/unicharcompress.cpp
+++ b/ccutil/unicharcompress.cpp
@@ -117,7 +117,7 @@ bool UnicharCompress::ComputeEncoding(const UNICHARSET& unicharset, int null_id,
   direct_set.clear();
   radicals.clear();
   // Always keep space as 0;
-  direct_set.unichar_insert(" ");
+  direct_set.unichar_insert(" ", OldUncleanUnichars::kTrue);
   // Null char is next if we have one.
   if (null_id >= 0) {
     direct_set.unichar_insert(kNullChar);
@@ -160,7 +160,8 @@ bool UnicharCompress::ComputeEncoding(const UNICHARSET& unicharset, int null_id,
       if (it != radical_map.end()) {
         // This is Han. Convert to radical, stroke, index.
         if (!radicals.contains_unichar(it->second.radical.string())) {
-          radicals.unichar_insert(it->second.radical.string());
+          radicals.unichar_insert(it->second.radical.string(),
+                                  OldUncleanUnichars::kTrue);
         }
         int radical = radicals.unichar_to_id(it->second.radical.string());
         int num_strokes = it->second.num_strokes;
diff --git a/ccutil/unicharmap.cpp b/ccutil/unicharmap.cpp
index 6b1bb1d6..b13acdc3 100644
--- a/ccutil/unicharmap.cpp
+++ b/ccutil/unicharmap.cpp
@@ -31,41 +31,24 @@ UNICHARMAP::~UNICHARMAP() {
     delete[] nodes;
 }
 
-// Search the given unichar representation in the tree. Each character in the
-// string is interpreted as an index in an array of nodes.
-UNICHAR_ID UNICHARMAP::unichar_to_id(const char* const unichar_repr) const {
-  const char* current_char = unichar_repr;
-  UNICHARMAP_NODE* current_nodes = nodes;
-
-  assert(*unichar_repr != '\0');
-
-  do {
-    if (*(current_char + 1) == '\0')
-      return current_nodes[static_cast<unsigned char>(*current_char)].id;
-    current_nodes =
-        current_nodes[static_cast<unsigned char>(*current_char)].children;
-    ++current_char;
-  } while (true);
-}
-
 // Search the given unichar representation in the tree, using length characters
 // from it maximum. Each character in the string is interpreted as an index in
 // an array of nodes.
 UNICHAR_ID UNICHARMAP::unichar_to_id(const char* const unichar_repr,
                                      int length) const {
-  const char* current_char = unichar_repr;
   UNICHARMAP_NODE* current_nodes = nodes;
 
   assert(*unichar_repr != '\0');
   assert(length > 0 && length <= UNICHAR_LEN);
 
+  int index = 0;
+  if (index >= length || unichar_repr[index] == '\0') return INVALID_UNICHAR_ID;
   do {
-    if (length == 1 || *(current_char + 1) == '\0')
-      return current_nodes[static_cast<unsigned char>(*current_char)].id;
+    if (index + 1 >= length || unichar_repr[index + 1] == '\0')
+      return current_nodes[static_cast<unsigned char>(unichar_repr[index])].id;
     current_nodes =
-        current_nodes[static_cast<unsigned char>(*current_char)].children;
-    ++current_char;
-    --length;
+        current_nodes[static_cast<unsigned char>(unichar_repr[index])].children;
+    ++index;
   } while (true);
 }
 
@@ -75,15 +58,12 @@ UNICHAR_ID UNICHARMAP::unichar_to_id(const char* const unichar_repr,
 // string is interpreted as an index in an array of nodes.
 void UNICHARMAP::insert(const char* const unichar_repr, UNICHAR_ID id) {
   const char* current_char = unichar_repr;
+  if (*current_char == '\0') return;
   UNICHARMAP_NODE** current_nodes_pointer = &nodes;
-
-  assert(*unichar_repr != '\0');
-  assert(id >= 0);
-
   do {
     if (*current_nodes_pointer == 0)
       *current_nodes_pointer = new UNICHARMAP_NODE[256];
-    if (*(current_char + 1) == '\0') {
+    if (current_char[1] == '\0') {
       (*current_nodes_pointer)
           [static_cast<unsigned char>(*current_char)].id = id;
       return;
@@ -95,24 +75,6 @@ void UNICHARMAP::insert(const char* const unichar_repr, UNICHAR_ID id) {
   } while (true);
 }
 
-// Search the given unichar representation in the tree. Each character in the
-// string is interpreted as an index in an array of nodes. Stop once the tree
-// does not have anymore nodes or once we found the right unichar_repr.
-bool UNICHARMAP::contains(const char* const unichar_repr) const {
-  if (unichar_repr == NULL || *unichar_repr == '\0') return false;
-
-  const char* current_char = unichar_repr;
-  UNICHARMAP_NODE* current_nodes = nodes;
-
-  while (current_nodes != 0 && *(current_char + 1) != '\0') {
-    current_nodes =
-        current_nodes[static_cast<unsigned char>(*current_char)].children;
-    ++current_char;
-  }
-  return current_nodes != 0 && *(current_char + 1) == '\0' &&
-      current_nodes[static_cast<unsigned char>(*current_char)].id >= 0;
-}
-
 // Search the given unichar representation in the tree, using length characters
 // from it maximum. Each character in the string is interpreted as an index in
 // an array of nodes. Stop once the tree does not have anymore nodes or once we
@@ -121,24 +83,26 @@ bool UNICHARMAP::contains(const char* const unichar_repr,
                           int length) const {
   if (unichar_repr == NULL || *unichar_repr == '\0') return false;
   if (length <= 0 || length > UNICHAR_LEN) return false;
-
-  const char* current_char = unichar_repr;
+  int index = 0;
+  if (index >= length || unichar_repr[index] == '\0') return false;
   UNICHARMAP_NODE* current_nodes = nodes;
 
-  while (current_nodes != 0 && (length > 1 && *(current_char + 1) != '\0')) {
+  while (current_nodes != 0 && index + 1 < length &&
+         unichar_repr[index + 1] != '\0') {
     current_nodes =
-        current_nodes[static_cast<unsigned char>(*current_char)].children;
-    --length;
-    ++current_char;
+        current_nodes[static_cast<unsigned char>(unichar_repr[index])].children;
+    ++index;
   }
-  return current_nodes != 0 && (length == 1 || *(current_char + 1) == '\0') &&
-      current_nodes[static_cast<unsigned char>(*current_char)].id >= 0;
+  return current_nodes != 0 &&
+         (index + 1 >= length || unichar_repr[index + 1] == '\0') &&
+         current_nodes[static_cast<unsigned char>(unichar_repr[index])].id >= 0;
 }
 
 // Return the minimum number of characters that must be used from this string
 // to obtain a match in the UNICHARMAP.
 int UNICHARMAP::minmatch(const char* const unichar_repr) const {
   const char* current_char = unichar_repr;
+  if (*current_char == '\0') return 0;
   UNICHARMAP_NODE* current_nodes = nodes;
 
   while (current_nodes != NULL && *current_char != '\0') {
diff --git a/ccutil/unicharmap.h b/ccutil/unicharmap.h
index ecc4065e..45170c4f 100644
--- a/ccutil/unicharmap.h
+++ b/ccutil/unicharmap.h
@@ -36,21 +36,12 @@ class UNICHARMAP {
   // with the given id. The length of the representation MUST be non-zero.
   void insert(const char* const unichar_repr, UNICHAR_ID id);
 
-  // Return the id associated with the given unichar representation,
-  // this representation MUST exist within the UNICHARMAP.
-  // The length of the representation MUST be non-zero.
-  UNICHAR_ID unichar_to_id(const char* const unichar_repr) const;
-
   // Return the id associated with the given unichar representation,
   // this representation MUST exist within the UNICHARMAP. The first
   // length characters (maximum) from unichar_repr are used. The length
   // MUST be non-zero.
   UNICHAR_ID unichar_to_id(const char* const unichar_repr, int length) const;
 
-  // Return true if the given unichar representation is already present in the
-  // UNICHARMAP. The length of the representation MUST be non-zero.
-  bool contains(const char* const unichar_repr) const;
-
   // Return true if the given unichar representation is already present in the
   // UNICHARMAP. The first length characters (maximum) from unichar_repr are
   // used. The length MUST be non-zero.
diff --git a/ccutil/unicharset.cpp b/ccutil/unicharset.cpp
index aa87c127..bd904b3f 100644
--- a/ccutil/unicharset.cpp
+++ b/ccutil/unicharset.cpp
@@ -67,6 +67,15 @@ const char* UNICHARSET::kCustomLigatures[][2] = {
   {NULL, NULL}
 };
 
+// List of mappings to make when ingesting strings from the outside.
+// The substitutions clean up text that should exist for rendering of
+// synthetic data, but not in the recognition set.
+const char* UNICHARSET::kCleanupMaps[][2] = {
+    {"\u0640", ""},    // TATWEEL is deleted.
+    {"\ufb01", "fi"},  // fi ligature->fi pair.
+    {"\ufb02", "fl"},  // fl ligature->fl pair.
+    {nullptr, nullptr}};
+
 // List of strings for the SpecialUnicharCodes. Keep in sync with the enum.
 const char* UNICHARSET::kSpecialUnicharCodes[SPECIAL_UNICHAR_CODES_COUNT] = {
     " ",
@@ -196,15 +205,21 @@ void UNICHARSET::reserve(int unichars_number) {
 
 UNICHAR_ID
 UNICHARSET::unichar_to_id(const char* const unichar_repr) const {
-  return ids.contains(unichar_repr) ?
-    ids.unichar_to_id(unichar_repr) : INVALID_UNICHAR_ID;
+  string cleaned =
+      old_style_included_ ? unichar_repr : CleanupString(unichar_repr);
+  return ids.contains(cleaned.data(), cleaned.size())
+             ? ids.unichar_to_id(cleaned.data(), cleaned.size())
+             : INVALID_UNICHAR_ID;
 }
 
 UNICHAR_ID UNICHARSET::unichar_to_id(const char* const unichar_repr,
                                      int length) const {
   assert(length > 0 && length <= UNICHAR_LEN);
-  return ids.contains(unichar_repr, length) ?
-    ids.unichar_to_id(unichar_repr, length) : INVALID_UNICHAR_ID;
+  string cleaned(unichar_repr, length);
+  if (!old_style_included_) cleaned = CleanupString(unichar_repr, length);
+  return ids.contains(cleaned.data(), cleaned.size())
+             ? ids.unichar_to_id(cleaned.data(), cleaned.size())
+             : INVALID_UNICHAR_ID;
 }
 
 // Return the minimum number of bytes that matches a legal UNICHAR_ID,
@@ -235,6 +250,9 @@ bool UNICHARSET::encodable_string(const char *str,
 // the rest of the string is still encoded.
 // If lengths is not NULL, then it is filled with the corresponding
 // byte length of each encoded UNICHAR_ID.
+// WARNING: Caller must guarantee that str has already been cleaned of codes
+// that do not belong in the unicharset, or encoding may fail.
+// Use CleanupString to perform the cleaning.
 bool UNICHARSET::encode_string(const char* str, bool give_up_on_failure,
                                GenericVector<UNICHAR_ID>* encoding,
                                GenericVector<char>* lengths,
@@ -429,7 +447,7 @@ void UNICHARSET::CopyFrom(const UNICHARSET& src) {
   for (int ch = 0; ch < src.size_used; ++ch) {
     const UNICHAR_PROPERTIES& src_props = src.unichars[ch].properties;
     const char* utf8 = src.id_to_unichar(ch);
-    unichar_insert(utf8);
+    unichar_insert_backwards_compatible(utf8);
     unichars[ch].properties.ExpandRangesFrom(src_props);
   }
   // Set properties, including mirror and other_case, WITHOUT reordering
@@ -445,24 +463,13 @@ void UNICHARSET::AppendOtherUnicharset(const UNICHARSET& src) {
   for (int ch = 0; ch < src.size_used; ++ch) {
     const UNICHAR_PROPERTIES& src_props = src.unichars[ch].properties;
     const char* utf8 = src.id_to_unichar(ch);
-    if (ch >= SPECIAL_UNICHAR_CODES_COUNT && src_props.AnyRangeEmpty()) {
-      // Only use fully valid entries.
-      tprintf("Bad properties for index %d, char %s: "
-              "%d,%d %d,%d %g,%g %g,%g %g,%g\n",
-              ch, utf8, src_props.min_bottom, src_props.max_bottom,
-              src_props.min_top, src_props.max_top,
-              src_props.width, src_props.width_sd,
-              src_props.bearing, src_props.bearing_sd,
-              src_props.advance, src_props.advance_sd);
-      continue;
-    }
     int id = size_used;
     if (contains_unichar(utf8)) {
       id = unichar_to_id(utf8);
       // Just expand current ranges.
       unichars[id].properties.ExpandRangesFrom(src_props);
     } else {
-      unichar_insert(utf8);
+      unichar_insert_backwards_compatible(utf8);
       unichars[id].properties.SetRangesEmpty();
     }
   }
@@ -613,40 +620,55 @@ char UNICHARSET::get_chartype(UNICHAR_ID id) const {
   return 0;
 }
 
-void UNICHARSET::unichar_insert(const char* const unichar_repr) {
-  if (!ids.contains(unichar_repr)) {
-    if (strlen(unichar_repr) > UNICHAR_LEN) {
-      fprintf(stderr, "Utf8 buffer too big, size=%d for %s\n",
-              int(strlen(unichar_repr)), unichar_repr);
+void UNICHARSET::unichar_insert(const char* const unichar_repr,
+                                OldUncleanUnichars old_style) {
+  if (old_style == OldUncleanUnichars::kTrue) old_style_included_ = true;
+  string cleaned =
+      old_style_included_ ? unichar_repr : CleanupString(unichar_repr);
+  if (!cleaned.empty() && !ids.contains(cleaned.data(), cleaned.size())) {
+    const char* str = cleaned.c_str();
+    GenericVector<int> encoding;
+    if (!old_style_included_ &&
+        encode_string(str, true, &encoding, nullptr, nullptr))
       return;
-    }
     if (size_used == size_reserved) {
       if (size_used == 0)
         reserve(8);
       else
         reserve(2 * size_used);
     }
-
-    strcpy(unichars[size_used].representation, unichar_repr);
+    int index = 0;
+    do {
+      if (index >= UNICHAR_LEN) {  // Reject anything longer than UNICHAR_LEN.
+        fprintf(stderr, "Utf8 buffer too big, size>%d for %s\n", UNICHAR_LEN,
+                unichar_repr);
+        return;
+      }
+      unichars[size_used].representation[index++] = *str++;
+    } while (*str != '\0');
+    unichars[size_used].representation[index] = '\0';
     this->set_script(size_used, null_script);
     // If the given unichar_repr represents a fragmented character, set
     // fragment property to a pointer to CHAR_FRAGMENT class instance with
     // information parsed from the unichar representation. Use the script
     // of the base unichar for the fragmented character if possible.
-    CHAR_FRAGMENT *frag = CHAR_FRAGMENT::parse_from_string(unichar_repr);
+    CHAR_FRAGMENT* frag =
+        CHAR_FRAGMENT::parse_from_string(unichars[size_used].representation);
     this->unichars[size_used].properties.fragment = frag;
     if (frag != NULL && this->contains_unichar(frag->get_unichar())) {
       this->unichars[size_used].properties.script_id =
         this->get_script(frag->get_unichar());
     }
     this->unichars[size_used].properties.enabled = true;
-    ids.insert(unichar_repr, size_used);
+    ids.insert(unichars[size_used].representation, size_used);
     ++size_used;
   }
 }
 
 bool UNICHARSET::contains_unichar(const char* const unichar_repr) const {
-  return ids.contains(unichar_repr);
+  string cleaned =
+      old_style_included_ ? unichar_repr : CleanupString(unichar_repr);
+  return ids.contains(cleaned.data(), cleaned.size());
 }
 
 bool UNICHARSET::contains_unichar(const char* const unichar_repr,
@@ -654,7 +676,9 @@ bool UNICHARSET::contains_unichar(const char* const unichar_repr,
   if (length == 0) {
     return false;
   }
-  return ids.contains(unichar_repr, length);
+  string cleaned(unichar_repr, length);
+  if (!old_style_included_) cleaned = CleanupString(unichar_repr, length);
+  return ids.contains(cleaned.data(), cleaned.size());
 }
 
 bool UNICHARSET::eq(UNICHAR_ID unichar_id,
@@ -840,7 +864,7 @@ bool UNICHARSET::load_via_fgets(
     if (strcmp(unichar, "NULL") == 0)
       this->unichar_insert(" ");
     else
-      this->unichar_insert(unichar);
+      this->unichar_insert_backwards_compatible(unichar);
 
     this->set_isalpha(id, properties & ISALPHA_MASK);
     this->set_islower(id, properties & ISLOWER_MASK);
@@ -1088,3 +1112,32 @@ int UNICHARSET::get_script_id_from_name(const char* script_name) const {
   }
   return 0;  // 0 is always the null_script
 }
+
+// Removes/replaces content that belongs in rendered text, but not in the
+// unicharset. Reads at most length bytes of utf8_str, stopping early at NUL.
+/* static */
+string UNICHARSET::CleanupString(const char* utf8_str, int length) {
+  string result;
+  result.reserve(length);
+  while (length > 0 && *utf8_str != '\0') {
+    int key_index = 0;
+    const char* key;
+    while ((key = kCleanupMaps[key_index][0]) != nullptr) {
+      int match = 0;
+      while (key[match] != '\0' && key[match] == utf8_str[match]) ++match;
+      if (key[match] == '\0' && match <= length) {  // Key fits in the range.
+        utf8_str += match;
+        length -= match;  // Charge every byte of the key, not just one.
+        break;
+      }
+      ++key_index;
+    }
+    if (key == nullptr) {
+      result.push_back(*utf8_str++);
+      --length;
+    } else {
+      result.append(kCleanupMaps[key_index][1]);
+    }
+  }
+  return result;
+}
diff --git a/ccutil/unicharset.h b/ccutil/unicharset.h
index a2e4e3b7..767c0de8 100644
--- a/ccutil/unicharset.h
+++ b/ccutil/unicharset.h
@@ -39,6 +39,13 @@ enum SpecialUnicharCodes {
   SPECIAL_UNICHAR_CODES_COUNT
 };
 
+// Boolean flag for unichar_insert. It's a bit of a double negative to allow
+// the default value to be false.
+enum class OldUncleanUnichars {
+  kFalse,
+  kTrue,
+};
+
 class CHAR_FRAGMENT {
  public:
   // Minimum number of characters used for fragment representation.
@@ -190,7 +197,7 @@ class UNICHARSET {
   // Use encode_string in preference to repeatedly calling step.
   int step(const char* str) const;
 
-  // Return whether the given UTF-8 string is encodable with this UNICHARSET.
+  // Returns true if the given UTF-8 string is encodable with this UNICHARSET.
   // If not encodable, write the first byte offset which cannot be converted
   // into the second (return) argument.
   bool encodable_string(const char *str, int *first_bad_position) const;
@@ -207,6 +214,9 @@ class UNICHARSET {
   // If encoded_length is not NULL then on return it contains the length of
   // str that was encoded. (if give_up_on_failure the location of the first
   // failure, otherwise strlen(str).)
+  // WARNING: Caller must guarantee that str has already been cleaned of codes
+  // that do not belong in the unicharset, or encoding may fail.
+  // Use CleanupString to perform the cleaning.
   bool encode_string(const char* str, bool give_up_on_failure,
                      GenericVector<UNICHAR_ID>* encoding,
                      GenericVector<char>* lengths,
@@ -226,6 +236,13 @@ class UNICHARSET {
   // by its hex unicodes.
   static STRING debug_utf8_str(const char* str);
 
+  // Removes/replaces content that belongs in rendered text, but not in the
+  // unicharset.
+  static string CleanupString(const char* utf8_str) {
+    return CleanupString(utf8_str, strlen(utf8_str));
+  }
+  static string CleanupString(const char* utf8_str, int length);
+
   // Return a STRING containing debug information on the unichar, including
   // the id_to_unichar, its hex unicodes and the properties.
   STRING debug_str(UNICHAR_ID id) const;
@@ -233,8 +250,29 @@ class UNICHARSET {
     return debug_str(unichar_to_id(unichar_repr));
   }
 
-  // Add a unichar representation to the set.
-  void unichar_insert(const char* const unichar_repr);
+  // Adds a unichar representation to the set. If old_style is true, then
+  // TATWEEL characters are kept and n-grams are allowed. Otherwise TATWEEL
+  // characters are ignored/skipped as if they don't exist and n-grams that
+  // can already be encoded are not added.
+  void unichar_insert(const char* const unichar_repr,
+                      OldUncleanUnichars old_style);
+  void unichar_insert(const char* const unichar_repr) {
+    unichar_insert(unichar_repr, OldUncleanUnichars::kFalse);
+  }
+  // Adds a unichar representation to the set. Avoids setting old_style to true,
+  // unless it is necessary to make the new unichar get added.
+  void unichar_insert_backwards_compatible(const char* const unichar_repr) {
+    string cleaned = CleanupString(unichar_repr);
+    if (cleaned != unichar_repr) {
+      unichar_insert(unichar_repr, OldUncleanUnichars::kTrue);
+    } else {
+      int old_size = size();
+      unichar_insert(unichar_repr, OldUncleanUnichars::kFalse);
+      if (size() == old_size) {
+        unichar_insert(unichar_repr, OldUncleanUnichars::kTrue);
+      }
+    }
+  }
 
   // Return true if the given unichar id exists within the set.
   // Relies on the fact that unichar ids are contiguous in the unicharset.
@@ -282,6 +320,7 @@ class UNICHARSET {
     top_bottom_set_ = false;
     script_has_upper_lower_ = false;
     script_has_xheight_ = false;
+    old_style_included_ = false;
     null_sid_ = 0;
     common_sid_ = 0;
     latin_sid_ = 0;
@@ -743,7 +782,7 @@ class UNICHARSET {
   // unichar representation represents a character fragment.
   const CHAR_FRAGMENT *get_fragment(const char* const unichar_repr) const {
     if (unichar_repr == NULL || unichar_repr[0] == '\0' ||
-        !ids.contains(unichar_repr)) {
+        !ids.contains(unichar_repr, strlen(unichar_repr))) {
       return NULL;
     }
     return get_fragment(unichar_to_id(unichar_repr));
@@ -965,6 +1004,11 @@ class UNICHARSET {
   bool load_via_fgets(TessResultCallback2<char *, char *, int> *fgets_cb,
                       bool skip_fragments);
 
+  // List of mappings to make when ingesting strings from the outside.
+  // The substitutions clean up text that should exist for rendering of
+  // synthetic data, but not in the recognition set.
+  static const char* kCleanupMaps[][2];
+
   UNICHAR_SLOT* unichars;
   UNICHARMAP ids;
   int size_used;
@@ -980,6 +1024,8 @@ class UNICHARSET {
   // True if the unicharset has a significant mean-line with significant
   // ascenders above that.
   bool script_has_xheight_;
+  // True if the set contains chars that would be changed by the cleanup.
+  bool old_style_included_;
 
   // A few convenient script name-to-id mapping without using hash.
   // These are initialized when unicharset file is loaded.  Anything
diff --git a/lstm/lstmtrainer.cpp b/lstm/lstmtrainer.cpp
index e9722d64..f13b278a 100644
--- a/lstm/lstmtrainer.cpp
+++ b/lstm/lstmtrainer.cpp
@@ -170,6 +170,7 @@ bool LSTMTrainer::InitNetwork(const STRING& network_spec, int append_index,
   tprintf("Training parameters:\n  Debug interval = %d,"
           " weights = %g, learning rate = %g, momentum=%g\n",
           debug_interval_, weight_range_, learning_rate_, momentum_);
+  tprintf("null char=%d\n", null_char_);
   return true;
 }
 
@@ -733,7 +734,8 @@ bool LSTMTrainer::EncodeString(const STRING& str, const UNICHARSET& unicharset,
   GenericVector<int> internal_labels;
   labels->truncate(0);
   if (!simple_text) labels->push_back(null_char);
-  if (unicharset.encode_string(str.string(), true, &internal_labels, NULL,
+  string cleaned = unicharset.CleanupString(str.string());
+  if (unicharset.encode_string(cleaned.c_str(), true, &internal_labels, NULL,
                                &err_index)) {
     bool success = true;
     for (int i = 0; i < internal_labels.size(); ++i) {
@@ -759,8 +761,8 @@ bool LSTMTrainer::EncodeString(const STRING& str, const UNICHARSET& unicharset,
     if (success) return true;
   }
   tprintf("Encoding of string failed! Failure bytes:");
-  while (err_index < str.length()) {
-    tprintf(" %x", str[err_index++]);
+  while (err_index < cleaned.size()) {
+    tprintf(" %x", cleaned[err_index++]);
   }
   tprintf("\n");
   return false;
@@ -813,8 +815,9 @@ Trainability LSTMTrainer::PrepareForBackward(const ImageData* trainingdata,
       training_iteration() % debug_interval_ == 0;
   GenericVector<int> truth_labels;
   if (!EncodeString(trainingdata->transcription(), &truth_labels)) {
-    tprintf("Can't encode transcription: %s\n",
-            trainingdata->transcription().string());
+    tprintf("Can't encode transcription: '%s' in language '%s'\n",
+            trainingdata->transcription().string(),
+            trainingdata->language().string());
     return UNENCODABLE;
   }
   int w = 0;
diff --git a/training/text2image.cpp b/training/text2image.cpp
index c3438060..0858d480 100644
--- a/training/text2image.cpp
+++ b/training/text2image.cpp
@@ -409,9 +409,7 @@ using tesseract::SpanUTF8NotWhitespace;
 using tesseract::SpanUTF8Whitespace;
 using tesseract::StringRenderer;
 
-int main(int argc, char** argv) {
-  tesseract::ParseCommandLineFlags(argv[0], &argc, &argv, true);
-
+int Main() {
   if (FLAGS_list_available_fonts) {
     const std::vector<string>& all_fonts = FontUtils::ListAvailableFonts();
     for (unsigned int i = 0; i < all_fonts.size(); ++i) {
@@ -543,8 +541,9 @@ int main(int argc, char** argv) {
       const char *curr_pos = str8 + offsets[i].first;
       int ngram_len = offsets[i].second;
       // Skip words that contain characters not in found in unicharset.
+      string cleaned = UNICHARSET::CleanupString(curr_pos, ngram_len);
       if (!FLAGS_unicharset_file.empty() &&
-          !unicharset.encodable_string(curr_pos, nullptr)) {
+          !unicharset.encodable_string(cleaned.c_str(), nullptr)) {
         continue;
       }
       rand_utf8.append(curr_pos, ngram_len);
@@ -665,3 +664,8 @@ int main(int argc, char** argv) {
 
   return 0;
 }
+
+int main(int argc, char** argv) {
+  tesseract::ParseCommandLineFlags(argv[0], &argc, &argv, true);
+  return Main();  // Propagate Main()'s status as the process exit code.
+}
-- 
GitLab