Fixed endian bug in dawg reader, Added word bigram correction,

git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@649 d0cd1f9f-072b-0410-8dd7-cf729c803f20

Fixed endian bug in dawg reader, Added word bigram correction,
git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@649 d0cd1f9f-072b-0410-8dd7-cf729c803f20
fdd4ffe8 · theraysmith@gmail.com · 6e3d810c · fdd4ffe8 · fdd4ffe8 · fdd4ffe8
15 changed files
--- a/dict/dawg.cpp
+++ b/dict/dawg.cpp
@@ -98,6 +98,32 @@ int Dawg::check_for_words(const char *filename,
  return misses;
 }

+void Dawg::iterate_words(const UNICHARSET &unicharset,
+                         TessCallback1<const char *> *cb) const {
+  WERD_CHOICE empty_prefix(&unicharset);  // no unichars appended yet
+  iterate_words_rec(empty_prefix, 0, cb);  // begin at node reference 0
+}
+
+void Dawg::iterate_words_rec(const WERD_CHOICE &word_so_far,
+                             NODE_REF to_explore,
+                             TessCallback1<const char *> *cb) const {
+  NodeChildVector kids;
+  this->unichar_ids_of(to_explore, &kids);
+  for (int k = 0; k < kids.size(); ++k) {
+    WERD_CHOICE extended(word_so_far);  // private copy of the prefix
+    extended.append_unichar_id(kids[k].unichar_id, 1, 0.0, 0.0);
+    if (this->end_of_word(kids[k].edge_ref)) {
+      STRING text;  // report the completed word through the callback
+      extended.string_and_lengths(&text, NULL);
+      cb->Run(text.string());
+    }
+    // Descend only when the edge leads on to a non-zero node reference.
+    NODE_REF successor = next_node(kids[k].edge_ref);
+    if (successor == 0) continue;
+    iterate_words_rec(extended, successor, cb);
+  }
+}
+
 bool Dawg::match_words(WERD_CHOICE *word, inT32 index,
                       NODE_REF node, UNICHAR_ID wildcard) const {
  EDGE_REF edge;
@@ -286,12 +312,12 @@ void SquishedDawg::read_squished_dawg(FILE *file,
  int unicharset_size;
  fread(&unicharset_size, sizeof(inT32), 1, file);
  fread(&num_edges_, sizeof(inT32), 1, file);
-  ASSERT_HOST(num_edges_ > 0);  // DAWG should not be empty

  if (swap) {
    unicharset_size = reverse32(unicharset_size);
    num_edges_ = reverse32(num_edges_);
  }
+  ASSERT_HOST(num_edges_ > 0);  // DAWG should not be empty
  Dawg::init(type, lang, perm, unicharset_size, debug_level);

  edges_ = (EDGE_ARRAY) memalloc(sizeof(EDGE_RECORD) * num_edges_);
@@ -318,13 +344,13 @@ NODE_MAP SquishedDawg::build_node_map(inT32 *num_nodes) const {

  node_map = (NODE_MAP) malloc(sizeof(EDGE_REF) * num_edges_);

-  for (edge=0; edge < num_edges_; edge++)       // init all slots
+  for (edge = 0; edge < num_edges_; edge++)       // init all slots
    node_map [edge] = -1;

  node_counter = num_forward_edges(0);

  *num_nodes   = 0;
-  for (edge=0; edge < num_edges_; edge++) {     // search all slots
+  for (edge = 0; edge < num_edges_; edge++) {     // search all slots

    if (forward_edge(edge)) {
      (*num_nodes)++;                          // count nodes links
@@ -332,6 +358,7 @@ NODE_MAP SquishedDawg::build_node_map(inT32 *num_nodes) const {
      num_edges = num_forward_edges(edge);
      if (edge != 0) node_counter += num_edges;
      edge += num_edges;
+      if (edge >= num_edges_) break;
      if (backward_edge(edge)) while (!last_edge(edge++));
      edge--;
    }
@@ -369,7 +396,7 @@ void SquishedDawg::write_squished_dawg(FILE *file) {
    tprintf("%d edges in DAWG\n", num_edges);
  }

-  for (edge=0; edge<num_edges_; edge++) {
+  for (edge = 0; edge < num_edges_; edge++) {
    if (forward_edge(edge)) {  // write forward edges
      do {
        old_index = next_node_from_edge_rec(edges_[edge]);
@@ -379,6 +406,7 @@ void SquishedDawg::write_squished_dawg(FILE *file) {
        set_next_node(edge, old_index);
      } while (!last_edge(edge++));

+      if (edge >= num_edges_) break;
      if (backward_edge(edge))  // skip back links
        while (!last_edge(edge++));


--- a/dict/dawg.h
+++ b/dict/dawg.h
@@ -34,6 +34,7 @@
 #include "elst.h"
 #include "ratngs.h"
 #include "params.h"
+#include "tesscallback.h"

 #ifndef __GNUC__
 #ifdef __MSW32__
@@ -142,6 +143,11 @@ class Dawg {
                      const UNICHARSET &unicharset,
                      bool enable_wildcard) const;

+  // For each word in the Dawg, call the given (permanent) callback with the
+  // text (UTF-8) version of the word.
+  void iterate_words(const UNICHARSET &unicharset,
+                     TessCallback1<const char *> *cb) const;
+
  // Pure virtual function that should be implemented by the derived classes.

  /// Returns the edge that corresponds to the letter out of this node.
@@ -268,6 +274,11 @@ class Dawg {
  bool match_words(WERD_CHOICE *word, inT32 index,
                   NODE_REF node, UNICHAR_ID wildcard) const;

+  // Recursively iterate over all words in a dawg (see public iterate_words).
+  void iterate_words_rec(const WERD_CHOICE &word_so_far,
+                         NODE_REF to_explore,
+                         TessCallback1<const char *> *cb) const;
+
  // Member Variables.
  DawgType type_;
  STRING lang_;

--- a/dict/dict.cpp
+++ b/dict/dict.cpp
@@ -16,7 +16,10 @@
 //
 ///////////////////////////////////////////////////////////////////////

+#include <stdio.h>
+
 #include "dict.h"
+#include "unicodes.h"

 #ifdef _MSC_VER
 #pragma warning(disable:4244)  // Conversion warnings
@@ -41,6 +44,8 @@ Dict::Dict(Image* image_ptr)
                       getImage()->getCCUtil()->params()),
      BOOL_INIT_MEMBER(load_freq_dawg, true, "Load frequent word dawg.",
                       getImage()->getCCUtil()->params()),
+      BOOL_INIT_MEMBER(load_unambig_dawg, true, "Load unambiguous word dawg.",
+                       getImage()->getCCUtil()->params()),
      BOOL_INIT_MEMBER(load_punc_dawg, true, "Load dawg with punctuation"
                       " patterns.", getImage()->getCCUtil()->params()),
      BOOL_INIT_MEMBER(load_number_dawg, true, "Load dawg with number"
@@ -48,6 +53,8 @@ Dict::Dict(Image* image_ptr)
      BOOL_INIT_MEMBER(load_fixed_length_dawgs, true, "Load fixed length dawgs"
                       " (e.g. for non-space delimited languages)",
                       getImage()->getCCUtil()->params()),
+      BOOL_INIT_MEMBER(load_bigram_dawg, false, "Load dawg with special word "
+                       "bigrams.", getImage()->getCCUtil()->params()),
      double_MEMBER(segment_penalty_dict_frequent_word, 1.0,
                    "Score multiplier for word matches which have good case and"
                    "are frequent in the given language (lower is better).",
@@ -70,6 +77,9 @@ Dict::Dict(Image* image_ptr)
                    "Score multiplier for poorly cased strings that are not in"
                    " the dictionary and generally look like garbage (lower is"
                    " better).", getImage()->getCCUtil()->params()),
+      STRING_MEMBER(output_ambig_words_file, "",
+                    "Output file for ambiguities found in the dictionary",
+                    getImage()->getCCUtil()->params()),
      INT_MEMBER(dawg_debug_level, 0, "Set to 1 for general debug info"
                 ", to 2 for more details, to 3 to see all the debug messages",
                 getImage()->getCCUtil()->params()),
@@ -104,6 +114,12 @@ Dict::Dict(Image* image_ptr)
                  "Make AcceptableChoice() always return false. Useful"
                  " when there is a need to explore all segmentations",
                  getImage()->getCCUtil()->params()),
+      double_MEMBER(stopper_ambiguity_threshold_gain, 8.0,
+                    "Gain factor for ambiguity threshold.",
+                    getImage()->getCCUtil()->params()),
+      double_MEMBER(stopper_ambiguity_threshold_offset, 1.5,
+                    "Certainty offset for ambiguity threshold.",
+                    getImage()->getCCUtil()->params()),
      BOOL_MEMBER(save_raw_choices, false, "Save all explored raw choices",
                  getImage()->getCCUtil()->params()),
      INT_MEMBER(tessedit_truncate_wordchoice_log, 10,
@@ -130,6 +146,11 @@ Dict::Dict(Image* image_ptr)
      BOOL_MEMBER(segment_segcost_rating, 0,
                  "incorporate segmentation cost in word rating?",
                  getImage()->getCCUtil()->params()),
+      BOOL_MEMBER(segment_nonalphabetic_script, false,
+                 "Don't use any alphabetic-specific tricks."
+                 "Set to true in the traineddata config file for"
+                 " scripts that are cursive or inherently fixed-pitch",
+                 getImage()->getCCUtil()->params()),
      double_MEMBER(segment_reward_script, 0.95,
                    "Score multipler for script consistency within a word. "
                    "Being a 'reward' factor, it should be <= 1. "
@@ -144,10 +165,10 @@ Dict::Dict(Image* image_ptr)
      double_MEMBER(segment_reward_chartype, 0.97,
                    "Score multipler for char type consistency within a word. ",
                    getImage()->getCCUtil()->params()),
-     double_MEMBER(segment_reward_ngram_best_choice, 0.99,
-                   "Score multipler for ngram permuter's best choice"
-                   " (only used in the Han script path).",
-                   getImage()->getCCUtil()->params()),
+      double_MEMBER(segment_reward_ngram_best_choice, 0.99,
+                    "Score multipler for ngram permuter's best choice"
+                    " (only used in the Han script path).",
+                    getImage()->getCCUtil()->params()),
      BOOL_MEMBER(save_doc_words, 0, "Save Document Words",
                  getImage()->getCCUtil()->params()),
      BOOL_MEMBER(doc_dict_enable, 1, "Enable Document Dictionary ",
@@ -182,14 +203,17 @@ Dict::Dict(Image* image_ptr)
  hyphen_unichar_id_ = INVALID_UNICHAR_ID;
  document_words_ = NULL;
  pending_words_ = NULL;
+  bigram_dawg_ = NULL;
  freq_dawg_ = NULL;
  punc_dawg_ = NULL;
  max_fixed_length_dawgs_wdlen_ = -1;
  wordseg_rating_adjust_factor_ = -1.0f;
+  output_ambig_words_file_ = NULL;
 }

 Dict::~Dict() {
  if (hyphen_word_ != NULL) delete hyphen_word_;
+  if (output_ambig_words_file_ != NULL) fclose(output_ambig_words_file_);
 }

 void Dict::Load() {
@@ -199,6 +223,10 @@ void Dict::Load() {
  if (dawgs_.length() != 0) this->End();

  hyphen_unichar_id_ = getUnicharset().unichar_to_id(kHyphenSymbol);
+
+  LoadEquivalenceList(kHyphenLikeUTF8);
+  LoadEquivalenceList(kApostropheLikeUTF8);
+
  TessdataManager &tessdata_manager =
    getImage()->getCCUtil()->tessdata_manager;

@@ -219,12 +247,26 @@ void Dict::Load() {
      new SquishedDawg(tessdata_manager.GetDataFilePtr(),
                       DAWG_TYPE_NUMBER, lang, NUMBER_PERM, dawg_debug_level);
  }
-  if (tessdata_manager.SeekToStart(TESSDATA_FREQ_DAWG)) {
+  if (load_bigram_dawg && tessdata_manager.SeekToStart(TESSDATA_BIGRAM_DAWG)) {
+    bigram_dawg_ = new SquishedDawg(tessdata_manager.GetDataFilePtr(),
+                                    DAWG_TYPE_WORD, // doesn't actually matter.
+                                    lang,
+                                    COMPOUND_PERM,  // doesn't actually matter.
+                                    dawg_debug_level);
+  }
+  if (load_freq_dawg && tessdata_manager.SeekToStart(TESSDATA_FREQ_DAWG)) {
    freq_dawg_ = new SquishedDawg(tessdata_manager.GetDataFilePtr(),
                                  DAWG_TYPE_WORD, lang, FREQ_DAWG_PERM,
                                  dawg_debug_level);
    dawgs_ += freq_dawg_;
  }
+  if (load_unambig_dawg &&
+      tessdata_manager.SeekToStart(TESSDATA_UNAMBIG_DAWG)) {
+    unambig_dawg_ = new SquishedDawg(tessdata_manager.GetDataFilePtr(),
+                                     DAWG_TYPE_WORD, lang, SYSTEM_DAWG_PERM,
+                                     dawg_debug_level);
+    dawgs_ += unambig_dawg_;
+  }

  if (((STRING &)user_words_suffix).length() > 0) {
    Trie *trie_ptr = new Trie(DAWG_TYPE_WORD, lang, USER_DAWG_PERM,
@@ -232,7 +274,8 @@ void Dict::Load() {
                              dawg_debug_level);
    name = getImage()->getCCUtil()->language_data_path_prefix;
    name += user_words_suffix;
-    if (!trie_ptr->read_word_list(name.string(), getUnicharset())) {
+    if (!trie_ptr->read_word_list(name.string(), getUnicharset(),
+                                  Trie::RRP_REVERSE_IF_HAS_RTL)) {
      tprintf("Error: failed to load %s\n", name.string());
      exit(1);
    }
@@ -295,6 +338,7 @@ void Dict::End() {
  dawgs_.delete_data_pointers();
  successors_.delete_data_pointers();
  dawgs_.clear();
+  delete bigram_dawg_;
  successors_.clear();
  document_words_ = NULL;
  max_fixed_length_dawgs_wdlen_ = -1;
@@ -304,12 +348,38 @@ void Dict::End() {
  }
 }

+// Appends one equivalence list to equivalent_symbols_, built from a
+// 0-terminated array of UTF-8 strings (think all hyphen-like symbols).
+// Strings the unicharset does not know are skipped; the first id stored
+// in the list is the one later used for dictionary matching.
+void Dict::LoadEquivalenceList(const char *unichar_strings[]) {
+  const UNICHARSET &unicharset = getUnicharset();
+  equivalent_symbols_.push_back(GenericVectorEqEq<UNICHAR_ID>());
+  GenericVectorEqEq<UNICHAR_ID> &ids = equivalent_symbols_.back();
+  for (const char **str = unichar_strings; *str != 0; ++str) {
+    const UNICHAR_ID id = unicharset.unichar_to_id(*str);
+    if (id == INVALID_UNICHAR_ID) continue;
+    ids.push_back(id);
+  }
+}
+
+// Maps a hyphen- or apostrophe-like unichar_id to the leading entry of its
+// equivalence list; an id found in no list is returned unchanged.
+UNICHAR_ID Dict::NormalizeUnicharIdForMatch(UNICHAR_ID unichar_id) const {
+  int list = 0;
+  const int num_lists = equivalent_symbols_.size();
+  while (list < num_lists && !equivalent_symbols_[list].contains(unichar_id))
+    ++list;
+  // First list holding the id wins; its element 0 is the canonical form.
+  return list < num_lists ? equivalent_symbols_[list][0] : unichar_id;
+}
+
 // Returns true if in light of the current state unichar_id is allowed
 // according to at least one of the dawgs in the dawgs_ vector.
 // See more extensive comments in dict.h where this function is declared.
 int Dict::def_letter_is_okay(void* void_dawg_args,
                             UNICHAR_ID unichar_id,
-                             bool word_end) {
+                             bool word_end) const {
  DawgArgs *dawg_args = reinterpret_cast<DawgArgs*>(void_dawg_args);

  if (dawg_debug_level >= 3) {
@@ -484,7 +554,7 @@ int Dict::def_letter_is_okay(void* void_dawg_args,
 void Dict::ProcessPatternEdges(const Dawg *dawg, const DawgInfo &info,
                               UNICHAR_ID unichar_id, bool word_end,
                               DawgArgs *dawg_args,
-                               PermuterType *curr_perm) {
+                               PermuterType *curr_perm) const {
  NODE_REF node = GetStartingNode(dawg, info.ref);
  // Try to find the edge corresponding to the exact unichar_id and to all the
  // edges corresponding to the character class of unichar_id.
@@ -572,7 +642,7 @@ void Dict::WriteFixedLengthDawgs(
 // from hyphen_active_dawgs_ instead.
 void Dict::init_active_dawgs(int sought_word_length,
                             DawgInfoVector *active_dawgs,
-                             bool ambigs_mode) {
+                             bool ambigs_mode) const {
  int i;
  if (sought_word_length != kAnyWordLength) {
    // Only search one fixed word length dawg.
@@ -604,7 +674,7 @@ void Dict::init_active_dawgs(int sought_word_length,

 // If hyphenated() returns true, copy the entries from hyphen_constraints_
 // into the given constraints vector.
-void Dict::init_constraints(DawgInfoVector *constraints) {
+void Dict::init_constraints(DawgInfoVector *constraints) const {
  if (hyphenated()) {
    *constraints = hyphen_constraints_;
    if (dawg_debug_level >= 3) {
@@ -670,7 +740,7 @@ void Dict::add_document_word(const WERD_CHOICE &best_choice) {
    strcat(filename, ".doc");
    doc_word_file = open_file (filename, "a");
    fprintf(doc_word_file, "%s\n",
-            best_choice.debug_string(getUnicharset()).string());
+            best_choice.debug_string().string());
    fclose(doc_word_file);
  }
  document_words_->add_word_to_dawg(best_choice);
@@ -693,7 +763,7 @@ void Dict::adjust_word(WERD_CHOICE *word,
  float new_rating = word->rating();
  if (debug) {
    tprintf("%sWord: %s %4.2f ", nonword ? "Non-" : "",
-            word->debug_string(getUnicharset()).string(), word->rating());
+            word->debug_string().string(), word->rating());
  }
  new_rating += kRatingPad;
  if (nonword) {  // non-dictionary word
@@ -733,9 +803,9 @@ void Dict::adjust_word(WERD_CHOICE *word,
  LogNewChoice(adjust_factor, certainty_array, false, word);
 }

-int Dict::valid_word(const WERD_CHOICE &word, bool numbers_ok) {
+int Dict::valid_word(const WERD_CHOICE &word, bool numbers_ok) const {
  const WERD_CHOICE *word_ptr = &word;
-  WERD_CHOICE temp_word;
+  WERD_CHOICE temp_word(word.unicharset());
  if (hyphenated()) {
    copy_hyphen_info(&temp_word);
    temp_word += word;
@@ -775,10 +845,40 @@ int Dict::valid_word(const WERD_CHOICE &word, bool numbers_ok) {
    dawg_args.permuter : NO_PERM;
 }

+bool Dict::valid_bigram(const WERD_CHOICE &word1,
+                        const WERD_CHOICE &word2) const {
+  if (bigram_dawg_ == NULL) return false;
+
+  // Locate the punctuation-stripped core [begin, end) of each word.
+  const WERD_CHOICE *words[2] = { &word1, &word2 };
+  int begin[2], end[2];
+  words[0]->punct_stripped(&begin[0], &end[0]);
+  words[1]->punct_stripped(&begin[1], &end[1]);
+
+  // The bigram list knows nothing about punctuation, so a word with an empty
+  // core is accepted only when short (a lone guillemet, hyphen, etc.).
+  if (begin[0] >= end[0]) return word1.length() < 3;
+  if (begin[1] >= end[1]) return word2.length() < 3;
+
+  // Build "core1 core2" with symbols normalized and each digit shown as '?'.
+  const UNICHARSET &uchset = getUnicharset();
+  STRING key;
+  for (int w = 0; w < 2; ++w) {
+    if (w == 1) key += " ";
+    for (int pos = begin[w]; pos < end[w]; ++pos) {
+      UNICHAR_ID id = NormalizeUnicharIdForMatch(words[w]->unichar_id(pos));
+      if (uchset.get_isdigit(id)) key += "?";
+      else key += uchset.id_to_unichar(id);
+    }
+  }
+  WERD_CHOICE bigram(key.string(), uchset);
+  return bigram_dawg_->word_in_dawg(bigram);
+}
+
 bool Dict::valid_punctuation(const WERD_CHOICE &word) {
  if (word.length() == 0) return NO_PERM;
  int i;
-  WERD_CHOICE new_word;
+  WERD_CHOICE new_word(word.unicharset());
  int last_index = word.length() - 1;
  int new_len = 0;
  for (i = 0; i <= last_index; ++i) {

--- a/dict/dict.h
+++ b/dict/dict.h
@@ -89,16 +89,17 @@ struct DawgArgs {

 class Dict {
 public:
-  // Gain factor for ambiguity threshold.
-  static const float kStopperAmbiguityThresholdGain;
-  // Certainty offset for ambiguity threshold.
-  static const float kStopperAmbiguityThresholdOffset;
-
  Dict(Image* image_ptr);
  ~Dict();
+  const Image* getImage() const {  // const overload returning a const Image*
+    return image_ptr_;
+  }
  Image* getImage() {
    return image_ptr_;
  }
+  const UNICHARSET& getUnicharset() const {  // const-callable overload
+    return getImage()->getCCUtil()->unicharset;
+  }
  UNICHARSET& getUnicharset() {
    return getImage()->getCCUtil()->unicharset;
  }
@@ -114,17 +115,17 @@ class Dict {
  /* hyphen.cpp ************************************************************/

  /// Returns true if we've recorded the beginning of a hyphenated word.
-  inline bool hyphenated() { return
+  inline bool hyphenated() const { return
    !last_word_on_line_ && hyphen_word_ && GetMaxFixedLengthDawgIndex() < 0;
  }
  /// Size of the base word (the part on the line before) of a hyphenated word.
-  inline int hyphen_base_size() {
+  inline int hyphen_base_size() const {
    return this->hyphenated() ? hyphen_word_->length() : 0;
  }
  /// If this word is hyphenated copy the base word (the part on
  /// the line before) of a hyphenated word into the given word.
  /// This function assumes that word is not NULL.
-  inline void copy_hyphen_info(WERD_CHOICE *word) {
+  inline void copy_hyphen_info(WERD_CHOICE *word) const {
    if (this->hyphenated()) {
      *word = *hyphen_word_;
      if (hyphen_debug_level) word->print("copy_hyphen_info: ");
@@ -133,19 +134,19 @@ class Dict {
  /// Erase the unichar ids corresponding to the portion of the word
  /// from the previous line. The word is not changed if it is not
  /// split between lines and hyphenated.
-  inline void remove_hyphen_head(WERD_CHOICE *word) {
+  inline void remove_hyphen_head(WERD_CHOICE *word) const {
    if (this->hyphenated()) {
      word->remove_unichar_ids(0, hyphen_word_->length());
      if (hyphen_debug_level) hyphen_word_->print("remove_hyphen_head: ");
    }
  }
  /// Check whether the word has a hyphen at the end.
-  inline bool has_hyphen_end(UNICHAR_ID unichar_id, bool first_pos) {
+  inline bool has_hyphen_end(UNICHAR_ID unichar_id, bool first_pos) const {
    return (last_word_on_line_ && !first_pos &&
            unichar_id == hyphen_unichar_id_);
  }
  /// Same as above, but check the unichar at the end of the word.
-  inline bool has_hyphen_end(const WERD_CHOICE &word) {
+  inline bool has_hyphen_end(const WERD_CHOICE &word) const {
    int word_index = word.length() - 1;
    return has_hyphen_end(word.unichar_id(word_index), word_index == 0);
  }
@@ -171,12 +172,14 @@ class Dict {
  /// from hyphen_active_dawgs_ instead.
  void init_active_dawgs(int sought_word_length,
                         DawgInfoVector *active_dawgs,
-                         bool ambigs_mode);
+                         bool ambigs_mode) const;
  /// If hyphenated() returns true, copy the entries from hyphen_constraints_
  /// into the given constraints vector.
-  void init_constraints(DawgInfoVector *constraints);
+  void init_constraints(DawgInfoVector *constraints) const;
  /// Returns true if we are operating in ambigs mode.
-  inline bool ambigs_mode(float rating_limit) { return rating_limit <= 0.0; }
+  inline bool ambigs_mode(float rating_limit) {
+    return rating_limit <= 0.0;  // a non-positive limit selects ambigs mode
+  }
  /// Recursively explore all the possible character combinations in
  /// the given char_choices. Use go_deeper_dawg_fxn() to explore all the
  /// dawgs in the dawgs_ vector in parallel and discard invalid words.
@@ -316,6 +319,15 @@ class Dict {
                        bool fix_replaceable,
                        BLOB_CHOICE_LIST_VECTOR *Choices,
                        bool *modified_blobs);
+  double StopperAmbigThreshold(double f1, double f2) {
+    return (f2 - f1) * stopper_ambiguity_threshold_gain -
+        stopper_ambiguity_threshold_offset;  // gain-scaled gap minus offset
+  }
+  // If the certainty of any chunk in Choice (item1) is not ambiguous with the
+  // corresponding chunk in the best choice (item2), frees Choice and
+  // returns true.
+  int FreeBadChoice(void *item1,   // VIABLE_CHOICE Choice
+                    void *item2);  // EXPANDED_CHOICE *BestChoice
  /// Replaces the corresponding wrong ngram in werd_choice with the correct
  /// one. We indicate that this newly inserted ngram unichar is composed from
  /// several fragments and modify the corresponding entries in blob_choices to
@@ -401,7 +413,7 @@ class Dict {
  /// and Certainties.
  void FillViableChoice(const WERD_CHOICE &WordChoice,
                        FLOAT32 AdjustFactor, const float Certainties[],
-                        bool SameString, VIABLE_CHOICE ViableChoice);
+                        VIABLE_CHOICE ViableChoice);
  /// Returns true if there are no alternative choices for the current word
  /// or if all alternatives have an adjust factor worse than Threshold.
  bool AlternativeChoicesWorseThan(FLOAT32 Threshold);
@@ -467,6 +479,15 @@ class Dict {
      document_words_->clear();
  }

+  // Create unicharset adaptations of known, short lists of UTF-8 equivalent
+  // characters (think all hyphen-like symbols).  The first version of the
+  // list is taken as equivalent for matching against the dictionary.
+  void LoadEquivalenceList(const char *unichar_strings[]);
+
+  // Normalize all hyphen and apostrophes to the canonicalized one for
+  // matching; pass everything else through as is.  See LoadEquivalenceList().
+  UNICHAR_ID NormalizeUnicharIdForMatch(UNICHAR_ID unichar_id) const;
+
  /**
   * Returns the maximal permuter code (from ccstruct/ratngs.h) if in light
   * of the current state the letter at word_index in the given word
@@ -531,13 +552,13 @@ class Dict {

  //
  int def_letter_is_okay(void* void_dawg_args,
-                         UNICHAR_ID unichar_id, bool word_end);
+                         UNICHAR_ID unichar_id, bool word_end) const;

  int (Dict::*letter_is_okay_)(void* void_dawg_args,
-                               UNICHAR_ID unichar_id, bool word_end);
+                               UNICHAR_ID unichar_id, bool word_end) const;
  /// Calls letter_is_okay_ member function.
  int LetterIsOkay(void* void_dawg_args,
-                   UNICHAR_ID unichar_id, bool word_end) {
+                   UNICHAR_ID unichar_id, bool word_end) const {
    return (this->*letter_is_okay_)(void_dawg_args, unichar_id, word_end);
  }

@@ -581,6 +602,8 @@ class Dict {
  inline const Dawg *GetDawg(int index) const { return dawgs_[index]; }
  /// Return the points to the punctuation dawg.
  inline const Dawg *GetPuncDawg() const { return punc_dawg_; }
+  /// Return the unambiguous words dawg. TODO(review): NULL-init in ctor?
+  inline const Dawg *GetUnambigDawg() const { return unambig_dawg_; }
  /// Return the pointer to the Dawg that contains words of length word_length.
  inline const Dawg *GetFixedLengthDawg(int word_length) const {
    if (word_length > max_fixed_length_dawgs_wdlen_) return NULL;
@@ -603,7 +626,7 @@ class Dict {
  /// leading punctuation is found this would ensure that we are not
  /// expecting any particular trailing punctuation after the word).
  inline bool ConstraintsOk(const DawgInfoVector &constraints,
-                            int word_end, DawgType current_dawg_type) {
+                            int word_end, DawgType current_dawg_type) const {
    if (!word_end) return true;
    if (current_dawg_type == DAWG_TYPE_PUNCTUATION) return true;
    for (int c = 0; c < constraints.length(); ++c) {
@@ -627,7 +650,8 @@ class Dict {
  /// edges were found.
  void ProcessPatternEdges(const Dawg *dawg, const DawgInfo &info,
                           UNICHAR_ID unichar_id, bool word_end,
-                           DawgArgs *dawg_args, PermuterType *current_permuter);
+                           DawgArgs *dawg_args,
+                           PermuterType *current_permuter) const;

  /// Read/Write/Access special purpose dawgs which contain words
  /// only of a certain length (used for phrase search for
@@ -649,23 +673,25 @@ class Dict {
      int num_dawgs, int debug_level, FILE *output_file);

  /// Check all the DAWGs to see if this word is in any of them.
-  inline bool valid_word_permuter(uinT8 perm, bool numbers_ok) {
+  inline static bool valid_word_permuter(uinT8 perm, bool numbers_ok) {
    return (perm == SYSTEM_DAWG_PERM || perm == FREQ_DAWG_PERM ||
            perm == DOC_DAWG_PERM || perm == USER_DAWG_PERM ||
            perm == USER_PATTERN_PERM || (numbers_ok && perm == NUMBER_PERM));
  }
-  int valid_word(const WERD_CHOICE &word, bool numbers_ok);
-  int valid_word(const WERD_CHOICE &word) {
+  int valid_word(const WERD_CHOICE &word, bool numbers_ok) const;
+  int valid_word(const WERD_CHOICE &word) const {
    return valid_word(word, false);  // return NO_PERM for words with digits
  }
-  int valid_word_or_number(const WERD_CHOICE &word) {
+  int valid_word_or_number(const WERD_CHOICE &word) const {
    return valid_word(word, true);  // return NUMBER_PERM for valid numbers
  }
  /// This function is used by api/tesseract_cube_combiner.cpp
-  int valid_word(const char *string) {
+  int valid_word(const char *string) const {
    WERD_CHOICE word(string, getUnicharset());
    return valid_word(word);
  }
+  // Do the two WERD_CHOICEs form a meaningful bigram?
+  bool valid_bigram(const WERD_CHOICE &word1, const WERD_CHOICE &word2) const;
  /// Returns true if the word contains a valid punctuation pattern.
  /// Note: Since the domains of punctuation symbols and symblos
  /// used in numbers are not disjoint, a valid number might contain
@@ -691,6 +717,8 @@ class Dict {
  inline void SetWordsegRatingAdjustFactor(float f) {
    wordseg_rating_adjust_factor_ = f;
  }
+  // Returns a const reference to best_choices_.
+  const LIST &getBestChoices() { return best_choices_; }

 private:
  /** Private member variables. */
@@ -723,15 +751,27 @@ class Dict {
  DawgInfoVector hyphen_active_dawgs_;
  DawgInfoVector hyphen_constraints_;
  bool last_word_on_line_;
+  // List of lists of "equivalent" UNICHAR_IDs for the purposes of dictionary
+  // matching.  The first member of each list is taken as canonical.  For
+  // example, the first list contains hyphens and dashes with the first symbol
+  // being the ASCII hyphen minus.
+  GenericVector<GenericVectorEqEq<UNICHAR_ID> > equivalent_symbols_;
  // Dawgs.
  DawgVector dawgs_;
  SuccessorListsVector successors_;
  Trie *pending_words_;
+  // bigram_dawg_ points to a dawg of two-word bigrams which always supersede if
+  // any of them are present on the best choices list for a word pair.
+  // The bigrams are stored as space-separated words where:
+  // (1) leading and trailing punctuation has been removed from each word and
+  // (2) any digits have been replaced with '?' marks.
+  Dawg *bigram_dawg_;
  /// The following pointers are only cached for convenience.
  /// The dawgs will be deleted when dawgs_ vector is destroyed.
  // TODO(daria): need to support multiple languages in the future,
  // so maybe will need to maintain a list of dawgs of each kind.
  Dawg *freq_dawg_;
+  Dawg *unambig_dawg_;
  Dawg *punc_dawg_;
  Trie *document_words_;
  /// Maximum word length of fixed-length word dawgs.
@@ -740,6 +780,8 @@ class Dict {
  /// Current segmentation cost adjust factor for word rating.
  /// See comments in incorporate_segcost.
  float wordseg_rating_adjust_factor_;
+  // File for recording ambiguities discovered during dictionary search.
+  FILE *output_ambig_words_file_;

 public:
  /// Variable members.
@@ -750,11 +792,14 @@ class Dict {
               "A list of user-provided patterns.");
  BOOL_VAR_H(load_system_dawg, true, "Load system word dawg.");
  BOOL_VAR_H(load_freq_dawg, true, "Load frequent word dawg.");
+  BOOL_VAR_H(load_unambig_dawg, true, "Load unambiguous word dawg.");
  BOOL_VAR_H(load_punc_dawg, true,
             "Load dawg with punctuation patterns.");
  BOOL_VAR_H(load_number_dawg, true, "Load dawg with number patterns.");
  BOOL_VAR_H(load_fixed_length_dawgs, true,  "Load fixed length"
             " dawgs (e.g. for non-space delimited languages)");
+  BOOL_VAR_H(load_bigram_dawg, false,
+             "Load dawg with special word bigrams.");
  double_VAR_H(segment_penalty_dict_frequent_word, 1.0,
               "Score multiplier for word matches which have good case and"
               "are frequent in the given language (lower is better).");
@@ -779,6 +824,8 @@ class Dict {
               "Score multiplier for poorly cased strings that are not in"
               " the dictionary and generally look like garbage (lower is"
               " better).");
+  STRING_VAR_H(output_ambig_words_file, "",
+               "Output file for ambiguities found in the dictionary");
  INT_VAR_H(dawg_debug_level, 0, "Set to 1 for general debug info"
            ", to 2 for more details, to 3 to see all the debug messages");
  INT_VAR_H(hyphen_debug_level, 0, "Debug level for hyphenated words.");
@@ -801,6 +848,10 @@ class Dict {
  BOOL_VAR_H(stopper_no_acceptable_choices, false,
             "Make AcceptableChoice() always return false. Useful"
             " when there is a need to explore all segmentations");
+  double_VAR_H(stopper_ambiguity_threshold_gain, 8.0,
+               "Gain factor for ambiguity threshold.");
+  double_VAR_H(stopper_ambiguity_threshold_offset, 1.5,
+               "Certainty offset for ambiguity threshold.");
  BOOL_VAR_H(save_raw_choices, false, "Save all explored raw choices");
  INT_VAR_H(tessedit_truncate_wordchoice_log, 10, "Max words to keep in list");
  STRING_VAR_H(word_to_debug, "", "Word for which stopper debug information"
@@ -816,6 +867,10 @@ class Dict {
             "Turn on word script consistency permuter");
  BOOL_VAR_H(segment_segcost_rating, 0,
             "incorporate segmentation cost in word rating?");
+  BOOL_VAR_H(segment_nonalphabetic_script, false,
+             "Don't use any alphabetic-specific tricks. "
+             "Set to true in the traineddata config file for"
+             " scripts that are cursive or inherently fixed-pitch");
  double_VAR_H(segment_reward_script, 0.95,
               "Score multipler for script consistency within a word. "
               "Being a 'reward' factor, it should be <= 1. "

--- a/dict/hyphen.cpp
+++ b/dict/hyphen.cpp
@@ -51,7 +51,7 @@ void Dict::set_hyphen_word(const WERD_CHOICE &word,
                           const DawgInfoVector &active_dawgs,
                           const DawgInfoVector &constraints) {
  if (hyphen_word_ == NULL) {
-    hyphen_word_ = new WERD_CHOICE();
+    hyphen_word_ = new WERD_CHOICE(word.unicharset());
    hyphen_word_->make_bad();
  }
  if (hyphen_word_->rating() > word.rating()) {

--- a/dict/matchdefs.h
+++ b/dict/matchdefs.h
@@ -28,7 +28,7 @@
 /* define the maximum number of classes defined for any matcher
  and the maximum class id for any matcher. This must be changed
  if more different classes need to be classified */
-#define MAX_NUM_CLASSES   8192
+#define MAX_NUM_CLASSES   12288
 #define MAX_CLASS_ID    (MAX_NUM_CLASSES - 1)

 /** a CLASS_ID is the ascii character to be associated with a class */

--- a/dict/permdawg.cpp
+++ b/dict/permdawg.cpp
@@ -86,7 +86,7 @@ void Dict::go_deeper_dawg_fxn(
        if (permute_debug && dawg_debug_level) {
          tprintf("early pruned word rating=%4.2f,"
                  " permdawg_limit=%4.2f, word=%s\n", word->rating(),
-                  permdawg_limit, word->debug_string(getUnicharset()).string());
+                  permdawg_limit, word->debug_string().string());
        }
        return;
      }
@@ -106,8 +106,7 @@ void Dict::go_deeper_dawg_fxn(
    }
    if (clean_active_dawgs.size() > 0) {
      if (permute_debug && dawg_debug_level)
-        tprintf("new hyphen choice = %s\n",
-                word->debug_string(getUnicharset()).string());
+        tprintf("new hyphen choice = %s\n", word->debug_string().string());
      word->set_permuter(more_args->permuter);
      adjust_word(word, certainties, permute_debug);
      set_hyphen_word(*word, *(more_args->active_dawgs),
@@ -190,11 +189,26 @@ void Dict::go_deeper_dawg_fxn(
      // Add a new word choice
      if (word_ending) {
        if (permute_debug && dawg_debug_level) {
-          tprintf("found word = %s\n",
-                  word->debug_string(getUnicharset()).string());
+          tprintf("found word = %s\n", word->debug_string().string());
+        }
+        if (ambigs_mode(*limit) &&
+            strcmp(output_ambig_words_file.string(), "") != 0) {
+          if (output_ambig_words_file_ == NULL) {
+            output_ambig_words_file_ =
+                fopen(output_ambig_words_file.string(), "w+");
+            if (output_ambig_words_file_ == NULL) {
+              tprintf("Failed to open output_ambig_words_file %s\n",
+                      output_ambig_words_file.string());
+              exit(1);
+            }
+          }
+          // "%s" keeps any '%' in the word from being parsed as a format spec.
+          STRING word_str;
+          word->string_and_lengths(&word_str, NULL);
+          fprintf(output_ambig_words_file_, "%s ", word_str.string());
        }
        WERD_CHOICE *adjusted_word = word;
-        WERD_CHOICE hyphen_tail_word;
+        WERD_CHOICE hyphen_tail_word(&getUnicharset());
        if (hyphen_base_size() > 0) {
          hyphen_tail_word = *word;
          remove_hyphen_head(&hyphen_tail_word);
@@ -226,7 +240,7 @@ void Dict::go_deeper_dawg_fxn(
    } else {
      if (permute_debug && dawg_debug_level) {
        tprintf("last unichar not OK at index %d in %s\n",
-                word_index, word->debug_string(getUnicharset()).string());
+                word_index, word->debug_string().string());
      }
    }
  }
@@ -249,7 +263,7 @@ void Dict::go_deeper_dawg_fxn(
 WERD_CHOICE *Dict::dawg_permute_and_select(
    const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit,
    int sought_word_length, int start_char_choice_index) {
-  WERD_CHOICE *best_choice = new WERD_CHOICE();
+  WERD_CHOICE *best_choice = new WERD_CHOICE(&getUnicharset());
  best_choice->make_bad();
  best_choice->set_rating(rating_limit);
  if (char_choices.length() == 0) return best_choice;
@@ -272,7 +286,7 @@ WERD_CHOICE *Dict::dawg_permute_and_select(
                     (segment_penalty_dict_case_bad /
                      segment_penalty_dict_case_ok),
                     NO_PERM, sought_word_length, end_char_choice_index);
-  WERD_CHOICE word(MAX_WERD_LENGTH);
+  WERD_CHOICE word(&getUnicharset(), MAX_WERD_LENGTH);
  copy_hyphen_info(&word);
  // Discard rating and certainty of the hyphen base (if any).
  word.set_rating(0.0);

--- a/dict/permute.cpp
+++ b/dict/permute.cpp
@@ -126,12 +126,13 @@ int find_choice_by_uid(BLOB_CHOICE_LIST *blob_list, UNICHAR_ID target_uid) {
 * 1st choice of char 3, 2nd choice of char 4, 3rd choice of char 5, 2nd choice
 * of char 6.  If n > number of choice, the closest (last) one is used.
 */
-WERD_CHOICE* get_choice_from_posstr(const BLOB_CHOICE_LIST_VECTOR &char_choices,
+WERD_CHOICE* get_choice_from_posstr(const UNICHARSET *unicharset,
+                                    const BLOB_CHOICE_LIST_VECTOR &char_choices,
                                    int start_pos,
                                    const char* pos_str,
                                    float *certainties) {
  int pos_str_len = strlen(pos_str);
-  WERD_CHOICE* wchoice = new WERD_CHOICE();
+  WERD_CHOICE* wchoice = new WERD_CHOICE(unicharset);
  if (start_pos + pos_str_len > char_choices.length()) {
    wchoice->make_bad();
    return wchoice;
@@ -228,6 +229,7 @@ BLOB_CHOICE* find_choice_by_script(


 PermuterState::PermuterState() {
+  unicharset_ = NULL;
  char_choices_ = NULL;
  adjust_factor_ = 1.0f;
  allow_collision_ = false;
@@ -240,6 +242,7 @@ void PermuterState::Init(const BLOB_CHOICE_LIST_VECTOR& char_choices,
                         float default_bias,
                         bool debug) {
  ASSERT_HOST(char_choices.length() < MAX_PERM_LENGTH);
+  unicharset_ = &unicharset;
  char_choices_ = &char_choices;
  word_length_ = char_choices.length();
  for (int i = 0; i < word_length_; ++i)
@@ -300,9 +303,8 @@ void PermuterState::AddPreference(int char_pos, BLOB_CHOICE* blob_choice,
 WERD_CHOICE* PermuterState::GetPermutedWord(float *certainties,
                                            float *adjust_factor) {
  ASSERT_HOST(char_choices_ != NULL);
-  WERD_CHOICE *word_choice = get_choice_from_posstr(*char_choices_,
-                                                    0, perm_state_,
-                                                    certainties);
+  WERD_CHOICE *word_choice = get_choice_from_posstr(
+      unicharset_, *char_choices_, 0, perm_state_, certainties);
  float rating = word_choice->rating() * adjust_factor_;
  word_choice->set_rating(rating);
  *adjust_factor = adjust_factor_;
@@ -431,7 +433,8 @@ WERD_CHOICE* Dict::permute_fixed_length_words(
  if (permute_debug)
    print_char_choices_list("\n\nPermute FixedLength Word",
                            char_choices, getUnicharset(), false);
-  WERD_CHOICE* best_choice = new WERD_CHOICE(char_choices.length());
+  WERD_CHOICE* best_choice =
+      new WERD_CHOICE(&getUnicharset(), char_choices.length());
  const int max_dict_len = max_fixed_length_dawgs_wdlen_;
  const int min_dict_len = 2;
  char posstr[256];
@@ -461,7 +464,7 @@ WERD_CHOICE* Dict::permute_fixed_length_words(
     }

     if (part_choice && step > 1) {   // found lexicon match
-       part_choice->populate_unichars(getUnicharset());
+       part_choice->populate_unichars();
       get_posstr_from_choice(char_choices, part_choice, anchor_pos, posstr);
       float adjust_factor = pow(0.95, 1.0 + step*2.0/char_choices.length());
       if (permuter_state)
@@ -472,8 +475,8 @@ WERD_CHOICE* Dict::permute_fixed_length_words(
                 part_choice->unichar_string().string());
     } else {     // no lexicon match
       step = 1;
-       part_choice =
-         get_choice_from_posstr(char_choices, anchor_pos, "0", NULL);
+       part_choice = get_choice_from_posstr(&getUnicharset(), char_choices,
+                                            anchor_pos, "0", NULL);
       if (permute_debug)
         tprintf("Single char %d %s\n", anchor_pos,
                 part_choice->unichar_string().string());
@@ -493,7 +496,7 @@ WERD_CHOICE* Dict::permute_fixed_length_words(
              best_choice->rating(), match_score, adjusted_score);
    best_choice->set_rating(adjusted_score);
  }
-  best_choice->populate_unichars(getUnicharset());
+  best_choice->populate_unichars();
  if (permute_debug)
    tprintf("Found Best CJK word %f: %s\n",
            best_choice->rating(), best_choice->unichar_string().string());
@@ -554,11 +557,12 @@ WERD_CHOICE* Dict::permute_chartype_words(
    print_char_choices_list("", char_choices, getUnicharset(), true);
  }

-  WERD_CHOICE *current_word = new WERD_CHOICE();
+  WERD_CHOICE *current_word = new WERD_CHOICE(&getUnicharset());
  BLOB_CHOICE_IT blob_choice_it;
  const UNICHARSET& unicharset = getUnicharset();
  bool replaced = false;        // has any character choice been replaced
  int prev_unambig_type = 0;    // the last chartype of an unambiguous char
+  float certainties[MAX_PERM_LENGTH + 1];
  for (int x = 0; x < char_choices.length(); ++x) {
    BLOB_CHOICE_LIST* pos_choice = char_choices.get(x);
    UNICHAR_ID unichar_id = get_top_choice_uid(pos_choice);
@@ -640,12 +644,12 @@ WERD_CHOICE* Dict::permute_chartype_words(
    current_word->append_unichar_id(first_choice->unichar_id(), 1,
                                    first_choice->rating(),
                                    first_choice->certainty());
+    certainties[x] = first_choice->certainty();
  }
  // All permuter choices should go through adjust_non_word so the choice
  // rating would be adjusted on the same scale.
-  float certainties[MAX_PERM_LENGTH + 1];
  adjust_non_word(current_word, certainties, permute_debug);
-  current_word->populate_unichars(unicharset);
+  current_word->populate_unichars();
  if (replaced) {
    // Apply a reward multiplier on rating if an chartype permutation is made.
    float rating = current_word->rating();
@@ -682,10 +686,11 @@ WERD_CHOICE* Dict::permute_script_words(
                            permute_debug > 1);
  }

-  WERD_CHOICE *current_word = new WERD_CHOICE();
+  WERD_CHOICE *current_word = new WERD_CHOICE(&getUnicharset());
  BLOB_CHOICE_IT blob_choice_it;
  bool replaced = false;
  bool prev_is_consistent = false;
+  float certainties[MAX_PERM_LENGTH + 1];
  for (int x = 0; x < char_choices.length(); ++x) {
    blob_choice_it.set_to_list(char_choices.get(x));
    BLOB_CHOICE *first_choice = blob_choice_it.data();
@@ -737,13 +742,13 @@ WERD_CHOICE* Dict::permute_script_words(
    current_word->append_unichar_id(first_choice->unichar_id(), 1,
                                    first_choice->rating(),
                                    first_choice->certainty());
+    certainties[x] = first_choice->certainty();
    prev_is_consistent = sid_consistent;
  }
  // All permuter choices should go through adjust_non_word so the choice
  // rating would be adjusted on the same scale.
-  float certainties[MAX_PERM_LENGTH + 1];
  adjust_non_word(current_word, certainties, permute_debug);
-  current_word->populate_unichars(getUnicharset());
+  current_word->populate_unichars();
  if (replaced) {
    // Apply a reward multiplier on rating if an script permutation is made.
    float rating = current_word->rating();
@@ -780,19 +785,19 @@ bool Dict::permute_characters(const BLOB_CHOICE_LIST_VECTOR &char_choices,
    // Populate unichars_ and unichar_lengths_ of raw_choice. This is
    // needed for various components that still work with unichars rather
    // than unichar ids (e.g. LearnWord).
-    raw_choice->populate_unichars(getUnicharset());
+    raw_choice->populate_unichars();
  }
  if (this_choice && this_choice->rating() < best_choice->rating()) {
    *best_choice = *this_choice;
    // Populate unichars_ and unichar_lengths_ of best_choice. This is
    // needed for various components that still work with unichars rather
    // than unichar ids (dawg, *_ok functions, various hard-coded hacks).
-    best_choice->populate_unichars(getUnicharset());
+    best_choice->populate_unichars();

    if (permute_debug) {
      best_choice->print("\n**** Populate BestChoice");
      cprintf("populate best_choice\n\t%s\n",
-              best_choice->debug_string(getUnicharset()).string());
+              best_choice->debug_string().string());
    }
    delete this_choice;
    return true;
@@ -811,13 +816,13 @@ WERD_CHOICE *Dict::permute_compound_words(
    float rating_limit) {
  BLOB_CHOICE *first_choice;
  WERD_CHOICE *best_choice = NULL;
-  WERD_CHOICE current_word(MAX_WERD_LENGTH);
+  WERD_CHOICE current_word(&getUnicharset(), MAX_WERD_LENGTH);
  int first_index = 0;
  int x;
  BLOB_CHOICE_IT blob_choice_it;

  if (char_choices.length() > MAX_WERD_LENGTH) {
-    WERD_CHOICE *bad_word_choice = new WERD_CHOICE();
+    WERD_CHOICE *bad_word_choice = new WERD_CHOICE(&getUnicharset());
    bad_word_choice->make_bad();
    return bad_word_choice;
  }
@@ -874,7 +879,7 @@ void Dict::permute_subword(const BLOB_CHOICE_LIST_VECTOR &char_choices,
  int x;
  BLOB_CHOICE_LIST_VECTOR subchoices;
  WERD_CHOICE *best_choice = NULL;
-  WERD_CHOICE raw_choice;
+  WERD_CHOICE raw_choice(&getUnicharset());
  raw_choice.make_bad();

  DisableChoiceAccum();
@@ -886,7 +891,7 @@ void Dict::permute_subword(const BLOB_CHOICE_LIST_VECTOR &char_choices,
  }

  if (!subchoices.empty()) {
-    WERD_CHOICE initial_choice;
+    WERD_CHOICE initial_choice(&getUnicharset());
    initial_choice.make_bad();
    initial_choice.set_rating(rating_limit);

@@ -906,10 +911,10 @@ void Dict::permute_subword(const BLOB_CHOICE_LIST_VECTOR &char_choices,

  if (segment_debug && current_word->rating() < MAX_FLOAT32) {
    cprintf ("Subword permuted = %s, %5.2f, %5.2f\n\n",
-             current_word->debug_string(getUnicharset()).string(),
+             current_word->debug_string().string(),
             current_word->rating(), current_word->certainty());
  }
-  current_word->populate_unichars(getUnicharset());
+  current_word->populate_unichars();

  EnableChoiceAccum();
 }
@@ -919,7 +924,7 @@ void Dict::permute_subword(const BLOB_CHOICE_LIST_VECTOR &char_choices,
 */
 WERD_CHOICE *Dict::get_top_choice_word(
    const BLOB_CHOICE_LIST_VECTOR &char_choices) {
-  WERD_CHOICE *top_word = new WERD_CHOICE(MAX_PERM_LENGTH);
+  WERD_CHOICE *top_word = new WERD_CHOICE(&getUnicharset(), MAX_PERM_LENGTH);
  float certainties[MAX_PERM_LENGTH];
  top_word->set_permuter(TOP_CHOICE_PERM);
  for (int x = 0; x < char_choices.length(); x++) {
@@ -956,11 +961,11 @@ WERD_CHOICE *Dict::permute_top_choice(
  const char *next_char = "";         //next in word
  const char *next_next_char = "";    //after next next in word

-  WERD_CHOICE word(MAX_PERM_LENGTH);
+  WERD_CHOICE word(&getUnicharset(), MAX_PERM_LENGTH);
  word.set_permuter(TOP_CHOICE_PERM);
-  WERD_CHOICE capital_word(MAX_PERM_LENGTH);
+  WERD_CHOICE capital_word(&getUnicharset(), MAX_PERM_LENGTH);
  capital_word.set_permuter(UPPER_CASE_PERM);
-  WERD_CHOICE lower_word(MAX_PERM_LENGTH);
+  WERD_CHOICE lower_word(&getUnicharset(), MAX_PERM_LENGTH);
  lower_word.set_permuter(LOWER_CASE_PERM);

  int x;
@@ -1023,7 +1028,7 @@ WERD_CHOICE *Dict::permute_top_choice(
    if (first_choice == NULL) {
      cprintf("Permuter found only fragments for"
              " character at position %d; word=%s\n",
-              x, word.debug_string(getUnicharset()).string());
+              x, word.debug_string().string());
    }
    ASSERT_HOST(first_choice != NULL);

@@ -1132,7 +1137,7 @@ WERD_CHOICE *Dict::permute_top_choice(
    }
  }

-  if (word.rating() < raw_choice->rating()) {
+  if (raw_choice != NULL && word.rating() < raw_choice->rating()) {
    *raw_choice = word;
    LogNewChoice(1.0, certainties, true, raw_choice);
  }
@@ -1423,9 +1428,9 @@ WERD_CHOICE *Dict::top_fragments_permute_and_select(
    frag_char_choices += frag_choices;
  }

-  WERD_CHOICE *best_choice = new WERD_CHOICE();
+  WERD_CHOICE *best_choice = new WERD_CHOICE(&getUnicharset());
  best_choice->make_bad();
-  WERD_CHOICE word(MAX_PERM_LENGTH);
+  WERD_CHOICE word(&getUnicharset(), MAX_PERM_LENGTH);
  word.set_permuter(TOP_CHOICE_PERM);
  float certainties[MAX_PERM_LENGTH];
  this->go_deeper_fxn_ = &tesseract::Dict::go_deeper_top_fragments_fxn;
@@ -1459,7 +1464,7 @@ void Dict::permute_choices(
    tprintf("%s permute_choices: char_choice_index=%d"
            " limit=%g rating=%g, certainty=%g word=%s\n",
            debug, char_choice_index, *limit, word->rating(),
-            word->certainty(), word->debug_string(getUnicharset()).string());
+            word->certainty(), word->debug_string().string());
  }
  if (char_choice_index < char_choices.length()) {
    BLOB_CHOICE_IT blob_choice_it;
@@ -1554,7 +1559,7 @@ void Dict::go_deeper_top_fragments_fxn(
    if (word_ending) {
      if (fragments_debug > 1) {
        tprintf("fragments_debug new choice = %s\n",
-                word->debug_string(getUnicharset()).string());
+                word->debug_string().string());
      }
      *limit = word->rating();
      adjust_non_word(word, certainties, permute_debug);
@@ -1567,8 +1572,7 @@ void Dict::go_deeper_top_fragments_fxn(
  } else {
    if (fragments_debug > 1) {
      tprintf("fragments_debug pruned word (%s, rating=%4.2f, limit=%4.2f)\n",
-              word->debug_string(getUnicharset()).string(),
-              word->rating(), *limit);
+              word->debug_string().string(), word->rating(), *limit);
    }
  }
 }

--- a/dict/permute.h
+++ b/dict/permute.h
@@ -133,6 +133,8 @@ class PermuterState {
 private:
  static const char kPosFree = '.';

+  const UNICHARSET *unicharset_;  // NULL until Init() stores &unicharset.
+
  const BLOB_CHOICE_LIST_VECTOR *char_choices_;   // reference pointer only
                            // does not need to be allocated or freed
  char perm_state_[MAX_PERM_LENGTH];   // handles upto MAX_PERM_LENGTH-1 states

--- a/dict/states.cpp
+++ b/dict/states.cpp
@@ -241,6 +241,19 @@ void print_state(const char *label, STATE *state, int num_joints) {
  new_line();
 }

+// Appends to toappend the number of fragments in each segment of state;
+// every value after the first is passed to add_str_int with a " " prefix.
+void print_state(STATE *state, int num_joints, STRING *toappend) {
+  PIECES_STATE segment_sizes;
+  bin_to_pieces(state, num_joints, segment_sizes);
+  int seg = 0;
+  // Stops at the first entry of segment_sizes that is not positive.
+  while (segment_sizes[seg] > 0) {
+    const char *prefix = (seg > 0) ? " " : "";
+    toappend->add_str_int(prefix, segment_sizes[seg]);
+    ++seg;
+  }
+}

 /**
 * set_n_ones

--- a/dict/states.h
+++ b/dict/states.h
@@ -29,6 +29,7 @@
              I n c l u d e s
 ----------------------------------------------------------------------*/
 #include "host.h"
+#include "strngs.h"

 /*----------------------------------------------------------------------
              T y p e s
@@ -64,6 +65,8 @@ int ones_in_state(STATE *state, int num_joints);

 void print_state(const char *label, STATE *state, int num_joints);

+void print_state(STATE *state, int num_joints, STRING *toappend);
+
 void set_n_ones(STATE *state, int n);

 extern void free_state(STATE *);

--- a/dict/stopper.cpp
+++ b/dict/stopper.cpp
@@ -17,13 +17,11 @@
 ******************************************************************************/

 #include "stopper.h"
-#include "emalloc.h"
 #include "matchdefs.h"
 #include "callcpp.h"
 #include "permute.h"
 #include "danerror.h"
 #include "const.h"
-#include "freelist.h"
 #include "efio.h"
 #include "scanutils.h"
 #include "unichar.h"
@@ -58,6 +56,10 @@ typedef struct
  UNICHAR_ID ChunkClass[MAX_NUM_CHUNKS];
 } EXPANDED_CHOICE;

+void DeleteViableChoiceStruct(void *vcs) {  // void* deleter for destroy_nodes.
+  delete static_cast<VIABLE_CHOICE_STRUCT *>(vcs);
+}
+
 #define BestCertainty(Choices) \
  (((VIABLE_CHOICE) first_node (Choices))->Certainty)

@@ -66,10 +68,6 @@ typedef struct
 #define BestFactor(Choices) \
  (((VIABLE_CHOICE) first_node (Choices))->AdjustFactor)

-#define AmbigThreshold(F1,F2) \
-  (((F2) - (F1)) * tesseract::Dict::kStopperAmbiguityThresholdGain - \
-   tesseract::Dict::kStopperAmbiguityThresholdOffset)
-
 /**----------------------------------------------------------------------------
              Private Code
 ----------------------------------------------------------------------------**/
@@ -100,23 +98,72 @@ static void ExpandChoice(VIABLE_CHOICE Choice,
  }
 }

+// Allocates length entries for each of Blob and segmentation_state.
+VIABLE_CHOICE_STRUCT::VIABLE_CHOICE_STRUCT(int length) : Length(length) {
+  this->Blob = new CHAR_CHOICE[length];
+  this->segmentation_state = new uinT8[length];
+}
+
+VIABLE_CHOICE_STRUCT::VIABLE_CHOICE_STRUCT() : Length(0) {  // No storage yet.
+  this->Blob = NULL;
+  this->segmentation_state = NULL;
+}
+
+VIABLE_CHOICE_STRUCT::~VIABLE_CHOICE_STRUCT() {  // Releases both arrays.
+  delete[] this->Blob;
+  delete[] this->segmentation_state;
+}
+
+// Fills in this choice from word_choice. Each character's NumChunks is its
+// first pieces_state entry plus one more entry per additional fragment.
+void VIABLE_CHOICE_STRUCT::Init(const WERD_CHOICE &word_choice,
+                                const PIECES_STATE &pieces_state,
+                                const float certainties[],
+                                FLOAT32 adjust_factor) {
+  Rating = word_choice.rating();
+  Certainty = word_choice.certainty();
+  AdjustFactor = adjust_factor;
+  ComposedFromCharFragments = false;
+  ASSERT_HOST(Length == word_choice.length());
+  int piece = 0;  // Next unread index into pieces_state.
+  for (int ch = 0; ch < word_choice.length(); ++ch) {
+    CHAR_CHOICE &entry = Blob[ch];
+    entry.Class = word_choice.unichar_id(ch);
+    entry.NumChunks = pieces_state[piece++];
+    entry.Certainty = certainties[ch];
+    for (int frag = 1; frag < word_choice.fragment_length(ch); ++frag) {
+      int extra_width = pieces_state[piece++];
+      assert(extra_width > 0);
+      entry.NumChunks += extra_width;
+      ComposedFromCharFragments = true;
+    }
+    segmentation_state[ch] = entry.NumChunks;
+  }
+}
+
+
+namespace tesseract {
+
 // If the certainty of any chunk in Choice (item1) is not ambiguous with the
 // corresponding chunk in the best choice (item2), frees Choice and
 // returns true.
-static int FreeBadChoice(void *item1,    // VIABLE_CHOICE Choice,
-                         void *item2) {  // EXPANDED_CHOICE *BestChoice
+int Dict::FreeBadChoice(
+    void *item1,    // VIABLE_CHOICE Choice,
+    void *item2) {  // EXPANDED_CHOICE *BestChoice
  int i, j, Chunk;
  FLOAT32 Threshold;
  VIABLE_CHOICE Choice = reinterpret_cast<VIABLE_CHOICE>(item1);
  EXPANDED_CHOICE *BestChoice = reinterpret_cast<EXPANDED_CHOICE *>(item2);
-  Threshold = AmbigThreshold(BestChoice->Choice->AdjustFactor,
-                             Choice->AdjustFactor);
+  Threshold = StopperAmbigThreshold(BestChoice->Choice->AdjustFactor,
+                                    Choice->AdjustFactor);
  for (i = 0, Chunk = 0; i < Choice->Length; i++) {
-    for (j = 0; j < Choice->Blob[i].NumChunks; j++, Chunk++){
+    for (j = 0; j < Choice->Blob[i].NumChunks; j++, Chunk++) {
      if (Choice->Blob[i].Class != BestChoice->ChunkClass[Chunk] &&
          Choice->Blob[i].Certainty - BestChoice->ChunkCertainty[Chunk] <
          Threshold) {
-        memfree(Choice);
+        if (stopper_debug_level >= 2)
+          PrintViableChoice(stderr, "\nDiscarding bad choice:  ", Choice);
+        delete Choice;
        return true;
      }
    }
@@ -124,11 +171,6 @@ static int FreeBadChoice(void *item1,    // VIABLE_CHOICE Choice,
  return false;
 }

-namespace tesseract {
-
-const float Dict::kStopperAmbiguityThresholdGain = 8.0;
-const float Dict::kStopperAmbiguityThresholdOffset = 1.5;
-
 bool Dict::AcceptableChoice(BLOB_CHOICE_LIST_VECTOR *Choices,
                            WERD_CHOICE *BestChoice,
                            DANGERR *fixpt,
@@ -158,7 +200,7 @@ bool Dict::AcceptableChoice(BLOB_CHOICE_LIST_VECTOR *Choices,

  if (stopper_debug_level >= 1)
    tprintf("\nStopper:  %s (word=%c, case=%c)\n",
-            BestChoice->debug_string(getUnicharset()).string(),
+            BestChoice->debug_string().string(),
            (is_valid_word ? 'y' : 'n'),
            (is_case_ok ? 'y' : 'n'));

@@ -198,7 +240,7 @@ bool Dict::AcceptableResult(const WERD_CHOICE &BestChoice) {

  if (stopper_debug_level >= 1) {
    tprintf("\nRejecter: %s (word=%c, case=%c, unambig=%c)\n",
-            BestChoice.debug_string(getUnicharset()).string(),
+            BestChoice.debug_string().string(),
            (valid_word(BestChoice) ? 'y' : 'n'),
            (case_ok(BestChoice, getUnicharset()) ? 'y' : 'n'),
            ((list_rest (best_choices_) != NIL_LIST) ? 'n' : 'y'));
@@ -320,10 +362,16 @@ void Dict::FilterWordChoices() {
    return;

  // Compute certainties and class for each chunk in best choice.
-  ExpandChoice((VIABLE_CHOICE_STRUCT *)first_node(best_choices_), &BestChoice);
-
-  set_rest (best_choices_, delete_d(list_rest (best_choices_),
-                                    &BestChoice, FreeBadChoice));
+  VIABLE_CHOICE_STRUCT *best_choice =
+      (VIABLE_CHOICE_STRUCT *)first_node(best_choices_);
+  ExpandChoice(best_choice, &BestChoice);
+  if (stopper_debug_level >= 2)
+    PrintViableChoice(stderr, "\nFiltering against best choice: ", best_choice);
+  TessResultCallback2<int, void*, void*>* is_bad =
+      NewPermanentTessCallback(this, &Dict::FreeBadChoice);
+  set_rest(best_choices_, delete_d(list_rest(best_choices_),
+                                   &BestChoice, is_bad));
+  delete is_bad;
 }

 void Dict::FindClassifierErrors(FLOAT32 MinRating,
@@ -371,15 +419,15 @@ void Dict::InitChoiceAccum() {
  BLOB_WIDTH *BlobWidth, *End;

  if (best_raw_choice_)
-    memfree(best_raw_choice_);
+    delete best_raw_choice_;
  best_raw_choice_ = NULL;

  if (best_choices_)
-    destroy_nodes(best_choices_, memfree);
+    destroy_nodes(best_choices_, DeleteViableChoiceStruct);
  best_choices_ = NIL_LIST;

  if (raw_choices_)
-    destroy_nodes(raw_choices_, memfree);
+    destroy_nodes(raw_choices_, DeleteViableChoiceStruct);
  raw_choices_ = NIL_LIST;

  EnableChoiceAccum();
@@ -391,7 +439,7 @@ void Dict::InitChoiceAccum() {
 }

 void Dict::ClearBestChoiceAccum() {
-  if (best_choices_) destroy_nodes(best_choices_, memfree);
+  if (best_choices_) destroy_nodes(best_choices_, DeleteViableChoiceStruct);
  best_choices_ = NIL_LIST;
 }

@@ -420,7 +468,6 @@ void Dict::LogNewChoice(FLOAT32 AdjustFactor,
                        const float Certainties[],
                        bool raw_choice,
                        WERD_CHOICE *WordChoice) {
-  VIABLE_CHOICE NewChoice;
  LIST ChoicesList;
  LIST Choices;
  FLOAT32 Threshold;
@@ -429,14 +476,15 @@ void Dict::LogNewChoice(FLOAT32 AdjustFactor,
    return;

  if (raw_choice) {
-    if (!best_raw_choice_)
-      best_raw_choice_ = NewViableChoice(*WordChoice, AdjustFactor, Certainties);
-    else if (WordChoice->rating() < best_raw_choice_->Rating) {
-      if (ChoiceSameAs(*WordChoice, best_raw_choice_))
-        FillViableChoice(*WordChoice, AdjustFactor, Certainties, true,
+    if (!best_raw_choice_) {
+      best_raw_choice_ =
+          NewViableChoice(*WordChoice, AdjustFactor, Certainties);
+    } else if (WordChoice->rating() < best_raw_choice_->Rating) {
+      if (ChoiceSameAs(*WordChoice, best_raw_choice_)) {
+        FillViableChoice(*WordChoice, AdjustFactor, Certainties,
                         best_raw_choice_);
-      else {
-        memfree(best_raw_choice_);
+      } else {
+        delete best_raw_choice_;
        best_raw_choice_ =
          NewViableChoice(*WordChoice, AdjustFactor, Certainties);
      }
@@ -449,16 +497,20 @@ void Dict::LogNewChoice(FLOAT32 AdjustFactor,

  // Throw out obviously bad choices to save some work.
  if (ChoicesList != NIL_LIST) {
-    Threshold = AmbigThreshold (BestFactor (ChoicesList), AdjustFactor);
-    if (Threshold > -kStopperAmbiguityThresholdOffset)
-      Threshold = -kStopperAmbiguityThresholdOffset;
+    Threshold = StopperAmbigThreshold(BestFactor(ChoicesList), AdjustFactor);
+    if (Threshold > -stopper_ambiguity_threshold_offset)
+      Threshold = -stopper_ambiguity_threshold_offset;
    if (WordChoice->certainty() - BestCertainty (ChoicesList) < Threshold) {
      // Set the rating of the word to be terrible, so that it does not
      // get chosen as the best choice.
      if (stopper_debug_level >= 2) {
-        tprintf("Discarding a choice with an overly low certainty"
-                " %.4f vs best choice certainty %.4f\n",
-                WordChoice->certainty(), BestCertainty(ChoicesList));
+        STRING bad_string;
+        WordChoice->string_and_lengths(&bad_string, NULL);
+        tprintf("Discarding choice \"%s\" with an overly low certainty"
+                " %.4f vs best choice certainty %.4f (Threshold: %.4f)\n",
+                bad_string.string(), WordChoice->certainty(),
+                BestCertainty(ChoicesList),
+                Threshold + BestCertainty(ChoicesList));
      }
      WordChoice->set_rating(WERD_CHOICE::kBadRating);
      return;
@@ -466,7 +518,7 @@ void Dict::LogNewChoice(FLOAT32 AdjustFactor,
  }

  // See if a choice with the same text string has already been found.
-  NewChoice = NULL;
+  VIABLE_CHOICE NewChoice = NULL;
  Choices = ChoicesList;

  iterate(Choices) {
@@ -480,11 +532,10 @@ void Dict::LogNewChoice(FLOAT32 AdjustFactor,
  }

  if (NewChoice) {
-    FillViableChoice(*WordChoice, AdjustFactor, Certainties, true, NewChoice);
+    FillViableChoice(*WordChoice, AdjustFactor, Certainties, NewChoice);
    ChoicesList = delete_d(ChoicesList, NewChoice, is_same_node);
-  }
-  else {
-    NewChoice = NewViableChoice (*WordChoice, AdjustFactor, Certainties);
+  } else {
+    NewChoice = NewViableChoice(*WordChoice, AdjustFactor, Certainties);
  }

  ChoicesList = s_adjoin (ChoicesList, NewChoice, CmpChoiceRatings);
@@ -494,7 +545,7 @@ void Dict::LogNewChoice(FLOAT32 AdjustFactor,
  if (count (ChoicesList) > tessedit_truncate_wordchoice_log) {
    Choices =
      (LIST) nth_cell (ChoicesList, tessedit_truncate_wordchoice_log);
-    destroy_nodes (list_rest (Choices), Efree);
+    destroy_nodes(list_rest (Choices), DeleteViableChoiceStruct);
    set_rest(Choices, NIL_LIST);
  }

@@ -513,7 +564,7 @@ bool Dict::NoDangerousAmbig(WERD_CHOICE *best_choice,
                            bool *modified_blobs) {
  if (stopper_debug_level > 2) {
    tprintf("\nRunning NoDangerousAmbig() for %s\n",
-            best_choice->debug_string(getUnicharset()).string());
+            best_choice->debug_string().string());
  }

  // Construct BLOB_CHOICE_LIST_VECTOR with ambiguities
@@ -549,8 +600,10 @@ bool Dict::NoDangerousAmbig(WERD_CHOICE *best_choice,
      for (i = 0; i < best_choice->length(); ++i) {
        BLOB_CHOICE_LIST *lst = new BLOB_CHOICE_LIST();
        BLOB_CHOICE_IT lst_it(lst);
+        // TODO(rays/antonova) Should these BLOB_CHOICEs use real xheights
+        // or are these fake ones good enough?
        lst_it.add_to_end(new BLOB_CHOICE(best_choice->unichar_id(i),
-                                          0.0, 0.0, -1, -1, -1));
+                                          0.0, 0.0, -1, -1, -1, 0, 1, false));
        ambig_blob_choices.push_back(lst);
      }
    }
@@ -630,7 +683,7 @@ bool Dict::NoDangerousAmbig(WERD_CHOICE *best_choice,
              BLOB_CHOICE_IT bc_it(ambig_blob_choices[i+tmp_index]);
              bc_it.add_to_end(new BLOB_CHOICE(
                  ambig_spec->correct_fragments[tmp_index], -1.0, 0.0,
-                  -1, -1, -1));
+                  -1, -1, -1, 0, 1, false));
            }
          }
          spec_it.forward();
@@ -650,7 +703,7 @@ bool Dict::NoDangerousAmbig(WERD_CHOICE *best_choice,
      }  // end searching AmbigSpec_LIST
    }  // end searching best_choice
  }  // end searching replace and dangerous ambigs
-  if (modified_best_choice) best_choice->populate_unichars(getUnicharset());
+  if (modified_best_choice) best_choice->populate_unichars();
  // If any ambiguities were found permute the constructed ambig_blob_choices
  // to see if an alternative dictionary word can be found.
  if (ambigs_found) {
@@ -666,7 +719,7 @@ bool Dict::NoDangerousAmbig(WERD_CHOICE *best_choice,
    if (ambigs_found) {
      if (stopper_debug_level >= 1) {
        tprintf ("Stopper: Possible ambiguous word = %s\n",
-                 alt_word->debug_string(getUnicharset()).string());
+                 alt_word->debug_string().string());
      }
      if (fixpt != NULL) {
        // Note: Currently character choices combined from fragments can only
@@ -691,6 +744,10 @@ bool Dict::NoDangerousAmbig(WERD_CHOICE *best_choice,
    }
    delete alt_word;
  }
+  if (output_ambig_words_file_ != NULL) {
+    fprintf(output_ambig_words_file_, "\n");
+  }
+
  ambig_blob_choices.delete_data_pointers();
  return !ambigs_found;
 }
@@ -714,7 +771,6 @@ void Dict::AddNewChunk(VIABLE_CHOICE Choice, int Blob) {
      return;
    }
  }
-  mem_tidy (1);
  cprintf ("AddNewChunk failed:Choice->Length=%d, LastChunk=%d, Blob=%d\n",
           Choice->Length, LastChunk, Blob);
  assert(false);  // this should never get executed
@@ -748,7 +804,7 @@ void Dict::ReplaceAmbig(int wrong_ngram_begin_index, int wrong_ngram_size,
      for (i = 0; i < fraglen; ++i) {
        if (fraglen > 1) {
          STRING frag_str =
-            CHAR_FRAGMENT::to_string(temp_uch, i, fraglen);
+            CHAR_FRAGMENT::to_string(temp_uch, i, fraglen, false);
          getUnicharset().unichar_insert(frag_str.string());
          uch_id = getUnicharset().unichar_to_id(frag_str.string());
        }
@@ -756,7 +812,7 @@ void Dict::ReplaceAmbig(int wrong_ngram_begin_index, int wrong_ngram_size,
        STRING correct_frag_uch =
          CHAR_FRAGMENT::to_string(correct_ngram_str,
                                   temp_blob_index - begin_blob_index,
-                                   num_blobs_to_replace);
+                                   num_blobs_to_replace, false);
        getUnicharset().unichar_insert(correct_frag_uch.string());
        UNICHAR_ID correct_frag_uch_id =
          getUnicharset().unichar_to_id(correct_frag_uch.string());
@@ -825,10 +881,9 @@ VIABLE_CHOICE Dict::NewViableChoice(const WERD_CHOICE &WordChoice,
                                    const float Certainties[]) {
  int Length = WordChoice.length();
  assert (Length <= MAX_NUM_CHUNKS && Length > 0);
-  VIABLE_CHOICE NewChoice = (VIABLE_CHOICE) Emalloc (
-      sizeof (VIABLE_CHOICE_STRUCT) + (Length - 1) * sizeof (CHAR_CHOICE));
-  FillViableChoice(WordChoice, AdjustFactor, Certainties, false, NewChoice);
-  return (NewChoice);
+  VIABLE_CHOICE NewChoice = new VIABLE_CHOICE_STRUCT(Length);
+  FillViableChoice(WordChoice, AdjustFactor, Certainties, NewChoice);
+  return NewChoice;
 }

 void Dict::PrintViableChoice(FILE *File, const char *Label, VIABLE_CHOICE Choice) {
@@ -864,35 +919,10 @@ void Dict::PrintViableChoice(FILE *File, const char *Label, VIABLE_CHOICE Choice

 void Dict::FillViableChoice(const WERD_CHOICE &WordChoice,
                            FLOAT32 AdjustFactor, const float Certainties[],
-                            bool SameString, VIABLE_CHOICE ViableChoice) {
-  CHAR_CHOICE *NewChar;
-  BLOB_WIDTH *BlobWidth;
-  int x;
-
-  ViableChoice->Rating = WordChoice.rating();
-  ViableChoice->Certainty = WordChoice.certainty();
-  ViableChoice->AdjustFactor = AdjustFactor;
-  ViableChoice->ComposedFromCharFragments = false;
-  if (!SameString) {
-    ViableChoice->Length = WordChoice.length();
-  }
-  for (x = 0,
-       NewChar = &(ViableChoice->Blob[0]),
-       BlobWidth = current_segmentation_;
-       x < WordChoice.length();
-       x++, NewChar++, Certainties++, BlobWidth++) {
-    if (!SameString) {
-      NewChar->Class = WordChoice.unichar_id(x);
-    }
-    NewChar->NumChunks = *BlobWidth;
-    NewChar->Certainty = *Certainties;
-    for (int i = 1; i < WordChoice.fragment_length(x); ++i) {
-      BlobWidth++;
-      assert(*BlobWidth > 0);
-      NewChar->NumChunks += *BlobWidth;
-      ViableChoice->ComposedFromCharFragments = true;
-    }
-  }
+                            VIABLE_CHOICE ViableChoice) {
+  ViableChoice->Init(WordChoice, current_segmentation_, Certainties,
+                     AdjustFactor);
+
 }

 bool Dict::StringSameAs(const WERD_CHOICE &WordChoice,

--- a/dict/stopper.h
+++ b/dict/stopper.h
@@ -27,6 +27,8 @@
 #include "states.h"
 #include "unichar.h"

+class WERD_CHOICE;
+
 typedef uinT8 BLOB_WIDTH;

 struct DANGERR_INFO {
@@ -50,13 +52,36 @@ struct CHAR_CHOICE {
  float Certainty;
 };

-struct VIABLE_CHOICE_STRUCT {
+class VIABLE_CHOICE_STRUCT {
+ public:
+  VIABLE_CHOICE_STRUCT();
+  explicit VIABLE_CHOICE_STRUCT(int length);
+  ~VIABLE_CHOICE_STRUCT();
+
+  // Fill in the data with these values.
+  void Init(const WERD_CHOICE& word_choice,
+            const PIECES_STATE& pieces_state,
+            const float certainties[],
+            FLOAT32 adjust_factor);
+
+  int Length;
  float Rating;
  float Certainty;
  FLOAT32 AdjustFactor;
-  int Length;
  bool ComposedFromCharFragments;
-  CHAR_CHOICE Blob[1];
+  CHAR_CHOICE *Blob;
+
+  // segmentation_state: for each choice, how many consecutive blobs
+  //     does it use?
+  uinT8 *segmentation_state;
+
+ private:
+  // Disallow assignment and copy construction
+  VIABLE_CHOICE_STRUCT(const VIABLE_CHOICE_STRUCT &other)
+      : Length(0), Blob(NULL), segmentation_state(NULL) {}
+  VIABLE_CHOICE_STRUCT &operator=(const VIABLE_CHOICE_STRUCT &other) {
+    return *this;
+  }
 };

 typedef VIABLE_CHOICE_STRUCT *VIABLE_CHOICE;

--- a/dict/trie.cpp
+++ b/dict/trie.cpp
@@ -40,6 +40,16 @@

 namespace tesseract {

+const char kDoNotReverse[] = "RRP_DO_NO_REVERSE";
+const char kReverseIfHasRTL[] = "RRP_REVERSE_IF_HAS_RTL";
+const char kForceReverse[] = "RRP_FORCE_REVERSE";
+
+const char * const RTLReversePolicyNames[] = {
+  kDoNotReverse,
+  kReverseIfHasRTL,
+  kForceReverse
+};
+
 const char Trie::kAlphaPatternUnicode[] = "\u2000";
 const char Trie::kDigitPatternUnicode[] = "\u2001";
 const char Trie::kAlphanumPatternUnicode[] = "\u2002";
@@ -47,6 +57,10 @@ const char Trie::kPuncPatternUnicode[] = "\u2003";
 const char Trie::kLowerPatternUnicode[] = "\u2004";
 const char Trie::kUpperPatternUnicode[] = "\u2005";

+const char *Trie::get_reverse_policy_name(RTLReversePolicy reverse_policy) {
+  return RTLReversePolicyNames[reverse_policy];
+}
+
 // Reset the Trie to empty.
 void Trie::clear() {
  nodes_.delete_data_pointers();
@@ -156,10 +170,15 @@ void Trie::add_word_ending(EDGE_RECORD *edge_ptr,
  *edge_ptr |= (WERD_END_FLAG << flag_start_bit_);
 }

-void Trie::add_word_to_dawg(const WERD_CHOICE &word,
+bool Trie::add_word_to_dawg(const WERD_CHOICE &word,
                            const GenericVector<bool> *repetitions) {
-  if (word.length() <= 0) return;  // can't add empty words
+  if (word.length() <= 0) return false;  // can't add empty words
  if (repetitions != NULL) ASSERT_HOST(repetitions->size() == word.length());
+  // Make sure the word does not contain invalid unichar ids.
+  for (int i = 0; i < word.length(); ++i) {
+    if (word.unichar_id(i) < 0 ||
+        word.unichar_id(i) >= unicharset_size_) return false;
+  }

  EDGE_RECORD *edge_ptr;
  NODE_REF last_node = 0;
@@ -233,6 +252,9 @@ void Trie::add_word_to_dawg(const WERD_CHOICE &word,
  if (add_failed) {
    tprintf("Re-initializing document dictionary...\n");
    clear();
+    return false;
+  } else {
+    return true;
  }
 }

@@ -244,7 +266,8 @@ NODE_REF Trie::new_dawg_node() {
 }

 bool Trie::read_word_list(const char *filename,
-                          const UNICHARSET &unicharset) {
+                          const UNICHARSET &unicharset,
+                          Trie::RTLReversePolicy reverse_policy) {
  FILE *word_file;
  char string[CHARS_PER_LINE];
  int  word_count = 0;
@@ -254,6 +277,11 @@ bool Trie::read_word_list(const char *filename,
  while (fgets(string, CHARS_PER_LINE, word_file) != NULL) {
    chomp_string(string);  // remove newline
    WERD_CHOICE word(string, unicharset);
+    if ((reverse_policy == RRP_REVERSE_IF_HAS_RTL &&
+        word.has_rtl_unichar_id()) ||
+        reverse_policy == RRP_FORCE_REVERSE) {
+      word.reverse_and_mirror_unichar_ids();
+    }
    ++word_count;
    if (debug_level_ && word_count % 10000 == 0)
      tprintf("Read %d words so far\n", word_count);
@@ -290,6 +318,7 @@ void Trie::initialize_patterns(UNICHARSET *unicharset) {
  unicharset->unichar_insert(kUpperPatternUnicode);
  upper_pattern_ = unicharset->unichar_to_id(kUpperPatternUnicode);
  initialized_patterns_ = true;
+  unicharset_size_ = unicharset->size();
 }

 void Trie::unichar_id_to_patterns(UNICHAR_ID unichar_id,
@@ -351,7 +380,7 @@ bool Trie::read_pattern_list(const char *filename,
    chomp_string(string);  // remove newline
    // Parse the pattern and construct a unichar id vector.
    // Record the number of repetitions of each unichar in the parallel vector.
-    WERD_CHOICE word;
+    WERD_CHOICE word(&unicharset);
    GenericVector<bool> repetitions_vec;
    const char *str_ptr = string;
    int step = unicharset.step(str_ptr);
@@ -397,7 +426,7 @@ bool Trie::read_pattern_list(const char *filename,
    // Insert the pattern into the trie.
    if (debug_level_ > 2) {
      tprintf("Inserting expanded user pattern %s\n",
-              word.debug_string(unicharset).string());
+              word.debug_string().string());
    }
    if (!this->word_in_dawg(word)) {
      this->add_word_to_dawg(word, &repetitions_vec);

--- a/dict/trie.h
+++ b/dict/trie.h
@@ -61,6 +61,12 @@ namespace tesseract {
 */
 class Trie : public Dawg {
 public:
+  enum RTLReversePolicy {
+    RRP_DO_NO_REVERSE,
+    RRP_REVERSE_IF_HAS_RTL,
+    RRP_FORCE_REVERSE,
+  };
+
  // Minimum number of concrete characters at the beginning of user patterns.
  static const int kSaneNumConcreteChars = 4;
  // Various unicode whitespace characters are used to denote unichar patterns,
@@ -73,6 +79,9 @@ class Trie : public Dawg {
  static const char kLowerPatternUnicode[];
  static const char kUpperPatternUnicode[];

+  static const char *get_reverse_policy_name(
+      RTLReversePolicy reverse_policy);
+
  // max_num_edges argument allows limiting the amount of memory this
  // Trie can consume (if a new word insert would cause the Trie to
  // contain more edges than max_num_edges, all the edges are cleared
@@ -86,7 +95,7 @@ class Trie : public Dawg {
    new_dawg_node();  // need to allocate node 0
    initialized_patterns_ = false;
  }
-  ~Trie() { nodes_.delete_data_pointers(); }
+  virtual ~Trie() { nodes_.delete_data_pointers(); }

  // Reset the Trie to empty.
  void clear();
@@ -149,8 +158,11 @@ class Trie : public Dawg {
  SquishedDawg *trie_to_dawg();

  // Inserts the list of words from the given file into the Trie.
+  // Depending on the reverse policy, reverse_and_mirror_unichar_ids() is
+  // called on each word before it is inserted into the Trie.
  bool read_word_list(const char *filename,
-                      const UNICHARSET &unicharset);
+                      const UNICHARSET &unicharset,
+                      Trie::RTLReversePolicy reverse);

  // Inserts the list of patterns from the given file into the Trie.
  // The pattern list file should contain one pattern per line in UTF-8 format.
@@ -225,10 +237,13 @@ class Trie : public Dawg {
  // whether the unichar id with the corresponding index in the word is allowed
  // to repeat an unlimited number of times. For each entry that is true, MARKER
  // flag of the corresponding edge created for this unichar id is set to true).
-  void add_word_to_dawg(const WERD_CHOICE &word,
+  //
+  // Return true if add succeeded, false otherwise (e.g. when a word contained
+  // an invalid unichar id or the trie was getting too large and was cleared).
+  bool add_word_to_dawg(const WERD_CHOICE &word,
                        const GenericVector<bool> *repetitions);
-  void add_word_to_dawg(const WERD_CHOICE &word) {
-    add_word_to_dawg(word, NULL);
+  bool add_word_to_dawg(const WERD_CHOICE &word) {
+    return add_word_to_dawg(word, NULL);
  }

 protected:
@@ -377,11 +392,11 @@ class Trie : public Dawg {
  UNICHAR_ID character_class_to_pattern(char ch);

  // Member variables
-  TRIE_NODES nodes_;              ///< vector of nodes in the Trie
-  uinT64 num_edges_;              ///< sum of all edges (forward and backward)
-  uinT64 max_num_edges_;          ///< maximum number of edges allowed
-  uinT64 deref_direction_mask_;   ///< mask for EDGE_REF to extract direction
-  uinT64 deref_node_index_mask_;  ///< mask for EDGE_REF to extract node index
+  TRIE_NODES nodes_;              // vector of nodes in the Trie
+  uinT64 num_edges_;              // sum of all edges (forward and backward)
+  uinT64 max_num_edges_;          // maximum number of edges allowed
+  uinT64 deref_direction_mask_;   // mask for EDGE_REF to extract direction
+  uinT64 deref_node_index_mask_;  // mask for EDGE_REF to extract node index
  // Variables for translating character class codes denoted in user patterns
  // file to the unichar ids used to represent them in a Trie.
  bool initialized_patterns_;