Fixed multilang for LSTM, pushed cube to one side without actually deleting it

5deebe6c · Ray Smith · 798d79aa · 5deebe6c · 5deebe6c · 5deebe6c
14 changed files
--- a/api/tesseractmain.cpp
+++ b/api/tesseractmain.cpp
@@ -123,10 +123,9 @@ void PrintHelpForOEM() {
  const char* msg =
      "OCR Engine modes:\n"
      "  0    Original Tesseract only.\n"
-      "  1    Cube only.\n"
-      "  2    Tesseract + cube.\n"
-      "  3    Default, based on what is available.\n"
-      "  4    Neural nets (LSTM) only.\n";
+      "  1    Neural nets LSTM only.\n"
+      "  2    Tesseract + LSTM.\n"
+      "  3    Default, based on what is available.\n";

  printf("%s", msg);
 }

--- a/ccmain/control.cpp
+++ b/ccmain/control.cpp
@@ -31,21 +31,22 @@
 #include <errno.h>
 #endif
 #include <ctype.h>
-#include "ocrclass.h"
-#include "werdit.h"
+#include "callcpp.h"
+#include "control.h"
+#include "docqual.h"
 #include "drawfx.h"
-#include "tessbox.h"
-#include "tessvars.h"
-#include "pgedit.h"
-#include "reject.h"
 #include "fixspace.h"
-#include "docqual.h"
-#include "control.h"
-#include "output.h"
-#include "callcpp.h"
 #include "globals.h"
+#include "lstmrecognizer.h"
+#include "ocrclass.h"
+#include "output.h"
+#include "pgedit.h"
+#include "reject.h"
 #include "sorthelper.h"
+#include "tessbox.h"
 #include "tesseractclass.h"
+#include "tessvars.h"
+#include "werdit.h"

 #define MIN_FONT_ROW_COUNT  8
 #define MAX_XHEIGHT_DIFF  3
@@ -192,8 +193,8 @@ void Tesseract::SetupWordPassN(int pass_n, WordData* word) {
      WERD_RES* word_res = new WERD_RES;
      word_res->InitForRetryRecognition(*word->word);
      word->lang_words.push_back(word_res);
-      // Cube doesn't get setup for pass2.
-      if (pass_n == 1 || lang_t->tessedit_ocr_engine_mode != OEM_CUBE_ONLY) {
+      // LSTM doesn't get setup for pass2.
+      if (pass_n == 1 || lang_t->tessedit_ocr_engine_mode != OEM_LSTM_ONLY) {
        word_res->SetupForRecognition(
              lang_t->unicharset, lang_t, BestPix(),
              lang_t->tessedit_ocr_engine_mode, NULL,
@@ -301,16 +302,6 @@ bool Tesseract::recog_all_words(PAGE_RES* page_res,
                                const TBOX* target_word_box,
                                const char* word_config,
                                int dopasses) {
-  // PSM_RAW_LINE is a special-case mode in which the layout analysis is
-  // completely ignored and LSTM is run on the raw image. There is no hope
-  // of running normal tesseract in this situation or of integrating output.
-#ifndef ANDROID_BUILD
-  if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY &&
-      tessedit_pageseg_mode == PSM_RAW_LINE) {
-    RecogRawLine(page_res);
-    return true;
-  }
-#endif
  PAGE_RES_IT page_res_it(page_res);

  if (tessedit_minimal_rej_pass1) {
@@ -397,8 +388,7 @@ bool Tesseract::recog_all_words(PAGE_RES* page_res,
    if (!RecogAllWordsPassN(2, monitor, &page_res_it, &words)) return false;
  }

-  // The next passes can only be run if tesseract has been used, as cube
-  // doesn't set all the necessary outputs in WERD_RES.
+  // The next passes are only required for Tess-only.
  if (AnyTessLang() && !AnyLSTMLang()) {
    // ****************** Pass 3 *******************
    // Fix fuzzy spaces.
@@ -451,8 +441,13 @@ bool Tesseract::recog_all_words(PAGE_RES* page_res,
  for (page_res_it.restart_page(); page_res_it.word() != NULL;
       page_res_it.forward()) {
    WERD_RES* word = page_res_it.word();
-    if (word->best_choice == NULL || word->best_choice->length() == 0)
+    POLY_BLOCK* pb = page_res_it.block()->block != NULL
+                         ? page_res_it.block()->block->poly_block()
+                         : NULL;
+    if (word->best_choice == NULL || word->best_choice->length() == 0 ||
+        (word->best_choice->IsAllSpaces() && (pb == NULL || pb->IsText()))) {
      page_res_it.DeleteCurrentWord();
+    }
  }

  if (monitor != NULL) {
@@ -1376,12 +1371,20 @@ void Tesseract::classify_word_pass1(const WordData& word_data,
    cube_word_pass1(block, row, *in_word);
    return;
  }
-  if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {
-    if (!(*in_word)->odd_size) {
+#endif
+#ifndef ANDROID_BUILD
+  if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY ||
+      tessedit_ocr_engine_mode == OEM_TESSERACT_LSTM_COMBINED) {
+    if (!(*in_word)->odd_size || tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {
      LSTMRecognizeWord(*block, row, *in_word, out_words);
      if (!out_words->empty())
        return;  // Successful lstm recognition.
    }
+    if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {
+      // No fallback allowed, so use a fake.
+      (*in_word)->SetupFake(lstm_recognizer_->GetUnicharset());
+      return;
+    }
    // Fall back to tesseract for failed words or odd words.
    (*in_word)->SetupForRecognition(unicharset, this, BestPix(),
                                    OEM_TESSERACT_ONLY, NULL,
@@ -1523,7 +1526,7 @@ void Tesseract::classify_word_pass2(const WordData& word_data,
                                    WERD_RES** in_word,
                                    PointerVector<WERD_RES>* out_words) {
  // Return if we do not want to run Tesseract.
-  if (tessedit_ocr_engine_mode == OEM_CUBE_ONLY) {
+  if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {
    return;
  }
  ROW* row = word_data.row;
@@ -1908,7 +1911,7 @@ static void find_modal_font(           //good chars in word
 * Get the fonts for the word.
 */
 void Tesseract::set_word_fonts(WERD_RES *word) {
-  // Don't try to set the word fonts for a cube word, as the configs
+  // Don't try to set the word fonts for an lstm word, as the configs
  // will be meaningless.
  if (word->chopped_word == NULL) return;
  ASSERT_HOST(word->best_choice != NULL);

--- a/ccmain/linerec.cpp
+++ b/ccmain/linerec.cpp
@@ -219,19 +219,6 @@ ImageData* Tesseract::GetRectImage(const TBOX& box, const BLOCK& block,
 }

 #ifndef ANDROID_BUILD
-// Top-level function recognizes a single raw line.
-void Tesseract::RecogRawLine(PAGE_RES* page_res) {
-  PAGE_RES_IT it(page_res);
-  PointerVector<WERD_RES> words;
-  LSTMRecognizeWord(*it.block()->block, it.row()->row, it.word(), &words);
-  if (getDict().stopper_debug_level >= 1) {
-    for (int w = 0; w < words.size(); ++w) {
-      words[w]->DebugWordChoices(true, NULL);
-    }
-  }
-  it.ReplaceCurrentWord(&words);
-}
-
 // Recognizes a word or group of words, converting to WERD_RES in *words.
 // Analogous to classify_word_pass1, but can handle a group of words as well.
 void Tesseract::LSTMRecognizeWord(const BLOCK& block, ROW *row, WERD_RES *word,
@@ -268,7 +255,17 @@ void Tesseract::SearchWords(PointerVector<WERD_RES>* words) {
  // for each of the output words.
  // If we drop a word as junk, then there is always a space in front of the
  // next.
-  bool deleted_prev = false;
+  const Dict* stopper_dict = lstm_recognizer_->GetDict();
+  if (stopper_dict == nullptr) stopper_dict = &getDict();
+  bool any_nonspace_delimited = false;
+  for (int w = 0; w < words->size(); ++w) {
+    WERD_RES* word = (*words)[w];
+    if (word->best_choice != nullptr &&
+        word->best_choice->ContainsAnyNonSpaceDelimited()) {
+      any_nonspace_delimited = true;
+      break;
+    }
+  }
  for (int w = 0; w < words->size(); ++w) {
    WERD_RES* word = (*words)[w];
    if (word->best_choice == NULL) {
@@ -284,9 +281,7 @@ void Tesseract::SearchWords(PointerVector<WERD_RES>* words) {
    }
    if (word->best_choice == NULL) {
      // It is a dud.
-      words->remove(w);
-      --w;
-      deleted_prev = true;
+      word->SetupFake(lstm_recognizer_->GetUnicharset());
    } else {
      // Set the best state.
      for (int i = 0; i < word->best_choice->length(); ++i) {
@@ -314,22 +309,21 @@ void Tesseract::SearchWords(PointerVector<WERD_RES>* words) {
        word->best_choice->print();
      }
      // Discard words that are impossibly bad, but allow a bit more for
-      // dictionary words.
+      // dictionary words, and keep bad words in non-space-delimited langs.
      if (word_certainty >= RecodeBeamSearch::kMinCertainty ||
+          any_nonspace_delimited ||
          (word_certainty >= kWorstDictCertainty &&
           Dict::valid_word_permuter(word->best_choice->permuter(), true))) {
-        word->best_choice->set_certainty(word_certainty);
-        if (deleted_prev) word->word->set_blanks(1);
+        word->tess_accepted = stopper_dict->AcceptableResult(word);
      } else {
        if (getDict().stopper_debug_level >= 1) {
          tprintf("Deleting word with certainty %g\n", word_certainty);
          word->best_choice->print();
        }
        // It is a dud.
-        words->remove(w);
-        --w;
-        deleted_prev = true;
+        word->SetupFake(lstm_recognizer_->GetUnicharset());
      }
+      word->best_choice->set_certainty(word_certainty);
    }
  }
 }

--- a/ccmain/tessedit.cpp
+++ b/ccmain/tessedit.cpp
@@ -161,7 +161,7 @@ bool Tesseract::init_tesseract_lang_data(
  // Determine which ocr engine(s) should be loaded and used for recognition.
  if (oem != OEM_DEFAULT) tessedit_ocr_engine_mode.set_value(oem);
  if (tessdata_manager_debug_level) {
-    tprintf("Loading Tesseract/Cube with tessedit_ocr_engine_mode %d\n",
+    tprintf("Loading Tesseract/LSTM with tessedit_ocr_engine_mode %d\n",
            static_cast<int>(tessedit_ocr_engine_mode));
  }

@@ -174,9 +174,37 @@ bool Tesseract::init_tesseract_lang_data(
    return true;
  }

+// The various OcrEngineMode settings (see publictypes.h) determine which
+// engine-specific data files need to be loaded. Currently everything needs
+// the base tesseract data, which supplies other useful information, but
+// alternative engines, such as LSTM are optional.
+#ifndef ANDROID_BUILD
+  if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY ||
+      tessedit_ocr_engine_mode == OEM_TESSERACT_LSTM_COMBINED) {
+    if (tessdata_manager.swap()) {
+      tprintf("Error: LSTM requested on big-endian hardware!!\n");
+      tprintf("Big-endian not yet supported! Loading tesseract.\n");
+      tessedit_ocr_engine_mode.set_value(OEM_TESSERACT_ONLY);
+    } else if (tessdata_manager.SeekToStart(TESSDATA_LSTM)) {
+      lstm_recognizer_ = new LSTMRecognizer;
+      TFile fp;
+      fp.Open(tessdata_manager.GetDataFilePtr(), -1);
+      ASSERT_HOST(lstm_recognizer_->DeSerialize(tessdata_manager.swap(), &fp));
+      if (lstm_use_matrix)
+        lstm_recognizer_->LoadDictionary(tessdata_path.string(), language);
+    } else {
+      tprintf("Error: LSTM requested, but not present!! Loading tesseract.\n");
+      tessedit_ocr_engine_mode.set_value(OEM_TESSERACT_ONLY);
+    }
+  }
+#endif
+
  // Load the unicharset
-  if (!tessdata_manager.SeekToStart(TESSDATA_UNICHARSET) ||
-      !unicharset.load_from_file(tessdata_manager.GetDataFilePtr())) {
+  if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {
+    // Avoid requiring a unicharset when we aren't running base tesseract.
+    unicharset.CopyFrom(lstm_recognizer_->GetUnicharset());
+  } else if (!tessdata_manager.SeekToStart(TESSDATA_UNICHARSET) ||
+             !unicharset.load_from_file(tessdata_manager.GetDataFilePtr())) {
    return false;
  }
  if (unicharset.size() > MAX_NUM_CLASSES) {
@@ -203,11 +231,6 @@ bool Tesseract::init_tesseract_lang_data(
        ambigs_debug_level, use_ambigs_for_adaption, &unicharset);
    if (tessdata_manager_debug_level) tprintf("Loaded ambigs\n");
  }
-
-  // The various OcrEngineMode settings (see publictypes.h) determine which
-  // engine-specific data files need to be loaded. Currently everything needs
-  // the base tesseract data, which supplies other useful information, but
-  // alternative engines, such as cube and LSTM are optional.
 #ifndef NO_CUBE_BUILD
  if (tessedit_ocr_engine_mode == OEM_CUBE_ONLY) {
    ASSERT_HOST(init_cube_objects(false, &tessdata_manager));
@@ -217,22 +240,6 @@ bool Tesseract::init_tesseract_lang_data(
    ASSERT_HOST(init_cube_objects(true, &tessdata_manager));
    if (tessdata_manager_debug_level)
      tprintf("Loaded Cube with combiner\n");
-  } else if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {
-    if (tessdata_manager.swap()) {
-      tprintf("Error: LSTM requested on big-endian hardware!!\n");
-      tprintf("Big-endian not yet supported! Loading tesseract.\n");
-      tessedit_ocr_engine_mode.set_value(OEM_TESSERACT_ONLY);
-    } else if (tessdata_manager.SeekToStart(TESSDATA_LSTM)) {
-      lstm_recognizer_ = new LSTMRecognizer;
-      TFile fp;
-      fp.Open(tessdata_manager.GetDataFilePtr(), -1);
-      ASSERT_HOST(lstm_recognizer_->DeSerialize(tessdata_manager.swap(), &fp));
-      if (lstm_use_matrix)
-        lstm_recognizer_->LoadDictionary(tessdata_path.string(), language);
-    } else {
-      tprintf("Error: LSTM requested, but not present!! Loading tesseract.\n");
-      tessedit_ocr_engine_mode.set_value(OEM_TESSERACT_ONLY);
-    }
  }
 #endif
  // Init ParamsModel.
@@ -425,16 +432,16 @@ int Tesseract::init_tesseract_internal(
    tessdata_manager.End();
    return 0;
  }
-  // If only Cube will be used, skip loading Tesseract classifier's
-  // pre-trained templates.
-  bool init_tesseract_classifier =
-    tessedit_ocr_engine_mode != OEM_CUBE_ONLY;
-  // If only Cube will be used and if it has its own Unicharset,
-  // skip initializing permuter and loading Tesseract Dawgs.
-  bool init_dict =
-    !(tessedit_ocr_engine_mode == OEM_CUBE_ONLY &&
-      tessdata_manager.SeekToStart(TESSDATA_CUBE_UNICHARSET));
-  program_editup(textbase, init_tesseract_classifier, init_dict);
+  // If only LSTM will be used, skip loading Tesseract classifier's
+  // pre-trained templates and dictionary.
+  bool init_tesseract = tessedit_ocr_engine_mode != OEM_LSTM_ONLY &&
+                        tessedit_ocr_engine_mode != OEM_CUBE_ONLY;
+  bool init_dict = init_tesseract;
+  if (tessedit_ocr_engine_mode == OEM_CUBE_ONLY &&
+      !tessdata_manager.SeekToStart(TESSDATA_CUBE_UNICHARSET)) {
+    init_dict = true;
+  }
+  program_editup(textbase, init_tesseract, init_dict);
  tessdata_manager.End();
  return 0;                      //Normal exit
 }

--- a/ccmain/tesseract_cube_combiner.cpp
+++ b/ccmain/tesseract_cube_combiner.cpp
@@ -21,6 +21,8 @@
 // the recognition results of Tesseract and Cube at the word level

 #include <algorithm>
+#include <string>
+#include <vector>
 #include <wctype.h>

 #include "tesseract_cube_combiner.h"
@@ -125,12 +127,10 @@ bool TesseractCubeCombiner::ValidWord(const string &str) {
 // Public method for computing the combiner features. The agreement
 // output parameter will be true if both answers are identical,
 // and false otherwise.
-bool TesseractCubeCombiner::ComputeCombinerFeatures(const string &tess_str,
-                                                    int tess_confidence,
-                                                    CubeObject *cube_obj,
-                                                    WordAltList *cube_alt_list,
-                                                    vector<double> *features,
-                                                    bool *agreement) {
+bool TesseractCubeCombiner::ComputeCombinerFeatures(
+    const string &tess_str, int tess_confidence, CubeObject *cube_obj,
+    WordAltList *cube_alt_list, std::vector<double> *features,
+    bool *agreement) {
  features->clear();
  *agreement = false;
  if (cube_alt_list == NULL || cube_alt_list->AltCount() <= 0)

--- a/ccmain/tesseractclass.cpp
+++ b/ccmain/tesseractclass.cpp
@@ -81,9 +81,9 @@ Tesseract::Tesseract()
          " (Values from PageSegMode enum in publictypes.h)",
          this->params()),
      INT_INIT_MEMBER(tessedit_ocr_engine_mode, tesseract::OEM_TESSERACT_ONLY,
-                      "Which OCR engine(s) to run (Tesseract, Cube, both)."
+                      "Which OCR engine(s) to run (Tesseract, LSTM, both)."
                      " Defaults to loading and running only Tesseract"
-                      " (no Cube,no combiner)."
+                      " (no LSTM,no combiner)."
                      " Values from OcrEngineMode enum in tesseractclass.h)",
                      this->params()),
      STRING_MEMBER(tessedit_char_blacklist, "",

--- a/ccmain/tesseractclass.h
+++ b/ccmain/tesseractclass.h
@@ -210,6 +210,9 @@ class Tesseract : public Wordrec {
  void set_pix_original(Pix* original_pix) {
    pixDestroy(&pix_original_);
    pix_original_ = original_pix;
+    // Clone to sublangs as well. NULL is forwarded as-is: pixClone(NULL) fails.
+    for (int i = 0; i < sub_langs_.size(); ++i) sub_langs_[i]->set_pix_original(
+        original_pix != NULL ? pixClone(original_pix) : NULL);
  }
  // Returns a pointer to a Pix representing the best available (original) image
  // of the page. Can be of any bit depth, but never color-mapped, as that has
@@ -261,20 +264,19 @@ class Tesseract : public Wordrec {
  Tesseract* get_sub_lang(int index) const {
    return sub_langs_[index];
  }
-  // Returns true if any language uses Tesseract (as opposed to cube).
+  // Returns true if any language uses Tesseract (as opposed to LSTM).
  bool AnyTessLang() const {
-    if (tessedit_ocr_engine_mode != OEM_CUBE_ONLY) return true;
+    if (tessedit_ocr_engine_mode != OEM_LSTM_ONLY) return true;
    for (int i = 0; i < sub_langs_.size(); ++i) {
-      if (sub_langs_[i]->tessedit_ocr_engine_mode != OEM_CUBE_ONLY)
-        return true;
+      if (sub_langs_[i]->tessedit_ocr_engine_mode != OEM_LSTM_ONLY) return true;
    }
    return false;
  }
  // Returns true if any language uses the LSTM.
  bool AnyLSTMLang() const {
-    if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) return true;
+    if (tessedit_ocr_engine_mode != OEM_TESSERACT_ONLY) return true;
    for (int i = 0; i < sub_langs_.size(); ++i) {
-      if (sub_langs_[i]->tessedit_ocr_engine_mode == OEM_LSTM_ONLY)
+      if (sub_langs_[i]->tessedit_ocr_engine_mode != OEM_TESSERACT_ONLY)
        return true;
    }
    return false;
@@ -340,8 +342,6 @@ class Tesseract : public Wordrec {
  // is also returned to enable calculation of output bounding boxes.
  ImageData* GetRectImage(const TBOX& box, const BLOCK& block, int padding,
                          TBOX* revised_box) const;
-  // Top-level function recognizes a single raw line.
-  void RecogRawLine(PAGE_RES* page_res);
  // Recognizes a word or group of words, converting to WERD_RES in *words.
  // Analogous to classify_word_pass1, but can handle a group of words as well.
  void LSTMRecognizeWord(const BLOCK& block, ROW *row, WERD_RES *word,
@@ -850,8 +850,8 @@ class Tesseract : public Wordrec {
            " 5=line, 6=word, 7=char"
            " (Values from PageSegMode enum in publictypes.h)");
  INT_VAR_H(tessedit_ocr_engine_mode, tesseract::OEM_TESSERACT_ONLY,
-            "Which OCR engine(s) to run (Tesseract, Cube, both). Defaults"
-            " to loading and running only Tesseract (no Cube, no combiner)."
+            "Which OCR engine(s) to run (Tesseract, LSTM, both). Defaults"
+            " to loading and running only Tesseract (no LSTM, no combiner)."
            " (Values from OcrEngineMode enum in tesseractclass.h)");
  STRING_VAR_H(tessedit_char_blacklist, "",
               "Blacklist of chars not to recognize");

--- a/ccstruct/pageres.cpp
+++ b/ccstruct/pageres.cpp
@@ -884,6 +884,7 @@ void WERD_RES::FakeClassifyWord(int blob_count, BLOB_CHOICE** choices) {
  }
  FakeWordFromRatings(TOP_CHOICE_PERM);
  reject_map.initialise(blob_count);
+  best_state.init_to_size(blob_count, 1);
  done = true;
 }


--- a/ccstruct/publictypes.h
+++ b/ccstruct/publictypes.h
@@ -255,8 +255,9 @@ enum ParagraphJustification {
 */
 enum OcrEngineMode {
  OEM_TESSERACT_ONLY,           // Run Tesseract only - fastest
-  OEM_CUBE_ONLY,                // Run Cube only - better accuracy, but slower
-  OEM_TESSERACT_CUBE_COMBINED,  // Run both and combine results - best accuracy
+  OEM_LSTM_ONLY,                // Run just the LSTM line recognizer.
+  OEM_TESSERACT_LSTM_COMBINED,  // Run the LSTM recognizer, but allow fallback
+                                // to Tesseract when things get difficult.
  OEM_DEFAULT,                  // Specify this mode when calling init_*(),
                                // to indicate that any of the above modes
                                // should be automatically inferred from the
@@ -264,14 +265,8 @@ enum OcrEngineMode {
                                // command-line configs, or if not specified
                                // in any of the above should be set to the
                                // default OEM_TESSERACT_ONLY.
-  // OEM_LSTM_ONLY will fall back (with a warning) to OEM_TESSERACT_ONLY where
-  // there is no network model available. This allows use of a mix of languages,
-  // some of which contain a network model, and some of which do not. Since the
-  // tesseract model is required for the LSTM to fall back to for "difficult"
-  // words anyway, this seems like a reasonable approach, but leaves the danger
-  // of not noticing that it is using the wrong engine if the warning is
-  // ignored.
-  OEM_LSTM_ONLY,                // Run just the LSTM line recognizer.
+  OEM_CUBE_ONLY,                // Run Cube only - better accuracy, but slower
+  OEM_TESSERACT_CUBE_COMBINED,  // Run both and combine results - best accuracy
 };

 }  // namespace tesseract.

--- a/ccstruct/ratngs.h
+++ b/ccstruct/ratngs.h
@@ -508,6 +508,20 @@ class WERD_CHOICE : public ELIST_LINK {
    }
    return word_str;
  }
+  // Returns true if any unichar_id is non-space-delimited; false when empty.
+  bool ContainsAnyNonSpaceDelimited() const {
+    for (int i = 0; i < length_; ++i) {
+      if (!unicharset_->IsSpaceDelimited(unichar_ids_[i])) return true;
+    }
+    return false;
+  }
+  // Returns true if the word is all spaces; an empty word also returns true.
+  bool IsAllSpaces() const {
+    for (int i = 0; i < length_; ++i) {
+      if (unichar_ids_[i] != UNICHAR_SPACE) return false;
+    }
+    return true;
+  }

  // Call this to override the default (strict left to right graphemes)
  // with the fact that some engine produces a "reading order" set of

--- a/dict/context.cpp
+++ b/dict/context.cpp
@@ -49,7 +49,7 @@ const int case_state_table[6][4] = {
     5, -1, 2, -1},
 };

-int Dict::case_ok(const WERD_CHOICE &word, const UNICHARSET &unicharset) {
+int Dict::case_ok(const WERD_CHOICE &word, const UNICHARSET &unicharset) const {
  int state = 0;
  int x;
  for (x = 0; x < word.length(); ++x) {

--- a/dict/dict.h
+++ b/dict/dict.h
@@ -260,7 +260,7 @@ class Dict {
                    MATRIX *ratings);

  /// Returns the length of the shortest alpha run in WordChoice.
-  int LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice);
+  int LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice) const;
  /// Returns true if the certainty of the BestChoice word is within a
  /// reasonable range of the average certainties for the best choices for
  /// each character in the segmentation.  This test is used to catch words
@@ -275,7 +275,7 @@ class Dict {
  /// Returns false if the best choice for the current word is questionable
  /// and should be tried again on the second pass or should be flagged to
  /// the user.
-  bool AcceptableResult(WERD_RES* word);
+  bool AcceptableResult(WERD_RES *word) const;
  void EndDangerousAmbigs();
  /// Prints the current choices for this word to stdout.
  void DebugWordChoices();
@@ -285,7 +285,7 @@ class Dict {
  void SettupStopperPass2();
  /* context.cpp *************************************************************/
  /// Check a string to see if it matches a set of lexical rules.
-  int case_ok(const WERD_CHOICE &word, const UNICHARSET &unicharset);
+  int case_ok(const WERD_CHOICE &word, const UNICHARSET &unicharset) const;
  /// Returns true if the word looks like an absolute garbage
  /// (e.g. image mistakenly recognized as text).
  bool absolute_garbage(const WERD_CHOICE &word, const UNICHARSET &unicharset);

--- a/dict/stopper.cpp
+++ b/dict/stopper.cpp
@@ -107,7 +107,7 @@ bool Dict::AcceptableChoice(const WERD_CHOICE& best_choice,
  }
 }

-bool Dict::AcceptableResult(WERD_RES* word) {
+bool Dict::AcceptableResult(WERD_RES *word) const {
  if (word->best_choice == NULL) return false;
  float CertaintyThreshold = stopper_nondict_certainty_base - reject_offset_;
  int WordSize;
@@ -448,7 +448,7 @@ void Dict::ReplaceAmbig(int wrong_ngram_begin_index, int wrong_ngram_size,
  }
 }

-int Dict::LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice) {
+int Dict::LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice) const {
  int shortest = MAX_INT32;
  int curr_len = 0;
  for (int w = 0; w < WordChoice.length(); ++w) {

--- a/lstm/lstmrecognizer.h
+++ b/lstm/lstmrecognizer.h
@@ -141,6 +141,8 @@ class LSTMRecognizer {
  bool IsUsingAdaGrad() const { return network_->TestFlag(NF_ADA_GRAD); }
  // Provides access to the UNICHARSET that this classifier works with.
  const UNICHARSET& GetUnicharset() const { return ccutil_.unicharset; }
+  // Provides access to the Dict that this classifier works with.
+  const Dict* GetDict() const { return dict_; }
  // Sets the sample iteration to the given value. The sample_iteration_
  // determines the seed for the random number generator. The training
  // iteration is incremented only by a successful training iteration.