提交 13e46ae1 编写于 作者: R Ray Smith

Made LSTM the default engine, pushed cube out

上级 dc124043
......@@ -288,6 +288,7 @@ void Tesseract::SearchWords(PointerVector<WERD_RES>* words) {
int length = word->best_choice->state(i);
word->best_state.push_back(length);
}
word->reject_map.initialise(word->best_choice->length());
word->tess_failed = false;
word->tess_accepted = true;
word->tess_would_adapt = false;
......
......@@ -109,6 +109,17 @@ bool Tesseract::init_tesseract_lang_data(
tessdata_manager_debug_level)) {
return false;
}
if (oem == OEM_DEFAULT) {
// Set the engine mode from availability, which can then be overridden by
// the config file when we read it below.
if (!tessdata_manager.IsLSTMAvailable()) {
tessedit_ocr_engine_mode.set_value(OEM_TESSERACT_ONLY);
} else if (!tessdata_manager.IsBaseAvailable()) {
tessedit_ocr_engine_mode.set_value(OEM_LSTM_ONLY);
} else {
tessedit_ocr_engine_mode.set_value(OEM_TESSERACT_LSTM_COMBINED);
}
}
// If a language specific config file (lang.config) exists, load it in.
if (tessdata_manager.SeekToStart(TESSDATA_LANG_CONFIG)) {
......@@ -175,9 +186,8 @@ bool Tesseract::init_tesseract_lang_data(
}
// The various OcrEngineMode settings (see publictypes.h) determine which
// engine-specific data files need to be loaded. Currently everything needs
// the base tesseract data, which supplies other useful information, but
// alternative engines, such as LSTM are optional.
// engine-specific data files need to be loaded.
// If LSTM_ONLY is requested, the base Tesseract files are *Not* required.
#ifndef ANDROID_BUILD
if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY ||
tessedit_ocr_engine_mode == OEM_TESSERACT_LSTM_COMBINED) {
......
......@@ -80,11 +80,10 @@ Tesseract::Tesseract()
" 5=line, 6=word, 7=char"
" (Values from PageSegMode enum in publictypes.h)",
this->params()),
INT_INIT_MEMBER(tessedit_ocr_engine_mode, tesseract::OEM_TESSERACT_ONLY,
INT_INIT_MEMBER(tessedit_ocr_engine_mode, tesseract::OEM_DEFAULT,
"Which OCR engine(s) to run (Tesseract, LSTM, both)."
" Defaults to loading and running only Tesseract"
" (no LSTM,no combiner)."
" Values from OcrEngineMode enum in tesseractclass.h)",
" Defaults to loading and running the most accurate"
" available.",
this->params()),
STRING_MEMBER(tessedit_char_blacklist, "",
"Blacklist of chars not to recognize", this->params()),
......
......@@ -849,10 +849,9 @@ class Tesseract : public Wordrec {
"Page seg mode: 0=osd only, 1=auto+osd, 2=auto, 3=col, 4=block,"
" 5=line, 6=word, 7=char"
" (Values from PageSegMode enum in publictypes.h)");
INT_VAR_H(tessedit_ocr_engine_mode, tesseract::OEM_TESSERACT_ONLY,
INT_VAR_H(tessedit_ocr_engine_mode, tesseract::OEM_DEFAULT,
"Which OCR engine(s) to run (Tesseract, LSTM, both). Defaults"
" to loading and running only Tesseract (no LSTM, no combiner)."
" (Values from OcrEngineMode enum in tesseractclass.h)");
" to loading and running the most accurate available.");
STRING_VAR_H(tessedit_char_blacklist, "",
"Blacklist of chars not to recognize");
STRING_VAR_H(tessedit_char_whitelist, "",
......
......@@ -162,21 +162,14 @@ bool TessdataManager::CombineDataFiles(
}
// Make sure that the required components are present.
if (file_ptr[TESSDATA_UNICHARSET] == NULL) {
tprintf("Error opening %sunicharset file\n", language_data_path_prefix);
if (!IncludesBaseComponents(offset_table) &&
!IncludesLSTMComponents(offset_table)) {
tprintf(
    "Error: traineddata file must contain at least (a unicharset file"
    " and inttemp) OR an lstm file.\n");
fclose(output_file);
return false;
}
if (file_ptr[TESSDATA_INTTEMP] != NULL &&
(file_ptr[TESSDATA_PFFMTABLE] == NULL ||
file_ptr[TESSDATA_NORMPROTO] == NULL)) {
tprintf("Error opening %spffmtable and/or %snormproto files"
" while %sinttemp file was present\n", language_data_path_prefix,
language_data_path_prefix, language_data_path_prefix);
fclose(output_file);
return false;
}
return WriteMetadata(offset_table, language_data_path_prefix, output_file);
}
......@@ -256,6 +249,19 @@ bool TessdataManager::TessdataTypeFromFileName(
return TessdataTypeFromFileSuffix(suffix, type, text_file);
}
// Returns true if the base Tesseract components are present.
// The base (non-LSTM) recognizer needs both a unicharset and the
// integer-feature prototypes (inttemp); a negative offset means the
// component is absent from the traineddata file.
/* static */
bool TessdataManager::IncludesBaseComponents(const inT64 *offset_table) {
  bool has_unicharset = offset_table[TESSDATA_UNICHARSET] >= 0;
  bool has_inttemp = offset_table[TESSDATA_INTTEMP] >= 0;
  return has_unicharset && has_inttemp;
}
// Returns true if the LSTM components are present.
// An LSTM-capable traineddata needs only the lstm recognizer entry; a
// negative offset means the component is absent from the file.
/* static */
bool TessdataManager::IncludesLSTMComponents(const inT64 *offset_table) {
  return 0 <= offset_table[TESSDATA_LSTM];
}
bool TessdataManager::ExtractToFile(const char *filename) {
TessdataType type = TESSDATA_NUM_ENTRIES;
bool text_file = false;
......
......@@ -66,8 +66,8 @@ enum TessdataType {
TESSDATA_NUMBER_DAWG, // 8
TESSDATA_FREQ_DAWG, // 9
TESSDATA_FIXED_LENGTH_DAWGS, // 10 // deprecated
TESSDATA_CUBE_UNICHARSET, // 11
TESSDATA_CUBE_SYSTEM_DAWG, // 12
TESSDATA_CUBE_UNICHARSET, // 11 // deprecated
TESSDATA_CUBE_SYSTEM_DAWG, // 12 // deprecated
TESSDATA_SHAPE_TABLE, // 13
TESSDATA_BIGRAM_DAWG, // 14
TESSDATA_UNAMBIG_DAWG, // 15
......@@ -96,8 +96,8 @@ static const char *const kTessdataFileSuffixes[] = {
kNumberDawgFileSuffix, // 8
kFreqDawgFileSuffix, // 9
kFixedLengthDawgsFileSuffix, // 10 // deprecated
kCubeUnicharsetFileSuffix, // 11
kCubeSystemDawgFileSuffix, // 12
kCubeUnicharsetFileSuffix, // 11 // deprecated
kCubeSystemDawgFileSuffix, // 12 // deprecated
kShapeTableFileSuffix, // 13
kBigramDawgFileSuffix, // 14
kUnambigDawgFileSuffix, // 15
......@@ -124,8 +124,8 @@ static const bool kTessdataFileIsText[] = {
false, // 8
false, // 9
false, // 10 // deprecated
true, // 11
false, // 12
true, // 11 // deprecated
false, // 12 // deprecated
false, // 13
false, // 14
false, // 15
......@@ -164,6 +164,12 @@ class TessdataManager {
*/
bool Init(const char *data_file_name, int debug_level);
// Returns true if the base (non-LSTM) Tesseract components — a unicharset
// plus inttemp — are present in the loaded traineddata.
bool IsBaseAvailable() const { return IncludesBaseComponents(offset_table_); }
// Returns true if the LSTM recognizer component is present in the loaded
// traineddata.
bool IsLSTMAvailable() const { return IncludesLSTMComponents(offset_table_); }
// Returns the name of the underlying data file.
const STRING &GetDataFileName() const { return data_file_name_; }
......@@ -280,6 +286,10 @@ class TessdataManager {
bool *text_file);
private:
// Returns true if the base Tesseract components are present.
static bool IncludesBaseComponents(const inT64 *offset_table);
// Returns true if the LSTM components are present.
static bool IncludesLSTMComponents(const inT64 *offset_table);
/**
* Opens the file whose name is a concatenation of language_data_path_prefix
......
......@@ -101,7 +101,7 @@ Pix* Input::PrepareLSTMInputs(const ImageData& image_data,
tprintf("Bad pix from ImageData!\n");
return nullptr;
}
if (width <= min_width) {
if (width <= min_width || height < min_width) {
tprintf("Image too small to scale!! (%dx%d vs min width of %d)\n", width,
height, min_width);
pixDestroy(&pix);
......
此差异已折叠。
......@@ -31,6 +31,60 @@
namespace tesseract {
// Enum describing what can follow the current node.
// Consider the following softmax outputs:
// Timestep 0 1 2 3 4 5 6 7 8
// X-score 0.01 0.55 0.98 0.42 0.01 0.01 0.40 0.95 0.01
// Y-score 0.00 0.01 0.01 0.01 0.01 0.97 0.59 0.04 0.01
// Null-score 0.99 0.44 0.01 0.57 0.98 0.02 0.01 0.01 0.98
// Then the correct CTC decoding (in which adjacent equal classes are folded,
// and then all nulls are dropped) is clearly XYX, but simple decoding (taking
// the max at each timestep) leads to:
// Null@0.99 X@0.55 X@0.98 Null@0.57 Null@0.98 Y@0.97 Y@0.59 X@0.95 Null@0.98,
// which folds to the correct XYX. The conversion to Tesseract rating and
// certainty uses the sum of the log probs (log of the product of probabilities)
// for the Rating and the minimum log prob for the certainty, but that yields a
// minimum certainty of log(0.55), which is poor for such an obvious case.
// CTC says that the probability of the result is the SUM of the products of the
// probabilities over ALL PATHS that decode to the same result, which includes:
// NXXNNYYXN, NNXNNYYN, NXXXNYYXN, NNXXNYXXN, and others including XXXXXYYXX.
// That is intractable, so some compromise between simple and ideal is needed.
// Observing that evenly split timesteps rarely happen next to each other, we
// allow scores at a transition between classes to be added for decoding thus:
// N@0.99 (N+X)@0.99 X@0.98 (N+X)@0.99 N@0.98 Y@0.97 (X+Y+N)@1.00 X@0.95 N@0.98.
// This works because NNX and NXX both decode to X, so in the middle we can use
// N+X. Note that the classes either side of a sum must stand alone, i.e. use a
// single score, to force all paths to pass through them and decode to the same
// result. Also in the special case of a transition from X to Y, with only one
// timestep between, it is possible to add X+Y+N, since XXY, XYY, and XNY all
// decode to XY.
// An important condition is that we cannot combine X and Null between two
// stand-alone Xs, since that can decode as XNX->XX or XXX->X, so the scores for
// X and Null have to go in separate paths. Combining scores in this way
// provides a much better minimum certainty of log(0.95).
// In the implementation of the beam search, we have to place the possibilities
// X, X+N and X+Y+N in the beam under appropriate conditions of the previous
// node, and constrain what can follow, to enforce the rules explained above.
// We therefore have 3 different types of node determined by what can follow:
// Type of a beam-search node, classified by what may legally follow it.
// The constraints enforce the score-combining rules described in the long
// comment above, so that all paths through a combined score decode to the
// same string.
enum NodeContinuation {
  NC_ANYTHING,  // This node used just its own score, so anything can follow.
  NC_ONLY_DUP,  // The current node combined another score with the score for
                // itself, without a stand-alone duplicate before, so must be
                // followed by a stand-alone duplicate.
  NC_NO_DUP,    // The current node combined another score with the score for
                // itself, after a stand-alone, so can only be followed by
                // something other than a duplicate of the current node.
  NC_COUNT      // Number of real values; used for array sizing/iteration.
};
// Enum describing the top-n status of a code at the current timestep.
enum TopNState {
  TN_TOP2,      // Winner or 2nd.
  TN_TOPN,      // Runner up in top-n, but not 1st or 2nd.
  TN_ALSO_RAN,  // Not in the top-n.
  TN_COUNT      // Number of real values; used for array sizing/iteration.
};
// Lattice element for Re-encode beam search.
struct RecodeNode {
RecodeNode()
......@@ -44,10 +98,11 @@ struct RecodeNode {
certainty(0.0f),
score(0.0f),
prev(NULL),
dawgs(NULL) {}
dawgs(NULL),
code_hash(0) {}
RecodeNode(int c, int uni_id, PermuterType perm, bool dawg_start,
bool word_start, bool end, bool dup, float cert, float s,
const RecodeNode* p, DawgPositionVector* d)
const RecodeNode* p, DawgPositionVector* d, uinT64 hash)
: code(c),
unichar_id(uni_id),
permuter(perm),
......@@ -58,7 +113,8 @@ struct RecodeNode {
certainty(cert),
score(s),
prev(p),
dawgs(d) {}
dawgs(d),
code_hash(hash) {}
// NOTE: If we could use C++11, then this would be a move constructor.
// Instead we have copy constructor that does a move!! This is because we
// don't want to copy the whole DawgPositionVector each time, and true
......@@ -75,6 +131,8 @@ struct RecodeNode {
return *this;
}
~RecodeNode() { delete dawgs; }
// Prints details of the node.
void Print(int null_char, const UNICHARSET& unicharset, int depth) const;
// The re-encoded code here = index to network output.
int code;
......@@ -93,10 +151,10 @@ struct RecodeNode {
// necessarily mark the end of a word, since a word can be extended beyond a
// candidate end by a continuation, eg 'the' continues to 'these'.
bool end_of_word;
// True if this is a duplicate of prev in all respects. Some training modes
// True if this->code is a duplicate of prev->code. Some training modes
// allow the network to output duplicate characters and crush them with CTC,
// but that would mess up the decoding, so we just smash them together on the
// fly using the duplicate flag.
// but that would mess up the dictionary search, so we just smash them
// together on the fly using the duplicate flag.
bool duplicate;
// Certainty (log prob) of (just) this position.
float certainty;
......@@ -106,6 +164,9 @@ struct RecodeNode {
const RecodeNode* prev;
// The currently active dawgs at this position. Owned pointer.
DawgPositionVector* dawgs;
// A hash of all codes in the prefix and this->code as well. Used for
// duplicate path removal.
uinT64 code_hash;
};
typedef KDPairInc<double, RecodeNode> RecodePair;
......@@ -150,6 +211,23 @@ class RecodeBeamSearch {
// Supposedly on a uniform scale that can be compared across languages and
// engines.
static const float kMinCertainty;
// Number of different code lengths for which we have a separate beam.
static const int kNumLengths = RecodedCharID::kMaxCodeLen + 1;
// Total number of beams: dawg/nodawg * number of NodeContinuation * number
// of different lengths.
static const int kNumBeams = 2 * NC_COUNT * kNumLengths;
// Extracts the code-length factor back out of a beams_ index. Length is the
// fastest-varying factor of the packing computed by BeamIndex below.
static int LengthFromBeamsIndex(int index) { return index % kNumLengths; }
// Extracts the NodeContinuation (middle factor) out of a beams_ index.
static NodeContinuation ContinuationFromBeamsIndex(int index) {
  return static_cast<NodeContinuation>((index / kNumLengths) % NC_COUNT);
}
// Extracts the dawg flag (slowest-varying factor) out of a beams_ index.
static bool IsDawgFromBeamsIndex(int index) {
  return index / (kNumLengths * NC_COUNT) > 0;
}
// Computes a beams_ index from the given factors; inverse of the three
// accessors above: ((is_dawg * NC_COUNT) + cont) * kNumLengths + length.
static int BeamIndex(bool is_dawg, NodeContinuation cont, int length) {
  return (is_dawg * NC_COUNT + cont) * kNumLengths + length;
}
private:
// Struct for the Re-encode beam search. This struct holds the data for
......@@ -158,31 +236,31 @@ class RecodeBeamSearch {
struct RecodeBeam {
// Resets to the initial state without deleting all the memory.
void Clear() {
for (int i = 0; i <= RecodedCharID::kMaxCodeLen; ++i) {
for (int i = 0; i < kNumBeams; ++i) {
beams_[i].clear();
dawg_beams_[i].clear();
}
RecodeNode empty;
best_initial_dawg_ = empty;
for (int i = 0; i < NC_COUNT; ++i) {
best_initial_dawgs_[i] = empty;
}
}
// A separate beam for each code position. Since there aren't that many
// code positions, this allows the beam to be quite narrow, and yet still
// have a low chance of losing the best path.
// A separate beam for each combination of code length,
// NodeContinuation, and dictionary flag. Separating out all these types
// allows the beam to be quite narrow, and yet still have a low chance of
// losing the best path.
// We have to keep all these beams separate, since the highest scoring paths
// come from the paths that are most likely to dead-end at any time, like
// dawg paths, NC_ONLY_DUP etc.
// Each heap is stored with the WORST result at the top, so we can quickly
// get the top-n values.
RecodeHeap beams_[RecodedCharID::kMaxCodeLen + 1];
// Although, we can only use complete codes in the dawg, we have to separate
// partial code paths that lead back to a mid-dawg word from paths that are
// not part of a dawg word, as they have a different score. Since a dawg
// word can dead-end at any point, we need to keep the non dawg path going
// so the dawg beams_ are totally separate set with a heap for each length
// just like the non-dawg beams.
RecodeHeap dawg_beams_[RecodedCharID::kMaxCodeLen + 1];
RecodeHeap beams_[kNumBeams];
// While the language model is only a single word dictionary, we can use
// word starts as a choke point in the beam, and keep only a single dict
// start node at each step, so we find the best one here and push it on
// the heap, if it qualifies, after processing all of the step.
RecodeNode best_initial_dawg_;
// start node at each step (for each NodeContinuation type), so we find the
// best one here and push it on the heap, if it qualifies, after processing
// all of the step.
RecodeNode best_initial_dawgs_[NC_COUNT];
};
typedef KDPairInc<float, int> TopPair;
......@@ -216,43 +294,50 @@ class RecodeBeamSearch {
const UNICHARSET* charset);
// Adds to the appropriate beams the legal (according to recoder)
// continuations of context prev, which is of the given length, using the
// given network outputs to provide scores to the choices. Uses only those
// choices for which top_n_flags[index] == top_n_flag.
void ContinueContext(const RecodeNode* prev, int length, const float* outputs,
bool use_dawgs, bool top_n_flag, double dict_ratio,
// continuations of context prev, which is from the given index to beams_,
// using the given network outputs to provide scores to the choices. Uses only
// those choices for which top_n_flags[code] == top_n_flag.
void ContinueContext(const RecodeNode* prev, int index, const float* outputs,
TopNState top_n_flag, double dict_ratio,
double cert_offset, double worst_dict_cert,
RecodeBeam* step);
// Adds a RecodeNode composed of the tuple (code, unichar_id, cert, prev,
// appropriate-dawg-args, cert) to the given heap (dawg_beam_) if unichar_id
// is a valid continuation of whatever is in prev.
void ContinueDawg(int max_size, int code, int unichar_id, float cert,
const RecodeNode* prev, RecodeHeap* heap, RecodeBeam* step);
// Adds a RecodeNode composed of the tuple (code, unichar_id,
// initial-dawg-state, prev, cert) to the given heap if/ there is room or if
// better than the current worst element if already full.
// Continues for a new unichar, using dawg or non-dawg as per flag.
void ContinueUnichar(int code, int unichar_id, float cert,
float worst_dict_cert, float dict_ratio, bool use_dawgs,
NodeContinuation cont, const RecodeNode* prev,
RecodeBeam* step);
// Adds a RecodeNode composed of the args to the correct heap in step if
// unichar_id is a valid dictionary continuation of whatever is in prev.
void ContinueDawg(int code, int unichar_id, float cert, NodeContinuation cont,
const RecodeNode* prev, RecodeBeam* step);
// Sets the correct best_initial_dawgs_ with a RecodeNode composed of the args
// if better than what is already there.
void PushInitialDawgIfBetter(int code, int unichar_id, PermuterType permuter,
bool start, bool end, float cert,
const RecodeNode* prev,
RecodeNode* best_initial_dawg);
// Adds a copy of the given prev as a duplicate of and successor to prev, if
// there is room or if better than the current worst element if already full.
static void PushDupIfBetter(int max_size, float cert, const RecodeNode* prev,
RecodeHeap* heap);
// Adds a RecodeNode composed of the tuple (code, unichar_id, permuter,
// false, false, false, false, cert, prev, NULL) to heap if there is room
// or if better than the current worst element if already full.
static void PushNoDawgIfBetter(int max_size, int code, int unichar_id,
PermuterType permuter, float cert,
const RecodeNode* prev, RecodeHeap* heap);
// Adds a RecodeNode composed of the tuple (code, unichar_id, permuter,
// dawg_start, word_start, end, dup, cert, prev, d) to heap if there is room
NodeContinuation cont, const RecodeNode* prev,
RecodeBeam* step);
// Adds a RecodeNode composed of the args to the correct heap in step for
// partial unichar or duplicate if there is room or if better than the
// current worst element if already full.
void PushDupOrNoDawgIfBetter(int length, bool dup, int code, int unichar_id,
float cert, float worst_dict_cert,
float dict_ratio, bool use_dawgs,
NodeContinuation cont, const RecodeNode* prev,
RecodeBeam* step);
// Adds a RecodeNode composed of the args to the correct heap in step if there
// is room or if better than the current worst element if already full.
void PushHeapIfBetter(int max_size, int code, int unichar_id,
PermuterType permuter, bool dawg_start, bool word_start,
bool end, bool dup, float cert, const RecodeNode* prev,
DawgPositionVector* d, RecodeHeap* heap);
// Adds a RecodeNode to heap if there is room
// or if better than the current worst element if already full.
static void PushHeapIfBetter(int max_size, int code, int unichar_id,
PermuterType permuter, bool dawg_start,
bool word_start, bool end, bool dup, float cert,
const RecodeNode* prev, DawgPositionVector* d,
RecodeHeap* heap);
void PushHeapIfBetter(int max_size, RecodeNode* node, RecodeHeap* heap);
// Searches the heap for an entry matching new_node, and updates the entry
// with reshuffle if needed. Returns true if there was a match.
bool UpdateHeapIfMatched(RecodeNode* new_node, RecodeHeap* heap);
// Computes and returns the code-hash for the given code and prev.
uinT64 ComputeCodeHash(int code, bool dup, const RecodeNode* prev) const;
// Backtracks to extract the best path through the lattice that was built
// during Decode. On return the best_nodes vector essentially contains the set
// of code, score pairs that make the optimal path with the constraint that
......@@ -284,7 +369,10 @@ class RecodeBeamSearch {
int beam_size_;
// A flag to indicate which outputs are the top-n choices. Current timestep
// only.
GenericVector<bool> top_n_flags_;
GenericVector<TopNState> top_n_flags_;
// A record of the highest and second scoring codes.
int top_code_;
int second_code_;
// Heap used to compute the top_n_flags_.
GenericHeap<TopPair> top_heap_;
// Borrowed pointer to the dictionary to use in the search.
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册