提交 13e46ae1 编写于 作者: R Ray Smith

Made LSTM the default engine, pushed cube out

上级 dc124043
......@@ -288,6 +288,7 @@ void Tesseract::SearchWords(PointerVector<WERD_RES>* words) {
int length = word->best_choice->state(i);
word->best_state.push_back(length);
}
word->reject_map.initialise(word->best_choice->length());
word->tess_failed = false;
word->tess_accepted = true;
word->tess_would_adapt = false;
......
......@@ -109,6 +109,17 @@ bool Tesseract::init_tesseract_lang_data(
tessdata_manager_debug_level)) {
return false;
}
if (oem == OEM_DEFAULT) {
// Set the engine mode from availability, which can then be overridden by
// the config file when we read it below.
if (!tessdata_manager.IsLSTMAvailable()) {
tessedit_ocr_engine_mode.set_value(OEM_TESSERACT_ONLY);
} else if (!tessdata_manager.IsBaseAvailable()) {
tessedit_ocr_engine_mode.set_value(OEM_LSTM_ONLY);
} else {
tessedit_ocr_engine_mode.set_value(OEM_TESSERACT_LSTM_COMBINED);
}
}
// If a language specific config file (lang.config) exists, load it in.
if (tessdata_manager.SeekToStart(TESSDATA_LANG_CONFIG)) {
......@@ -175,9 +186,8 @@ bool Tesseract::init_tesseract_lang_data(
}
// The various OcrEngineMode settings (see publictypes.h) determine which
// engine-specific data files need to be loaded. Currently everything needs
// the base tesseract data, which supplies other useful information, but
// alternative engines, such as LSTM are optional.
// engine-specific data files need to be loaded.
// If LSTM_ONLY is requested, the base Tesseract files are *Not* required.
#ifndef ANDROID_BUILD
if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY ||
tessedit_ocr_engine_mode == OEM_TESSERACT_LSTM_COMBINED) {
......
......@@ -80,11 +80,10 @@ Tesseract::Tesseract()
" 5=line, 6=word, 7=char"
" (Values from PageSegMode enum in publictypes.h)",
this->params()),
INT_INIT_MEMBER(tessedit_ocr_engine_mode, tesseract::OEM_TESSERACT_ONLY,
INT_INIT_MEMBER(tessedit_ocr_engine_mode, tesseract::OEM_DEFAULT,
"Which OCR engine(s) to run (Tesseract, LSTM, both)."
" Defaults to loading and running only Tesseract"
" (no LSTM,no combiner)."
" Values from OcrEngineMode enum in tesseractclass.h)",
" Defaults to loading and running the most accurate"
" available.",
this->params()),
STRING_MEMBER(tessedit_char_blacklist, "",
"Blacklist of chars not to recognize", this->params()),
......
......@@ -849,10 +849,9 @@ class Tesseract : public Wordrec {
"Page seg mode: 0=osd only, 1=auto+osd, 2=auto, 3=col, 4=block,"
" 5=line, 6=word, 7=char"
" (Values from PageSegMode enum in publictypes.h)");
INT_VAR_H(tessedit_ocr_engine_mode, tesseract::OEM_TESSERACT_ONLY,
INT_VAR_H(tessedit_ocr_engine_mode, tesseract::OEM_DEFAULT,
"Which OCR engine(s) to run (Tesseract, LSTM, both). Defaults"
" to loading and running only Tesseract (no LSTM, no combiner)."
" (Values from OcrEngineMode enum in tesseractclass.h)");
" to loading and running the most accurate available.");
STRING_VAR_H(tessedit_char_blacklist, "",
"Blacklist of chars not to recognize");
STRING_VAR_H(tessedit_char_whitelist, "",
......
......@@ -162,21 +162,14 @@ bool TessdataManager::CombineDataFiles(
}
// Make sure that the required components are present.
if (file_ptr[TESSDATA_UNICHARSET] == NULL) {
tprintf("Error opening %sunicharset file\n", language_data_path_prefix);
if (!IncludesBaseComponents(offset_table) &&
!IncludesLSTMComponents(offset_table)) {
tprintf(
    "Error: traineddata file must contain at least (a unicharset file"
    " and inttemp) OR an lstm file.\n");
fclose(output_file);
return false;
}
if (file_ptr[TESSDATA_INTTEMP] != NULL &&
(file_ptr[TESSDATA_PFFMTABLE] == NULL ||
file_ptr[TESSDATA_NORMPROTO] == NULL)) {
tprintf("Error opening %spffmtable and/or %snormproto files"
" while %sinttemp file was present\n", language_data_path_prefix,
language_data_path_prefix, language_data_path_prefix);
fclose(output_file);
return false;
}
return WriteMetadata(offset_table, language_data_path_prefix, output_file);
}
......@@ -256,6 +249,19 @@ bool TessdataManager::TessdataTypeFromFileName(
return TessdataTypeFromFileSuffix(suffix, type, text_file);
}
// Returns true if the base Tesseract components are present.
// The base (non-LSTM) recognizer needs both a unicharset and the
// integer-feature prototypes (inttemp); a negative offset means the
// component is absent from the traineddata file.
/* static */
bool TessdataManager::IncludesBaseComponents(const inT64 *offset_table) {
  bool has_unicharset = offset_table[TESSDATA_UNICHARSET] >= 0;
  bool has_inttemp = offset_table[TESSDATA_INTTEMP] >= 0;
  return has_unicharset && has_inttemp;
}
// Returns true if the LSTM components are present.
// An LSTM-capable traineddata needs only the lstm recognizer entry; a
// negative offset means the component is absent from the file.
/* static */
bool TessdataManager::IncludesLSTMComponents(const inT64 *offset_table) {
  return 0 <= offset_table[TESSDATA_LSTM];
}
bool TessdataManager::ExtractToFile(const char *filename) {
TessdataType type = TESSDATA_NUM_ENTRIES;
bool text_file = false;
......
......@@ -66,8 +66,8 @@ enum TessdataType {
TESSDATA_NUMBER_DAWG, // 8
TESSDATA_FREQ_DAWG, // 9
TESSDATA_FIXED_LENGTH_DAWGS, // 10 // deprecated
TESSDATA_CUBE_UNICHARSET, // 11
TESSDATA_CUBE_SYSTEM_DAWG, // 12
TESSDATA_CUBE_UNICHARSET, // 11 // deprecated
TESSDATA_CUBE_SYSTEM_DAWG, // 12 // deprecated
TESSDATA_SHAPE_TABLE, // 13
TESSDATA_BIGRAM_DAWG, // 14
TESSDATA_UNAMBIG_DAWG, // 15
......@@ -96,8 +96,8 @@ static const char *const kTessdataFileSuffixes[] = {
kNumberDawgFileSuffix, // 8
kFreqDawgFileSuffix, // 9
kFixedLengthDawgsFileSuffix, // 10 // deprecated
kCubeUnicharsetFileSuffix, // 11
kCubeSystemDawgFileSuffix, // 12
kCubeUnicharsetFileSuffix, // 11 // deprecated
kCubeSystemDawgFileSuffix, // 12 // deprecated
kShapeTableFileSuffix, // 13
kBigramDawgFileSuffix, // 14
kUnambigDawgFileSuffix, // 15
......@@ -124,8 +124,8 @@ static const bool kTessdataFileIsText[] = {
false, // 8
false, // 9
false, // 10 // deprecated
true, // 11
false, // 12
true, // 11 // deprecated
false, // 12 // deprecated
false, // 13
false, // 14
false, // 15
......@@ -164,6 +164,12 @@ class TessdataManager {
*/
bool Init(const char *data_file_name, int debug_level);
// Returns true if the base (non-LSTM) Tesseract components — a unicharset
// plus inttemp — are present in the loaded traineddata.
bool IsBaseAvailable() const { return IncludesBaseComponents(offset_table_); }
// Returns true if the LSTM recognizer component is present in the loaded
// traineddata.
bool IsLSTMAvailable() const { return IncludesLSTMComponents(offset_table_); }
// Returns the name of the underlying data file.
const STRING &GetDataFileName() const { return data_file_name_; }
......@@ -280,6 +286,10 @@ class TessdataManager {
bool *text_file);
private:
// Returns true if the base Tesseract components are present.
static bool IncludesBaseComponents(const inT64 *offset_table);
// Returns true if the LSTM components are present.
static bool IncludesLSTMComponents(const inT64 *offset_table);
/**
* Opens the file whose name is a concatenation of language_data_path_prefix
......
......@@ -101,7 +101,7 @@ Pix* Input::PrepareLSTMInputs(const ImageData& image_data,
tprintf("Bad pix from ImageData!\n");
return nullptr;
}
if (width <= min_width) {
if (width <= min_width || height < min_width) {
tprintf("Image too small to scale!! (%dx%d vs min width of %d)\n", width,
height, min_width);
pixDestroy(&pix);
......
此差异已折叠。
......@@ -31,6 +31,60 @@
namespace tesseract {
// Enum describing what can follow the current node.
// Consider the following softmax outputs:
// Timestep 0 1 2 3 4 5 6 7 8
// X-score 0.01 0.55 0.98 0.42 0.01 0.01 0.40 0.95 0.01
// Y-score 0.00 0.01 0.01 0.01 0.01 0.97 0.59 0.04 0.01
// Null-score 0.99 0.44 0.01 0.57 0.98 0.02 0.01 0.01 0.98
// Then the correct CTC decoding (in which adjacent equal classes are folded,
// and then all nulls are dropped) is clearly XYX, but simple decoding (taking
// the max at each timestep) leads to:
// Null@0.99 X@0.55 X@0.98 Null@0.57 Null@0.98 Y@0.97 Y@0.59 X@0.95 Null@0.98,
// which folds to the correct XYX. The conversion to Tesseract rating and
// certainty uses the sum of the log probs (log of the product of probabilities)
// for the Rating and the minimum log prob for the certainty, but that yields a
// minimum certainty of log(0.55), which is poor for such an obvious case.
// CTC says that the probability of the result is the SUM of the products of the
// probabilities over ALL PATHS that decode to the same result, which includes:
// NXXNNYYXN, NNXNNYYN, NXXXNYYXN, NNXXNYXXN, and others including XXXXXYYXX.
// That is intractable, so some compromise between simple and ideal is needed.
// Observing that evenly split timesteps rarely happen next to each other, we
// allow scores at a transition between classes to be added for decoding thus:
// N@0.99 (N+X)@0.99 X@0.98 (N+X)@0.99 N@0.98 Y@0.97 (X+Y+N)@1.00 X@0.95 N@0.98.
// This works because NNX and NXX both decode to X, so in the middle we can use
// N+X. Note that the classes either side of a sum must stand alone, i.e. use a
// single score, to force all paths to pass through them and decode to the same
// result. Also in the special case of a transition from X to Y, with only one
// timestep between, it is possible to add X+Y+N, since XXY, XYY, and XNY all
// decode to XY.
// An important condition is that we cannot combine X and Null between two
// stand-alone Xs, since that can decode as XNX->XX or XXX->X, so the scores for
// X and Null have to go in separate paths. Combining scores in this way
// provides a much better minimum certainty of log(0.95).
// In the implementation of the beam search, we have to place the possibilities
// X, X+N and X+Y+N in the beam under appropriate conditions of the previous
// node, and constrain what can follow, to enforce the rules explained above.
// We therefore have 3 different types of node determined by what can follow:
// Type of a beam-search node, classified by what may legally follow it.
// The constraints enforce the score-combining rules described in the long
// comment above, so that all paths through a combined score decode to the
// same string.
enum NodeContinuation {
  NC_ANYTHING,  // This node used just its own score, so anything can follow.
  NC_ONLY_DUP,  // The current node combined another score with the score for
                // itself, without a stand-alone duplicate before, so must be
                // followed by a stand-alone duplicate.
  NC_NO_DUP,    // The current node combined another score with the score for
                // itself, after a stand-alone, so can only be followed by
                // something other than a duplicate of the current node.
  NC_COUNT      // Number of real values; used for array sizing/iteration.
};
// Enum describing the top-n status of a code at the current timestep.
enum TopNState {
  TN_TOP2,      // Winner or 2nd.
  TN_TOPN,      // Runner up in top-n, but not 1st or 2nd.
  TN_ALSO_RAN,  // Not in the top-n.
  TN_COUNT      // Number of real values; used for array sizing/iteration.
};
// Lattice element for Re-encode beam search.
struct RecodeNode {
RecodeNode()
......@@ -44,10 +98,11 @@ struct RecodeNode {
certainty(0.0f),
score(0.0f),
prev(NULL),
dawgs(NULL) {}
dawgs(NULL),
code_hash(0) {}
RecodeNode(int c, int uni_id, PermuterType perm, bool dawg_start,
bool word_start, bool end, bool dup, float cert, float s,
const RecodeNode* p, DawgPositionVector* d)
const RecodeNode* p, DawgPositionVector* d, uinT64 hash)
: code(c),
unichar_id(uni_id),
permuter(perm),
......@@ -58,7 +113,8 @@ struct RecodeNode {
certainty(cert),
score(s),
prev(p),
dawgs(d) {}
dawgs(d),
code_hash(hash) {}
// NOTE: If we could use C++11, then this would be a move constructor.
// Instead we have copy constructor that does a move!! This is because we
// don't want to copy the whole DawgPositionVector each time, and true
......@@ -75,6 +131,8 @@ struct RecodeNode {
return *this;
}
~RecodeNode() { delete dawgs; }
// Prints details of the node.
void Print(int null_char, const UNICHARSET& unicharset, int depth) const;
// The re-encoded code here = index to network output.
int code;
......@@ -93,10 +151,10 @@ struct RecodeNode {
// necessarily mark the end of a word, since a word can be extended beyond a
// candidate end by a continuation, eg 'the' continues to 'these'.
bool end_of_word;
// True if this is a duplicate of prev in all respects. Some training modes
// True if this->code is a duplicate of prev->code. Some training modes
// allow the network to output duplicate characters and crush them with CTC,
// but that would mess up the decoding, so we just smash them together on the
// fly using the duplicate flag.
// but that would mess up the dictionary search, so we just smash them
// together on the fly using the duplicate flag.
bool duplicate;
// Certainty (log prob) of (just) this position.
float certainty;
......@@ -106,6 +164,9 @@ struct RecodeNode {
const RecodeNode* prev;
// The currently active dawgs at this position. Owned pointer.
DawgPositionVector* dawgs;
// A hash of all codes in the prefix and this->code as well. Used for
// duplicate path removal.
uinT64 code_hash;
};
typedef KDPairInc<double, RecodeNode> RecodePair;
......@@ -150,6 +211,23 @@ class RecodeBeamSearch {
// Supposedly on a uniform scale that can be compared across languages and
// engines.
static const float kMinCertainty;
// Number of different code lengths for which we have a separate beam.
static const int kNumLengths = RecodedCharID::kMaxCodeLen + 1;
// Total number of beams: dawg/nodawg * number of NodeContinuation * number
// of different lengths.
static const int kNumBeams = 2 * NC_COUNT * kNumLengths;
// Extracts the code-length factor back out of a beams_ index. Length is the
// fastest-varying factor of the packing computed by BeamIndex below.
static int LengthFromBeamsIndex(int index) { return index % kNumLengths; }
// Extracts the NodeContinuation (middle factor) out of a beams_ index.
static NodeContinuation ContinuationFromBeamsIndex(int index) {
  return static_cast<NodeContinuation>((index / kNumLengths) % NC_COUNT);
}
// Extracts the dawg flag (slowest-varying factor) out of a beams_ index.
static bool IsDawgFromBeamsIndex(int index) {
  return index / (kNumLengths * NC_COUNT) > 0;
}
// Computes a beams_ index from the given factors; inverse of the three
// accessors above: ((is_dawg * NC_COUNT) + cont) * kNumLengths + length.
static int BeamIndex(bool is_dawg, NodeContinuation cont, int length) {
  return (is_dawg * NC_COUNT + cont) * kNumLengths + length;
}
private:
// Struct for the Re-encode beam search. This struct holds the data for
......@@ -158,31 +236,31 @@ class RecodeBeamSearch {
struct RecodeBeam {
// Resets to the initial state without deleting all the memory.
void Clear() {
for (int i = 0; i <= RecodedCharID::kMaxCodeLen; ++i) {
for (int i = 0; i < kNumBeams; ++i) {
beams_[i].clear();
dawg_beams_[i].clear();
}
RecodeNode empty;
best_initial_dawg_ = empty;
for (int i = 0; i < NC_COUNT; ++i) {
best_initial_dawgs_[i] = empty;
}
}
// A separate beam for each code position. Since there aren't that many
// code positions, this allows the beam to be quite narrow, and yet still
// have a low chance of losing the best path.
// A separate beam for each combination of code length,
// NodeContinuation, and dictionary flag. Separating out all these types
// allows the beam to be quite narrow, and yet still have a low chance of
// losing the best path.
// We have to keep all these beams separate, since the highest scoring paths
// come from the paths that are most likely to dead-end at any time, like
// dawg paths, NC_ONLY_DUP etc.
// Each heap is stored with the WORST result at the top, so we can quickly
// get the top-n values.
RecodeHeap beams_[RecodedCharID::kMaxCodeLen + 1];
// Although, we can only use complete codes in the dawg, we have to separate
// partial code paths that lead back to a mid-dawg word from paths that are
// not part of a dawg word, as they have a different score. Since a dawg
// word can dead-end at any point, we need to keep the non dawg path going
// so the dawg beams_ are totally separate set with a heap for each length
// just like the non-dawg beams.
RecodeHeap dawg_beams_[RecodedCharID::kMaxCodeLen + 1];
RecodeHeap beams_[kNumBeams];
// While the language model is only a single word dictionary, we can use
// word starts as a choke point in the beam, and keep only a single dict
// start node at each step, so we find the best one here and push it on
// the heap, if it qualifies, after processing all of the step.
RecodeNode best_initial_dawg_;
// start node at each step (for each NodeContinuation type), so we find the
// best one here and push it on the heap, if it qualifies, after processing
// all of the step.
RecodeNode best_initial_dawgs_[NC_COUNT];
};
typedef KDPairInc<float, int> TopPair;
......@@ -216,43 +294,50 @@ class RecodeBeamSearch {
const UNICHARSET* charset);
// Adds to the appropriate beams the legal (according to recoder)
// continuations of context prev, which is of the given length, using the
// given network outputs to provide scores to the choices. Uses only those
// choices for which top_n_flags[index] == top_n_flag.
void ContinueContext(const RecodeNode* prev, int length, const float* outputs,
bool use_dawgs, bool top_n_flag, double dict_ratio,
// continuations of context prev, which is from the given index to beams_,
// using the given network outputs to provide scores to the choices. Uses only
// those choices for which top_n_flags[code] == top_n_flag.
void ContinueContext(const RecodeNode* prev, int index, const float* outputs,
TopNState top_n_flag, double dict_ratio,
double cert_offset, double worst_dict_cert,
RecodeBeam* step);
// Adds a RecodeNode composed of the tuple (code, unichar_id, cert, prev,
// appropriate-dawg-args, cert) to the given heap (dawg_beam_) if unichar_id
// is a valid continuation of whatever is in prev.
void ContinueDawg(int max_size, int code, int unichar_id, float cert,
const RecodeNode* prev, RecodeHeap* heap, RecodeBeam* step);
// Adds a RecodeNode composed of the tuple (code, unichar_id,
// initial-dawg-state, prev, cert) to the given heap if/ there is room or if
// better than the current worst element if already full.
// Continues for a new unichar, using dawg or non-dawg as per flag.
void ContinueUnichar(int code, int unichar_id, float cert,
float worst_dict_cert, float dict_ratio, bool use_dawgs,
NodeContinuation cont, const RecodeNode* prev,
RecodeBeam* step);
// Adds a RecodeNode composed of the args to the correct heap in step if
// unichar_id is a valid dictionary continuation of whatever is in prev.
void ContinueDawg(int code, int unichar_id, float cert, NodeContinuation cont,
const RecodeNode* prev, RecodeBeam* step);
// Sets the correct best_initial_dawgs_ with a RecodeNode composed of the args
// if better than what is already there.
void PushInitialDawgIfBetter(int code, int unichar_id, PermuterType permuter,
bool start, bool end, float cert,
const RecodeNode* prev,
RecodeNode* best_initial_dawg);
// Adds a copy of the given prev as a duplicate of and successor to prev, if
// there is room or if better than the current worst element if already full.
static void PushDupIfBetter(int max_size, float cert, const RecodeNode* prev,
RecodeHeap* heap);
// Adds a RecodeNode composed of the tuple (code, unichar_id, permuter,
// false, false, false, false, cert, prev, NULL) to heap if there is room
// or if better than the current worst element if already full.
static void PushNoDawgIfBetter(int max_size, int code, int unichar_id,
PermuterType permuter, float cert,
const RecodeNode* prev, RecodeHeap* heap);
// Adds a RecodeNode composed of the tuple (code, unichar_id, permuter,
// dawg_start, word_start, end, dup, cert, prev, d) to heap if there is room
NodeContinuation cont, const RecodeNode* prev,
RecodeBeam* step);
// Adds a RecodeNode composed of the args to the correct heap in step for
// partial unichar or duplicate if there is room or if better than the
// current worst element if already full.
void PushDupOrNoDawgIfBetter(int length, bool dup, int code, int unichar_id,
float cert, float worst_dict_cert,
float dict_ratio, bool use_dawgs,
NodeContinuation cont, const RecodeNode* prev,
RecodeBeam* step);
// Adds a RecodeNode composed of the args to the correct heap in step if there
// is room or if better than the current worst element if already full.
void PushHeapIfBetter(int max_size, int code, int unichar_id,
PermuterType permuter, bool dawg_start, bool word_start,
bool end, bool dup, float cert, const RecodeNode* prev,
DawgPositionVector* d, RecodeHeap* heap);
// Adds a RecodeNode to heap if there is room
// or if better than the current worst element if already full.
static void PushHeapIfBetter(int max_size, int code, int unichar_id,
PermuterType permuter, bool dawg_start,
bool word_start, bool end, bool dup, float cert,
const RecodeNode* prev, DawgPositionVector* d,
RecodeHeap* heap);
void PushHeapIfBetter(int max_size, RecodeNode* node, RecodeHeap* heap);
// Searches the heap for an entry matching new_node, and updates the entry
// with reshuffle if needed. Returns true if there was a match.
bool UpdateHeapIfMatched(RecodeNode* new_node, RecodeHeap* heap);
// Computes and returns the code-hash for the given code and prev.
uinT64 ComputeCodeHash(int code, bool dup, const RecodeNode* prev) const;
// Backtracks to extract the best path through the lattice that was built
// during Decode. On return the best_nodes vector essentially contains the set
// of code, score pairs that make the optimal path with the constraint that
......@@ -284,7 +369,10 @@ class RecodeBeamSearch {
int beam_size_;
// A flag to indicate which outputs are the top-n choices. Current timestep
// only.
GenericVector<bool> top_n_flags_;
GenericVector<TopNState> top_n_flags_;
// A record of the highest and second scoring codes.
int top_code_;
int second_code_;
// Heap used to compute the top_n_flags_.
GenericHeap<TopPair> top_heap_;
// Borrowed pointer to the dictionary to use in the search.
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册