提交 fdd4ffe8 编写于 作者: T theraysmith@gmail.com

Fixed endian bug in dawg reader, Added word bigram correction,

git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@649 d0cd1f9f-072b-0410-8dd7-cf729c803f20
上级 6e3d810c
......@@ -98,6 +98,32 @@ int Dawg::check_for_words(const char *filename,
return misses;
}
void Dawg::iterate_words(const UNICHARSET &unicharset,
TessCallback1<const char *> *cb) const {
WERD_CHOICE word(&unicharset);
iterate_words_rec(word, 0, cb);
}
void Dawg::iterate_words_rec(const WERD_CHOICE &word_so_far,
NODE_REF to_explore,
TessCallback1<const char *> *cb) const {
NodeChildVector children;
this->unichar_ids_of(to_explore, &children);
for (int i = 0; i < children.size(); i++) {
WERD_CHOICE next_word(word_so_far);
next_word.append_unichar_id(children[i].unichar_id, 1, 0.0, 0.0);
if (this->end_of_word(children[i].edge_ref)) {
STRING s;
next_word.string_and_lengths(&s, NULL);
cb->Run(s.string());
}
NODE_REF next = next_node(children[i].edge_ref);
if (next != 0) {
iterate_words_rec(next_word, next, cb);
}
}
}
bool Dawg::match_words(WERD_CHOICE *word, inT32 index,
NODE_REF node, UNICHAR_ID wildcard) const {
EDGE_REF edge;
......@@ -286,12 +312,12 @@ void SquishedDawg::read_squished_dawg(FILE *file,
int unicharset_size;
fread(&unicharset_size, sizeof(inT32), 1, file);
fread(&num_edges_, sizeof(inT32), 1, file);
ASSERT_HOST(num_edges_ > 0); // DAWG should not be empty
if (swap) {
unicharset_size = reverse32(unicharset_size);
num_edges_ = reverse32(num_edges_);
}
ASSERT_HOST(num_edges_ > 0); // DAWG should not be empty
Dawg::init(type, lang, perm, unicharset_size, debug_level);
edges_ = (EDGE_ARRAY) memalloc(sizeof(EDGE_RECORD) * num_edges_);
......@@ -318,13 +344,13 @@ NODE_MAP SquishedDawg::build_node_map(inT32 *num_nodes) const {
node_map = (NODE_MAP) malloc(sizeof(EDGE_REF) * num_edges_);
for (edge=0; edge < num_edges_; edge++) // init all slots
for (edge = 0; edge < num_edges_; edge++) // init all slots
node_map [edge] = -1;
node_counter = num_forward_edges(0);
*num_nodes = 0;
for (edge=0; edge < num_edges_; edge++) { // search all slots
for (edge = 0; edge < num_edges_; edge++) { // search all slots
if (forward_edge(edge)) {
(*num_nodes)++; // count nodes links
......@@ -332,6 +358,7 @@ NODE_MAP SquishedDawg::build_node_map(inT32 *num_nodes) const {
num_edges = num_forward_edges(edge);
if (edge != 0) node_counter += num_edges;
edge += num_edges;
if (edge >= num_edges_) break;
if (backward_edge(edge)) while (!last_edge(edge++));
edge--;
}
......@@ -369,7 +396,7 @@ void SquishedDawg::write_squished_dawg(FILE *file) {
tprintf("%d edges in DAWG\n", num_edges);
}
for (edge=0; edge<num_edges_; edge++) {
for (edge = 0; edge < num_edges_; edge++) {
if (forward_edge(edge)) { // write forward edges
do {
old_index = next_node_from_edge_rec(edges_[edge]);
......@@ -379,6 +406,7 @@ void SquishedDawg::write_squished_dawg(FILE *file) {
set_next_node(edge, old_index);
} while (!last_edge(edge++));
if (edge >= num_edges_) break;
if (backward_edge(edge)) // skip back links
while (!last_edge(edge++));
......
......@@ -34,6 +34,7 @@
#include "elst.h"
#include "ratngs.h"
#include "params.h"
#include "tesscallback.h"
#ifndef __GNUC__
#ifdef __MSW32__
......@@ -142,6 +143,11 @@ class Dawg {
const UNICHARSET &unicharset,
bool enable_wildcard) const;
// For each word in the Dawg, call the given (permanent) callback with the
// text (UTF-8) version of the word.
void iterate_words(const UNICHARSET &unicharset,
TessCallback1<const char *> *cb) const;
// Pure virtual function that should be implemented by the derived classes.
/// Returns the edge that corresponds to the letter out of this node.
......@@ -268,6 +274,11 @@ class Dawg {
bool match_words(WERD_CHOICE *word, inT32 index,
NODE_REF node, UNICHAR_ID wildcard) const;
// Recursively iterate over all words in a dawg (see public iterate_words).
void iterate_words_rec(const WERD_CHOICE &word_so_far,
NODE_REF to_explore,
TessCallback1<const char *> *cb) const;
// Member Variables.
DawgType type_;
STRING lang_;
......
......@@ -16,7 +16,10 @@
//
///////////////////////////////////////////////////////////////////////
#include <stdio.h>
#include "dict.h"
#include "unicodes.h"
#ifdef _MSC_VER
#pragma warning(disable:4244) // Conversion warnings
......@@ -41,6 +44,8 @@ Dict::Dict(Image* image_ptr)
getImage()->getCCUtil()->params()),
BOOL_INIT_MEMBER(load_freq_dawg, true, "Load frequent word dawg.",
getImage()->getCCUtil()->params()),
BOOL_INIT_MEMBER(load_unambig_dawg, true, "Load unambiguous word dawg.",
getImage()->getCCUtil()->params()),
BOOL_INIT_MEMBER(load_punc_dawg, true, "Load dawg with punctuation"
" patterns.", getImage()->getCCUtil()->params()),
BOOL_INIT_MEMBER(load_number_dawg, true, "Load dawg with number"
......@@ -48,6 +53,8 @@ Dict::Dict(Image* image_ptr)
BOOL_INIT_MEMBER(load_fixed_length_dawgs, true, "Load fixed length dawgs"
" (e.g. for non-space delimited languages)",
getImage()->getCCUtil()->params()),
BOOL_INIT_MEMBER(load_bigram_dawg, false, "Load dawg with special word "
"bigrams.", getImage()->getCCUtil()->params()),
double_MEMBER(segment_penalty_dict_frequent_word, 1.0,
"Score multiplier for word matches which have good case and"
"are frequent in the given language (lower is better).",
......@@ -70,6 +77,9 @@ Dict::Dict(Image* image_ptr)
"Score multiplier for poorly cased strings that are not in"
" the dictionary and generally look like garbage (lower is"
" better).", getImage()->getCCUtil()->params()),
STRING_MEMBER(output_ambig_words_file, "",
"Output file for ambiguities found in the dictionary",
getImage()->getCCUtil()->params()),
INT_MEMBER(dawg_debug_level, 0, "Set to 1 for general debug info"
", to 2 for more details, to 3 to see all the debug messages",
getImage()->getCCUtil()->params()),
......@@ -104,6 +114,12 @@ Dict::Dict(Image* image_ptr)
"Make AcceptableChoice() always return false. Useful"
" when there is a need to explore all segmentations",
getImage()->getCCUtil()->params()),
double_MEMBER(stopper_ambiguity_threshold_gain, 8.0,
"Gain factor for ambiguity threshold.",
getImage()->getCCUtil()->params()),
double_MEMBER(stopper_ambiguity_threshold_offset, 1.5,
"Certainty offset for ambiguity threshold.",
getImage()->getCCUtil()->params()),
BOOL_MEMBER(save_raw_choices, false, "Save all explored raw choices",
getImage()->getCCUtil()->params()),
INT_MEMBER(tessedit_truncate_wordchoice_log, 10,
......@@ -130,6 +146,11 @@ Dict::Dict(Image* image_ptr)
BOOL_MEMBER(segment_segcost_rating, 0,
"incorporate segmentation cost in word rating?",
getImage()->getCCUtil()->params()),
BOOL_MEMBER(segment_nonalphabetic_script, false,
"Don't use any alphabetic-specific tricks."
"Set to true in the traineddata config file for"
" scripts that are cursive or inherently fixed-pitch",
getImage()->getCCUtil()->params()),
double_MEMBER(segment_reward_script, 0.95,
"Score multipler for script consistency within a word. "
"Being a 'reward' factor, it should be <= 1. "
......@@ -144,10 +165,10 @@ Dict::Dict(Image* image_ptr)
double_MEMBER(segment_reward_chartype, 0.97,
"Score multipler for char type consistency within a word. ",
getImage()->getCCUtil()->params()),
double_MEMBER(segment_reward_ngram_best_choice, 0.99,
"Score multipler for ngram permuter's best choice"
" (only used in the Han script path).",
getImage()->getCCUtil()->params()),
double_MEMBER(segment_reward_ngram_best_choice, 0.99,
"Score multipler for ngram permuter's best choice"
" (only used in the Han script path).",
getImage()->getCCUtil()->params()),
BOOL_MEMBER(save_doc_words, 0, "Save Document Words",
getImage()->getCCUtil()->params()),
BOOL_MEMBER(doc_dict_enable, 1, "Enable Document Dictionary ",
......@@ -182,14 +203,17 @@ Dict::Dict(Image* image_ptr)
hyphen_unichar_id_ = INVALID_UNICHAR_ID;
document_words_ = NULL;
pending_words_ = NULL;
bigram_dawg_ = NULL;
freq_dawg_ = NULL;
punc_dawg_ = NULL;
max_fixed_length_dawgs_wdlen_ = -1;
wordseg_rating_adjust_factor_ = -1.0f;
output_ambig_words_file_ = NULL;
}
// Frees the stored hyphen word and closes the ambiguous-words output file
// if it was ever opened.
Dict::~Dict() {
  delete hyphen_word_;  // Deleting a NULL pointer is a no-op.
  if (output_ambig_words_file_ != NULL) {
    fclose(output_ambig_words_file_);
  }
}
void Dict::Load() {
......@@ -199,6 +223,10 @@ void Dict::Load() {
if (dawgs_.length() != 0) this->End();
hyphen_unichar_id_ = getUnicharset().unichar_to_id(kHyphenSymbol);
LoadEquivalenceList(kHyphenLikeUTF8);
LoadEquivalenceList(kApostropheLikeUTF8);
TessdataManager &tessdata_manager =
getImage()->getCCUtil()->tessdata_manager;
......@@ -219,12 +247,26 @@ void Dict::Load() {
new SquishedDawg(tessdata_manager.GetDataFilePtr(),
DAWG_TYPE_NUMBER, lang, NUMBER_PERM, dawg_debug_level);
}
if (tessdata_manager.SeekToStart(TESSDATA_FREQ_DAWG)) {
if (load_bigram_dawg && tessdata_manager.SeekToStart(TESSDATA_BIGRAM_DAWG)) {
bigram_dawg_ = new SquishedDawg(tessdata_manager.GetDataFilePtr(),
DAWG_TYPE_WORD, // doesn't actually matter.
lang,
COMPOUND_PERM, // doesn't actually matter.
dawg_debug_level);
}
if (load_freq_dawg && tessdata_manager.SeekToStart(TESSDATA_FREQ_DAWG)) {
freq_dawg_ = new SquishedDawg(tessdata_manager.GetDataFilePtr(),
DAWG_TYPE_WORD, lang, FREQ_DAWG_PERM,
dawg_debug_level);
dawgs_ += freq_dawg_;
}
if (load_unambig_dawg &&
tessdata_manager.SeekToStart(TESSDATA_UNAMBIG_DAWG)) {
unambig_dawg_ = new SquishedDawg(tessdata_manager.GetDataFilePtr(),
DAWG_TYPE_WORD, lang, SYSTEM_DAWG_PERM,
dawg_debug_level);
dawgs_ += unambig_dawg_;
}
if (((STRING &)user_words_suffix).length() > 0) {
Trie *trie_ptr = new Trie(DAWG_TYPE_WORD, lang, USER_DAWG_PERM,
......@@ -232,7 +274,8 @@ void Dict::Load() {
dawg_debug_level);
name = getImage()->getCCUtil()->language_data_path_prefix;
name += user_words_suffix;
if (!trie_ptr->read_word_list(name.string(), getUnicharset())) {
if (!trie_ptr->read_word_list(name.string(), getUnicharset(),
Trie::RRP_REVERSE_IF_HAS_RTL)) {
tprintf("Error: failed to load %s\n", name.string());
exit(1);
}
......@@ -295,6 +338,7 @@ void Dict::End() {
dawgs_.delete_data_pointers();
successors_.delete_data_pointers();
dawgs_.clear();
delete bigram_dawg_;
successors_.clear();
document_words_ = NULL;
max_fixed_length_dawgs_wdlen_ = -1;
......@@ -304,12 +348,38 @@ void Dict::End() {
}
}
// Create unicharset adaptations of known, short lists of UTF-8 equivalent
// characters (think all hyphen-like symbols). The first version of the
// list is taken as equivalent for matching against the dictionary.
void Dict::LoadEquivalenceList(const char *unichar_strings[]) {
equivalent_symbols_.push_back(GenericVectorEqEq<UNICHAR_ID>());
const UNICHARSET &unicharset = getUnicharset();
GenericVectorEqEq<UNICHAR_ID> *equiv_list = &equivalent_symbols_.back();
for (int i = 0; unichar_strings[i] != 0; i++) {
UNICHAR_ID unichar_id = unicharset.unichar_to_id(unichar_strings[i]);
if (unichar_id != INVALID_UNICHAR_ID) {
equiv_list->push_back(unichar_id);
}
}
}
// Maps unichar_id to the canonical (first) member of the equivalence group
// that contains it, e.g. any hyphen-like or apostrophe-like symbol to the
// canonical one. Ids that belong to no group are returned unchanged.
UNICHAR_ID Dict::NormalizeUnicharIdForMatch(UNICHAR_ID unichar_id) const {
  const int num_groups = equivalent_symbols_.size();
  for (int group = 0; group < num_groups; ++group) {
    if (!equivalent_symbols_[group].contains(unichar_id)) continue;
    return equivalent_symbols_[group][0];
  }
  return unichar_id;
}
// Returns true if in light of the current state unichar_id is allowed
// according to at least one of the dawgs in the dawgs_ vector.
// See more extensive comments in dict.h where this function is declared.
int Dict::def_letter_is_okay(void* void_dawg_args,
UNICHAR_ID unichar_id,
bool word_end) {
bool word_end) const {
DawgArgs *dawg_args = reinterpret_cast<DawgArgs*>(void_dawg_args);
if (dawg_debug_level >= 3) {
......@@ -484,7 +554,7 @@ int Dict::def_letter_is_okay(void* void_dawg_args,
void Dict::ProcessPatternEdges(const Dawg *dawg, const DawgInfo &info,
UNICHAR_ID unichar_id, bool word_end,
DawgArgs *dawg_args,
PermuterType *curr_perm) {
PermuterType *curr_perm) const {
NODE_REF node = GetStartingNode(dawg, info.ref);
// Try to find the edge corresponding to the exact unichar_id and to all the
// edges corresponding to the character class of unichar_id.
......@@ -572,7 +642,7 @@ void Dict::WriteFixedLengthDawgs(
// from hyphen_active_dawgs_ instead.
void Dict::init_active_dawgs(int sought_word_length,
DawgInfoVector *active_dawgs,
bool ambigs_mode) {
bool ambigs_mode) const {
int i;
if (sought_word_length != kAnyWordLength) {
// Only search one fixed word length dawg.
......@@ -604,7 +674,7 @@ void Dict::init_active_dawgs(int sought_word_length,
// If hyphenated() returns true, copy the entries from hyphen_constraints_
// into the given constraints vector.
void Dict::init_constraints(DawgInfoVector *constraints) {
void Dict::init_constraints(DawgInfoVector *constraints) const {
if (hyphenated()) {
*constraints = hyphen_constraints_;
if (dawg_debug_level >= 3) {
......@@ -670,7 +740,7 @@ void Dict::add_document_word(const WERD_CHOICE &best_choice) {
strcat(filename, ".doc");
doc_word_file = open_file (filename, "a");
fprintf(doc_word_file, "%s\n",
best_choice.debug_string(getUnicharset()).string());
best_choice.debug_string().string());
fclose(doc_word_file);
}
document_words_->add_word_to_dawg(best_choice);
......@@ -693,7 +763,7 @@ void Dict::adjust_word(WERD_CHOICE *word,
float new_rating = word->rating();
if (debug) {
tprintf("%sWord: %s %4.2f ", nonword ? "Non-" : "",
word->debug_string(getUnicharset()).string(), word->rating());
word->debug_string().string(), word->rating());
}
new_rating += kRatingPad;
if (nonword) { // non-dictionary word
......@@ -733,9 +803,9 @@ void Dict::adjust_word(WERD_CHOICE *word,
LogNewChoice(adjust_factor, certainty_array, false, word);
}
int Dict::valid_word(const WERD_CHOICE &word, bool numbers_ok) {
int Dict::valid_word(const WERD_CHOICE &word, bool numbers_ok) const {
const WERD_CHOICE *word_ptr = &word;
WERD_CHOICE temp_word;
WERD_CHOICE temp_word(word.unicharset());
if (hyphenated()) {
copy_hyphen_info(&temp_word);
temp_word += word;
......@@ -775,10 +845,40 @@ int Dict::valid_word(const WERD_CHOICE &word, bool numbers_ok) {
dawg_args.permuter : NO_PERM;
}
bool Dict::valid_bigram(const WERD_CHOICE &word1,
const WERD_CHOICE &word2) const {
if (bigram_dawg_ == NULL) return false;
// Extract the core word from the middle of each word with any digits
// replaced with question marks.
int w1start, w1end, w2start, w2end;
word1.punct_stripped(&w1start, &w1end);
word2.punct_stripped(&w2start, &w2end);
// We don't want to penalize a single guillemet, hyphen, etc.
// But our bigram list doesn't have any information about punctuation.
if (w1start >= w1end) return word1.length() < 3;
if (w2start >= w2end) return word2.length() < 3;
const UNICHARSET& uchset = getUnicharset();
STRING bigram_string;
for (int i = w1start; i < w1end; i++) {
UNICHAR_ID ch = NormalizeUnicharIdForMatch(word1.unichar_id(i));
bigram_string += uchset.get_isdigit(ch) ? "?" : uchset.id_to_unichar(ch);
}
bigram_string += " ";
for (int i = w2start; i < w2end; i++) {
UNICHAR_ID ch = NormalizeUnicharIdForMatch(word2.unichar_id(i));
bigram_string += uchset.get_isdigit(ch) ? "?" : uchset.id_to_unichar(ch);
}
WERD_CHOICE normalized_word(bigram_string.string(), uchset);
return bigram_dawg_->word_in_dawg(normalized_word);
}
bool Dict::valid_punctuation(const WERD_CHOICE &word) {
if (word.length() == 0) return NO_PERM;
int i;
WERD_CHOICE new_word;
WERD_CHOICE new_word(word.unicharset());
int last_index = word.length() - 1;
int new_len = 0;
for (i = 0; i <= last_index; ++i) {
......
......@@ -89,16 +89,17 @@ struct DawgArgs {
class Dict {
public:
// Gain factor for ambiguity threshold.
static const float kStopperAmbiguityThresholdGain;
// Certainty offset for ambiguity threshold.
static const float kStopperAmbiguityThresholdOffset;
Dict(Image* image_ptr);
~Dict();
const Image* getImage() const {
return image_ptr_;
}
Image* getImage() {
return image_ptr_;
}
const UNICHARSET& getUnicharset() const {
return getImage()->getCCUtil()->unicharset;
}
UNICHARSET& getUnicharset() {
return getImage()->getCCUtil()->unicharset;
}
......@@ -114,17 +115,17 @@ class Dict {
/* hyphen.cpp ************************************************************/
/// Returns true if we've recorded the beginning of a hyphenated word.
inline bool hyphenated() { return
inline bool hyphenated() const { return
!last_word_on_line_ && hyphen_word_ && GetMaxFixedLengthDawgIndex() < 0;
}
/// Size of the base word (the part on the line before) of a hyphenated word.
inline int hyphen_base_size() {
inline int hyphen_base_size() const {
return this->hyphenated() ? hyphen_word_->length() : 0;
}
/// If this word is hyphenated copy the base word (the part on
/// the line before) of a hyphenated word into the given word.
/// This function assumes that word is not NULL.
inline void copy_hyphen_info(WERD_CHOICE *word) {
inline void copy_hyphen_info(WERD_CHOICE *word) const {
if (this->hyphenated()) {
*word = *hyphen_word_;
if (hyphen_debug_level) word->print("copy_hyphen_info: ");
......@@ -133,19 +134,19 @@ class Dict {
/// Erase the unichar ids corresponding to the portion of the word
/// from the previous line. The word is not changed if it is not
/// split between lines and hyphenated.
inline void remove_hyphen_head(WERD_CHOICE *word) {
inline void remove_hyphen_head(WERD_CHOICE *word) const {
if (this->hyphenated()) {
word->remove_unichar_ids(0, hyphen_word_->length());
if (hyphen_debug_level) hyphen_word_->print("remove_hyphen_head: ");
}
}
/// Check whether the word has a hyphen at the end.
inline bool has_hyphen_end(UNICHAR_ID unichar_id, bool first_pos) {
inline bool has_hyphen_end(UNICHAR_ID unichar_id, bool first_pos) const {
return (last_word_on_line_ && !first_pos &&
unichar_id == hyphen_unichar_id_);
}
/// Same as above, but check the unichar at the end of the word.
inline bool has_hyphen_end(const WERD_CHOICE &word) {
inline bool has_hyphen_end(const WERD_CHOICE &word) const {
int word_index = word.length() - 1;
return has_hyphen_end(word.unichar_id(word_index), word_index == 0);
}
......@@ -171,12 +172,14 @@ class Dict {
/// from hyphen_active_dawgs_ instead.
void init_active_dawgs(int sought_word_length,
DawgInfoVector *active_dawgs,
bool ambigs_mode);
bool ambigs_mode) const;
/// If hyphenated() returns true, copy the entries from hyphen_constraints_
/// into the given constraints vector.
void init_constraints(DawgInfoVector *constraints);
void init_constraints(DawgInfoVector *constraints) const;
/// Returns true if we are operating in ambigs mode.
inline bool ambigs_mode(float rating_limit) { return rating_limit <= 0.0; }
inline bool ambigs_mode(float rating_limit) {
return rating_limit <= 0.0;
}
/// Recursively explore all the possible character combinations in
/// the given char_choices. Use go_deeper_dawg_fxn() to explore all the
/// dawgs in the dawgs_ vector in parallel and discard invalid words.
......@@ -316,6 +319,15 @@ class Dict {
bool fix_replaceable,
BLOB_CHOICE_LIST_VECTOR *Choices,
bool *modified_blobs);
// Returns the ambiguity threshold for the pair (f1, f2): the difference
// f2 - f1 scaled by stopper_ambiguity_threshold_gain, minus
// stopper_ambiguity_threshold_offset.
double StopperAmbigThreshold(double f1, double f2) {
  const double difference = f2 - f1;
  return difference * stopper_ambiguity_threshold_gain -
         stopper_ambiguity_threshold_offset;
}
// If the certainty of any chunk in Choice (item1) is not ambiguous with the
// corresponding chunk in the best choice (item2), frees Choice and
// returns true.
int FreeBadChoice(void *item1, // VIABLE_CHOICE Choice
void *item2); // EXPANDED_CHOICE *BestChoice
/// Replaces the corresponding wrong ngram in werd_choice with the correct
/// one. We indicate that this newly inserted ngram unichar is composed from
/// several fragments and modify the corresponding entries in blob_choices to
......@@ -401,7 +413,7 @@ class Dict {
/// and Certainties.
void FillViableChoice(const WERD_CHOICE &WordChoice,
FLOAT32 AdjustFactor, const float Certainties[],
bool SameString, VIABLE_CHOICE ViableChoice);
VIABLE_CHOICE ViableChoice);
/// Returns true if there are no alternative choices for the current word
/// or if all alternatives have an adjust factor worse than Threshold.
bool AlternativeChoicesWorseThan(FLOAT32 Threshold);
......@@ -467,6 +479,15 @@ class Dict {
document_words_->clear();
}
// Create unicharset adaptations of known, short lists of UTF-8 equivalent
// characters (think all hyphen-like symbols). The first version of the
// list is taken as equivalent for matching against the dictionary.
void LoadEquivalenceList(const char *unichar_strings[]);
// Normalize all hyphen and apostrophes to the canonicalized one for
// matching; pass everything else through as is. See LoadEquivalenceList().
UNICHAR_ID NormalizeUnicharIdForMatch(UNICHAR_ID unichar_id) const;
/**
* Returns the maximal permuter code (from ccstruct/ratngs.h) if in light
* of the current state the letter at word_index in the given word
......@@ -531,13 +552,13 @@ class Dict {
//
int def_letter_is_okay(void* void_dawg_args,
UNICHAR_ID unichar_id, bool word_end);
UNICHAR_ID unichar_id, bool word_end) const;
int (Dict::*letter_is_okay_)(void* void_dawg_args,
UNICHAR_ID unichar_id, bool word_end);
UNICHAR_ID unichar_id, bool word_end) const;
/// Calls letter_is_okay_ member function.
int LetterIsOkay(void* void_dawg_args,
UNICHAR_ID unichar_id, bool word_end) {
UNICHAR_ID unichar_id, bool word_end) const {
return (this->*letter_is_okay_)(void_dawg_args, unichar_id, word_end);
}
......@@ -581,6 +602,8 @@ class Dict {
inline const Dawg *GetDawg(int index) const { return dawgs_[index]; }
/// Return the pointer to the punctuation dawg.
inline const Dawg *GetPuncDawg() const { return punc_dawg_; }
/// Return the pointer to the unambiguous words dawg.
inline const Dawg *GetUnambigDawg() const { return unambig_dawg_; }
/// Return the pointer to the Dawg that contains words of length word_length.
inline const Dawg *GetFixedLengthDawg(int word_length) const {
if (word_length > max_fixed_length_dawgs_wdlen_) return NULL;
......@@ -603,7 +626,7 @@ class Dict {
/// leading punctuation is found this would ensure that we are not
/// expecting any particular trailing punctuation after the word).
inline bool ConstraintsOk(const DawgInfoVector &constraints,
int word_end, DawgType current_dawg_type) {
int word_end, DawgType current_dawg_type) const {
if (!word_end) return true;
if (current_dawg_type == DAWG_TYPE_PUNCTUATION) return true;
for (int c = 0; c < constraints.length(); ++c) {
......@@ -627,7 +650,8 @@ class Dict {
/// edges were found.
void ProcessPatternEdges(const Dawg *dawg, const DawgInfo &info,
UNICHAR_ID unichar_id, bool word_end,
DawgArgs *dawg_args, PermuterType *current_permuter);
DawgArgs *dawg_args,
PermuterType *current_permuter) const;
/// Read/Write/Access special purpose dawgs which contain words
/// only of a certain length (used for phrase search for
......@@ -649,23 +673,25 @@ class Dict {
int num_dawgs, int debug_level, FILE *output_file);
/// Check all the DAWGs to see if this word is in any of them.
inline bool valid_word_permuter(uinT8 perm, bool numbers_ok) {
inline static bool valid_word_permuter(uinT8 perm, bool numbers_ok) {
return (perm == SYSTEM_DAWG_PERM || perm == FREQ_DAWG_PERM ||
perm == DOC_DAWG_PERM || perm == USER_DAWG_PERM ||
perm == USER_PATTERN_PERM || (numbers_ok && perm == NUMBER_PERM));
}
int valid_word(const WERD_CHOICE &word, bool numbers_ok);
int valid_word(const WERD_CHOICE &word) {
int valid_word(const WERD_CHOICE &word, bool numbers_ok) const;
int valid_word(const WERD_CHOICE &word) const {
return valid_word(word, false); // return NO_PERM for words with digits
}
int valid_word_or_number(const WERD_CHOICE &word) {
int valid_word_or_number(const WERD_CHOICE &word) const {
return valid_word(word, true); // return NUMBER_PERM for valid numbers
}
/// This function is used by api/tesseract_cube_combiner.cpp
int valid_word(const char *string) {
int valid_word(const char *string) const {
WERD_CHOICE word(string, getUnicharset());
return valid_word(word);
}
// Do the two WERD_CHOICEs form a meaningful bigram?
bool valid_bigram(const WERD_CHOICE &word1, const WERD_CHOICE &word2) const;
/// Returns true if the word contains a valid punctuation pattern.
/// Note: Since the domains of punctuation symbols and symbols
/// used in numbers are not disjoint, a valid number might contain
......@@ -691,6 +717,8 @@ class Dict {
inline void SetWordsegRatingAdjustFactor(float f) {
wordseg_rating_adjust_factor_ = f;
}
// Accessor for best_choices_.
const LIST &getBestChoices() { return best_choices_; }
private:
/** Private member variables. */
......@@ -723,15 +751,27 @@ class Dict {
DawgInfoVector hyphen_active_dawgs_;
DawgInfoVector hyphen_constraints_;
bool last_word_on_line_;
// List of lists of "equivalent" UNICHAR_IDs for the purposes of dictionary
// matching. The first member of each list is taken as canonical. For
// example, the first list contains hyphens and dashes with the first symbol
// being the ASCII hyphen minus.
GenericVector<GenericVectorEqEq<UNICHAR_ID> > equivalent_symbols_;
// Dawgs.
DawgVector dawgs_;
SuccessorListsVector successors_;
Trie *pending_words_;
// bigram_dawg_ points to a dawg of two-word bigrams which always supersede if
// any of them are present on the best choices list for a word pair.
// the bigrams are stored as space-separated words where:
// (1) leading and trailing punctuation has been removed from each word and
// (2) any digits have been replaced with '?' marks.
Dawg *bigram_dawg_;
/// The following pointers are only cached for convenience.
/// The dawgs will be deleted when dawgs_ vector is destroyed.
// TODO(daria): need to support multiple languages in the future,
// so maybe will need to maintain a list of dawgs of each kind.
Dawg *freq_dawg_;
Dawg *unambig_dawg_;
Dawg *punc_dawg_;
Trie *document_words_;
/// Maximum word length of fixed-length word dawgs.
......@@ -740,6 +780,8 @@ class Dict {
/// Current segmentation cost adjust factor for word rating.
/// See comments in incorporate_segcost.
float wordseg_rating_adjust_factor_;
// File for recording ambiguities discovered during dictionary search.
FILE *output_ambig_words_file_;
public:
/// Variable members.
......@@ -750,11 +792,14 @@ class Dict {
"A list of user-provided patterns.");
BOOL_VAR_H(load_system_dawg, true, "Load system word dawg.");
BOOL_VAR_H(load_freq_dawg, true, "Load frequent word dawg.");
BOOL_VAR_H(load_unambig_dawg, true, "Load unambiguous word dawg.");
BOOL_VAR_H(load_punc_dawg, true,
"Load dawg with punctuation patterns.");
BOOL_VAR_H(load_number_dawg, true, "Load dawg with number patterns.");
BOOL_VAR_H(load_fixed_length_dawgs, true, "Load fixed length"
" dawgs (e.g. for non-space delimited languages)");
BOOL_VAR_H(load_bigram_dawg, false,
"Load dawg with special word bigrams.");
double_VAR_H(segment_penalty_dict_frequent_word, 1.0,
"Score multiplier for word matches which have good case and"
"are frequent in the given language (lower is better).");
......@@ -779,6 +824,8 @@ class Dict {
"Score multiplier for poorly cased strings that are not in"
" the dictionary and generally look like garbage (lower is"
" better).");
STRING_VAR_H(output_ambig_words_file, "",
"Output file for ambiguities found in the dictionary");
INT_VAR_H(dawg_debug_level, 0, "Set to 1 for general debug info"
", to 2 for more details, to 3 to see all the debug messages");
INT_VAR_H(hyphen_debug_level, 0, "Debug level for hyphenated words.");
......@@ -801,6 +848,10 @@ class Dict {
BOOL_VAR_H(stopper_no_acceptable_choices, false,
"Make AcceptableChoice() always return false. Useful"
" when there is a need to explore all segmentations");
double_VAR_H(stopper_ambiguity_threshold_gain, 8.0,
"Gain factor for ambiguity threshold.");
double_VAR_H(stopper_ambiguity_threshold_offset, 1.5,
"Certainty offset for ambiguity threshold.");
BOOL_VAR_H(save_raw_choices, false, "Save all explored raw choices");
INT_VAR_H(tessedit_truncate_wordchoice_log, 10, "Max words to keep in list");
STRING_VAR_H(word_to_debug, "", "Word for which stopper debug information"
......@@ -816,6 +867,10 @@ class Dict {
"Turn on word script consistency permuter");
BOOL_VAR_H(segment_segcost_rating, 0,
"incorporate segmentation cost in word rating?");
BOOL_VAR_H(segment_nonalphabetic_script, false,
"Don't use any alphabetic-specific tricks."
"Set to true in the traineddata config file for"
" scripts that are cursive or inherently fixed-pitch");
double_VAR_H(segment_reward_script, 0.95,
"Score multipler for script consistency within a word. "
"Being a 'reward' factor, it should be <= 1. "
......
......@@ -51,7 +51,7 @@ void Dict::set_hyphen_word(const WERD_CHOICE &word,
const DawgInfoVector &active_dawgs,
const DawgInfoVector &constraints) {
if (hyphen_word_ == NULL) {
hyphen_word_ = new WERD_CHOICE();
hyphen_word_ = new WERD_CHOICE(word.unicharset());
hyphen_word_->make_bad();
}
if (hyphen_word_->rating() > word.rating()) {
......
......@@ -28,7 +28,7 @@
/* define the maximum number of classes defined for any matcher
and the maximum class id for any matcher. This must be changed
if more different classes need to be classified */
#define MAX_NUM_CLASSES 8192
#define MAX_NUM_CLASSES 12288
#define MAX_CLASS_ID (MAX_NUM_CLASSES - 1)
/** a CLASS_ID is the ascii character to be associated with a class */
......
......@@ -86,7 +86,7 @@ void Dict::go_deeper_dawg_fxn(
if (permute_debug && dawg_debug_level) {
tprintf("early pruned word rating=%4.2f,"
" permdawg_limit=%4.2f, word=%s\n", word->rating(),
permdawg_limit, word->debug_string(getUnicharset()).string());
permdawg_limit, word->debug_string().string());
}
return;
}
......@@ -106,8 +106,7 @@ void Dict::go_deeper_dawg_fxn(
}
if (clean_active_dawgs.size() > 0) {
if (permute_debug && dawg_debug_level)
tprintf("new hyphen choice = %s\n",
word->debug_string(getUnicharset()).string());
tprintf("new hyphen choice = %s\n", word->debug_string().string());
word->set_permuter(more_args->permuter);
adjust_word(word, certainties, permute_debug);
set_hyphen_word(*word, *(more_args->active_dawgs),
......@@ -190,11 +189,26 @@ void Dict::go_deeper_dawg_fxn(
// Add a new word choice
if (word_ending) {
if (permute_debug && dawg_debug_level) {
tprintf("found word = %s\n",
word->debug_string(getUnicharset()).string());
tprintf("found word = %s\n", word->debug_string().string());
}
// In ambigs mode, when an output file name has been configured, record every
// dictionary word found as a space-terminated entry in that file.
if (ambigs_mode(*limit) &&
    strcmp(output_ambig_words_file.string(), "") != 0) {
  // The file is opened lazily on the first word; failing to open it is
  // treated as fatal.
  if (output_ambig_words_file_ == NULL) {
    output_ambig_words_file_ =
        fopen(output_ambig_words_file.string(), "w+");
    if (output_ambig_words_file_ == NULL) {
      tprintf("Failed to open output_ambig_words_file %s\n",
              output_ambig_words_file.string());
      exit(1);
    }
  }
  STRING word_str;
  word->string_and_lengths(&word_str, NULL);
  word_str += " ";
  // Write the word as plain data. It must never be used as a printf format
  // string: a '%' in the recognized text would be read as a conversion
  // specifier and lead to undefined behaviour.
  fputs(word_str.string(), output_ambig_words_file_);
}
WERD_CHOICE *adjusted_word = word;
WERD_CHOICE hyphen_tail_word;
WERD_CHOICE hyphen_tail_word(&getUnicharset());
if (hyphen_base_size() > 0) {
hyphen_tail_word = *word;
remove_hyphen_head(&hyphen_tail_word);
......@@ -226,7 +240,7 @@ void Dict::go_deeper_dawg_fxn(
} else {
if (permute_debug && dawg_debug_level) {
tprintf("last unichar not OK at index %d in %s\n",
word_index, word->debug_string(getUnicharset()).string());
word_index, word->debug_string().string());
}
}
}
......@@ -249,7 +263,7 @@ void Dict::go_deeper_dawg_fxn(
WERD_CHOICE *Dict::dawg_permute_and_select(
const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit,
int sought_word_length, int start_char_choice_index) {
WERD_CHOICE *best_choice = new WERD_CHOICE();
WERD_CHOICE *best_choice = new WERD_CHOICE(&getUnicharset());
best_choice->make_bad();
best_choice->set_rating(rating_limit);
if (char_choices.length() == 0) return best_choice;
......@@ -272,7 +286,7 @@ WERD_CHOICE *Dict::dawg_permute_and_select(
(segment_penalty_dict_case_bad /
segment_penalty_dict_case_ok),
NO_PERM, sought_word_length, end_char_choice_index);
WERD_CHOICE word(MAX_WERD_LENGTH);
WERD_CHOICE word(&getUnicharset(), MAX_WERD_LENGTH);
copy_hyphen_info(&word);
// Discard rating and certainty of the hyphen base (if any).
word.set_rating(0.0);
......
......@@ -126,12 +126,13 @@ int find_choice_by_uid(BLOB_CHOICE_LIST *blob_list, UNICHAR_ID target_uid) {
* 1st choice of char 3, 2nd choice of char 4, 3rd choice of char 5, 2nd choice
* of char 6. If n > number of choice, the closest (last) one is used.
*/
WERD_CHOICE* get_choice_from_posstr(const BLOB_CHOICE_LIST_VECTOR &char_choices,
WERD_CHOICE* get_choice_from_posstr(const UNICHARSET *unicharset,
const BLOB_CHOICE_LIST_VECTOR &char_choices,
int start_pos,
const char* pos_str,
float *certainties) {
int pos_str_len = strlen(pos_str);
WERD_CHOICE* wchoice = new WERD_CHOICE();
WERD_CHOICE* wchoice = new WERD_CHOICE(unicharset);
if (start_pos + pos_str_len > char_choices.length()) {
wchoice->make_bad();
return wchoice;
......@@ -228,6 +229,7 @@ BLOB_CHOICE* find_choice_by_script(
PermuterState::PermuterState() {
unicharset_ = NULL;
char_choices_ = NULL;
adjust_factor_ = 1.0f;
allow_collision_ = false;
......@@ -240,6 +242,7 @@ void PermuterState::Init(const BLOB_CHOICE_LIST_VECTOR& char_choices,
float default_bias,
bool debug) {
ASSERT_HOST(char_choices.length() < MAX_PERM_LENGTH);
unicharset_ = &unicharset;
char_choices_ = &char_choices;
word_length_ = char_choices.length();
for (int i = 0; i < word_length_; ++i)
......@@ -300,9 +303,8 @@ void PermuterState::AddPreference(int char_pos, BLOB_CHOICE* blob_choice,
WERD_CHOICE* PermuterState::GetPermutedWord(float *certainties,
float *adjust_factor) {
ASSERT_HOST(char_choices_ != NULL);
WERD_CHOICE *word_choice = get_choice_from_posstr(*char_choices_,
0, perm_state_,
certainties);
WERD_CHOICE *word_choice = get_choice_from_posstr(
unicharset_, *char_choices_, 0, perm_state_, certainties);
float rating = word_choice->rating() * adjust_factor_;
word_choice->set_rating(rating);
*adjust_factor = adjust_factor_;
......@@ -431,7 +433,8 @@ WERD_CHOICE* Dict::permute_fixed_length_words(
if (permute_debug)
print_char_choices_list("\n\nPermute FixedLength Word",
char_choices, getUnicharset(), false);
WERD_CHOICE* best_choice = new WERD_CHOICE(char_choices.length());
WERD_CHOICE* best_choice =
new WERD_CHOICE(&getUnicharset(), char_choices.length());
const int max_dict_len = max_fixed_length_dawgs_wdlen_;
const int min_dict_len = 2;
char posstr[256];
......@@ -461,7 +464,7 @@ WERD_CHOICE* Dict::permute_fixed_length_words(
}
if (part_choice && step > 1) { // found lexicon match
part_choice->populate_unichars(getUnicharset());
part_choice->populate_unichars();
get_posstr_from_choice(char_choices, part_choice, anchor_pos, posstr);
float adjust_factor = pow(0.95, 1.0 + step*2.0/char_choices.length());
if (permuter_state)
......@@ -472,8 +475,8 @@ WERD_CHOICE* Dict::permute_fixed_length_words(
part_choice->unichar_string().string());
} else { // no lexicon match
step = 1;
part_choice =
get_choice_from_posstr(char_choices, anchor_pos, "0", NULL);
part_choice = get_choice_from_posstr(&getUnicharset(), char_choices,
anchor_pos, "0", NULL);
if (permute_debug)
tprintf("Single char %d %s\n", anchor_pos,
part_choice->unichar_string().string());
......@@ -493,7 +496,7 @@ WERD_CHOICE* Dict::permute_fixed_length_words(
best_choice->rating(), match_score, adjusted_score);
best_choice->set_rating(adjusted_score);
}
best_choice->populate_unichars(getUnicharset());
best_choice->populate_unichars();
if (permute_debug)
tprintf("Found Best CJK word %f: %s\n",
best_choice->rating(), best_choice->unichar_string().string());
......@@ -554,11 +557,12 @@ WERD_CHOICE* Dict::permute_chartype_words(
print_char_choices_list("", char_choices, getUnicharset(), true);
}
WERD_CHOICE *current_word = new WERD_CHOICE();
WERD_CHOICE *current_word = new WERD_CHOICE(&getUnicharset());
BLOB_CHOICE_IT blob_choice_it;
const UNICHARSET& unicharset = getUnicharset();
bool replaced = false; // has any character choice been replaced
int prev_unambig_type = 0; // the last chartype of an unambiguous char
float certainties[MAX_PERM_LENGTH + 1];
for (int x = 0; x < char_choices.length(); ++x) {
BLOB_CHOICE_LIST* pos_choice = char_choices.get(x);
UNICHAR_ID unichar_id = get_top_choice_uid(pos_choice);
......@@ -640,12 +644,12 @@ WERD_CHOICE* Dict::permute_chartype_words(
current_word->append_unichar_id(first_choice->unichar_id(), 1,
first_choice->rating(),
first_choice->certainty());
certainties[x] = first_choice->certainty();
}
// All permuter choices should go through adjust_non_word so the choice
// rating would be adjusted on the same scale.
float certainties[MAX_PERM_LENGTH + 1];
adjust_non_word(current_word, certainties, permute_debug);
current_word->populate_unichars(unicharset);
current_word->populate_unichars();
if (replaced) {
// Apply a reward multiplier on rating if an chartype permutation is made.
float rating = current_word->rating();
......@@ -682,10 +686,11 @@ WERD_CHOICE* Dict::permute_script_words(
permute_debug > 1);
}
WERD_CHOICE *current_word = new WERD_CHOICE();
WERD_CHOICE *current_word = new WERD_CHOICE(&getUnicharset());
BLOB_CHOICE_IT blob_choice_it;
bool replaced = false;
bool prev_is_consistent = false;
float certainties[MAX_PERM_LENGTH + 1];
for (int x = 0; x < char_choices.length(); ++x) {
blob_choice_it.set_to_list(char_choices.get(x));
BLOB_CHOICE *first_choice = blob_choice_it.data();
......@@ -737,13 +742,13 @@ WERD_CHOICE* Dict::permute_script_words(
current_word->append_unichar_id(first_choice->unichar_id(), 1,
first_choice->rating(),
first_choice->certainty());
certainties[x] = first_choice->certainty();
prev_is_consistent = sid_consistent;
}
// All permuter choices should go through adjust_non_word so the choice
// rating would be adjusted on the same scale.
float certainties[MAX_PERM_LENGTH + 1];
adjust_non_word(current_word, certainties, permute_debug);
current_word->populate_unichars(getUnicharset());
current_word->populate_unichars();
if (replaced) {
// Apply a reward multiplier on rating if an script permutation is made.
float rating = current_word->rating();
......@@ -780,19 +785,19 @@ bool Dict::permute_characters(const BLOB_CHOICE_LIST_VECTOR &char_choices,
// Populate unichars_ and unichar_lengths_ of raw_choice. This is
// needed for various components that still work with unichars rather
// than unichar ids (e.g. LearnWord).
raw_choice->populate_unichars(getUnicharset());
raw_choice->populate_unichars();
}
if (this_choice && this_choice->rating() < best_choice->rating()) {
*best_choice = *this_choice;
// Populate unichars_ and unichar_lengths_ of best_choice. This is
// needed for various components that still work with unichars rather
// than unichar ids (dawg, *_ok functions, various hard-coded hacks).
best_choice->populate_unichars(getUnicharset());
best_choice->populate_unichars();
if (permute_debug) {
best_choice->print("\n**** Populate BestChoice");
cprintf("populate best_choice\n\t%s\n",
best_choice->debug_string(getUnicharset()).string());
best_choice->debug_string().string());
}
delete this_choice;
return true;
......@@ -811,13 +816,13 @@ WERD_CHOICE *Dict::permute_compound_words(
float rating_limit) {
BLOB_CHOICE *first_choice;
WERD_CHOICE *best_choice = NULL;
WERD_CHOICE current_word(MAX_WERD_LENGTH);
WERD_CHOICE current_word(&getUnicharset(), MAX_WERD_LENGTH);
int first_index = 0;
int x;
BLOB_CHOICE_IT blob_choice_it;
if (char_choices.length() > MAX_WERD_LENGTH) {
WERD_CHOICE *bad_word_choice = new WERD_CHOICE();
WERD_CHOICE *bad_word_choice = new WERD_CHOICE(&getUnicharset());
bad_word_choice->make_bad();
return bad_word_choice;
}
......@@ -874,7 +879,7 @@ void Dict::permute_subword(const BLOB_CHOICE_LIST_VECTOR &char_choices,
int x;
BLOB_CHOICE_LIST_VECTOR subchoices;
WERD_CHOICE *best_choice = NULL;
WERD_CHOICE raw_choice;
WERD_CHOICE raw_choice(&getUnicharset());
raw_choice.make_bad();
DisableChoiceAccum();
......@@ -886,7 +891,7 @@ void Dict::permute_subword(const BLOB_CHOICE_LIST_VECTOR &char_choices,
}
if (!subchoices.empty()) {
WERD_CHOICE initial_choice;
WERD_CHOICE initial_choice(&getUnicharset());
initial_choice.make_bad();
initial_choice.set_rating(rating_limit);
......@@ -906,10 +911,10 @@ void Dict::permute_subword(const BLOB_CHOICE_LIST_VECTOR &char_choices,
if (segment_debug && current_word->rating() < MAX_FLOAT32) {
cprintf ("Subword permuted = %s, %5.2f, %5.2f\n\n",
current_word->debug_string(getUnicharset()).string(),
current_word->debug_string().string(),
current_word->rating(), current_word->certainty());
}
current_word->populate_unichars(getUnicharset());
current_word->populate_unichars();
EnableChoiceAccum();
}
......@@ -919,7 +924,7 @@ void Dict::permute_subword(const BLOB_CHOICE_LIST_VECTOR &char_choices,
*/
WERD_CHOICE *Dict::get_top_choice_word(
const BLOB_CHOICE_LIST_VECTOR &char_choices) {
WERD_CHOICE *top_word = new WERD_CHOICE(MAX_PERM_LENGTH);
WERD_CHOICE *top_word = new WERD_CHOICE(&getUnicharset(), MAX_PERM_LENGTH);
float certainties[MAX_PERM_LENGTH];
top_word->set_permuter(TOP_CHOICE_PERM);
for (int x = 0; x < char_choices.length(); x++) {
......@@ -956,11 +961,11 @@ WERD_CHOICE *Dict::permute_top_choice(
const char *next_char = ""; //next in word
const char *next_next_char = ""; //after next next in word
WERD_CHOICE word(MAX_PERM_LENGTH);
WERD_CHOICE word(&getUnicharset(), MAX_PERM_LENGTH);
word.set_permuter(TOP_CHOICE_PERM);
WERD_CHOICE capital_word(MAX_PERM_LENGTH);
WERD_CHOICE capital_word(&getUnicharset(), MAX_PERM_LENGTH);
capital_word.set_permuter(UPPER_CASE_PERM);
WERD_CHOICE lower_word(MAX_PERM_LENGTH);
WERD_CHOICE lower_word(&getUnicharset(), MAX_PERM_LENGTH);
lower_word.set_permuter(LOWER_CASE_PERM);
int x;
......@@ -1023,7 +1028,7 @@ WERD_CHOICE *Dict::permute_top_choice(
if (first_choice == NULL) {
cprintf("Permuter found only fragments for"
" character at position %d; word=%s\n",
x, word.debug_string(getUnicharset()).string());
x, word.debug_string().string());
}
ASSERT_HOST(first_choice != NULL);
......@@ -1132,7 +1137,7 @@ WERD_CHOICE *Dict::permute_top_choice(
}
}
if (word.rating() < raw_choice->rating()) {
if (raw_choice != NULL && word.rating() < raw_choice->rating()) {
*raw_choice = word;
LogNewChoice(1.0, certainties, true, raw_choice);
}
......@@ -1423,9 +1428,9 @@ WERD_CHOICE *Dict::top_fragments_permute_and_select(
frag_char_choices += frag_choices;
}
WERD_CHOICE *best_choice = new WERD_CHOICE();
WERD_CHOICE *best_choice = new WERD_CHOICE(&getUnicharset());
best_choice->make_bad();
WERD_CHOICE word(MAX_PERM_LENGTH);
WERD_CHOICE word(&getUnicharset(), MAX_PERM_LENGTH);
word.set_permuter(TOP_CHOICE_PERM);
float certainties[MAX_PERM_LENGTH];
this->go_deeper_fxn_ = &tesseract::Dict::go_deeper_top_fragments_fxn;
......@@ -1459,7 +1464,7 @@ void Dict::permute_choices(
tprintf("%s permute_choices: char_choice_index=%d"
" limit=%g rating=%g, certainty=%g word=%s\n",
debug, char_choice_index, *limit, word->rating(),
word->certainty(), word->debug_string(getUnicharset()).string());
word->certainty(), word->debug_string().string());
}
if (char_choice_index < char_choices.length()) {
BLOB_CHOICE_IT blob_choice_it;
......@@ -1554,7 +1559,7 @@ void Dict::go_deeper_top_fragments_fxn(
if (word_ending) {
if (fragments_debug > 1) {
tprintf("fragments_debug new choice = %s\n",
word->debug_string(getUnicharset()).string());
word->debug_string().string());
}
*limit = word->rating();
adjust_non_word(word, certainties, permute_debug);
......@@ -1567,8 +1572,7 @@ void Dict::go_deeper_top_fragments_fxn(
} else {
if (fragments_debug > 1) {
tprintf("fragments_debug pruned word (%s, rating=%4.2f, limit=%4.2f)\n",
word->debug_string(getUnicharset()).string(),
word->rating(), *limit);
word->debug_string().string(), word->rating(), *limit);
}
}
}
......
......@@ -133,6 +133,8 @@ class PermuterState {
private:
static const char kPosFree = '.';
const UNICHARSET *unicharset_;
const BLOB_CHOICE_LIST_VECTOR *char_choices_; // reference pointer only
// does not need to be allocated or freed
char perm_state_[MAX_PERM_LENGTH]; // handles upto MAX_PERM_LENGTH-1 states
......
......@@ -241,6 +241,19 @@ void print_state(const char *label, STATE *state, int num_joints) {
new_line();
}
// Appends to *toappend the piece counts that bin_to_pieces() produces for
// the given state and num_joints, as decimal integers separated by single
// spaces. Output stops at the first entry that is not greater than zero.
void print_state(STATE *state, int num_joints, STRING *toappend) {
  PIECES_STATE widths;
  bin_to_pieces(state, num_joints, widths);
  // The first number is written with an empty prefix; every later one is
  // preceded by a single space.
  const char *separator = "";
  for (int idx = 0; widths[idx] > 0; ++idx) {
    toappend->add_str_int(separator, widths[idx]);
    separator = " ";
  }
}
/**
* set_n_ones
......
......@@ -29,6 +29,7 @@
I n c l u d e s
----------------------------------------------------------------------*/
#include "host.h"
#include "strngs.h"
/*----------------------------------------------------------------------
T y p e s
......@@ -64,6 +65,8 @@ int ones_in_state(STATE *state, int num_joints);
void print_state(const char *label, STATE *state, int num_joints);
void print_state(STATE *state, int num_joints, STRING *toappend);
void set_n_ones(STATE *state, int n);
extern void free_state(STATE *);
......
......@@ -17,13 +17,11 @@
******************************************************************************/
#include "stopper.h"
#include "emalloc.h"
#include "matchdefs.h"
#include "callcpp.h"
#include "permute.h"
#include "danerror.h"
#include "const.h"
#include "freelist.h"
#include "efio.h"
#include "scanutils.h"
#include "unichar.h"
......@@ -58,6 +56,10 @@ typedef struct
UNICHAR_ID ChunkClass[MAX_NUM_CHUNKS];
} EXPANDED_CHOICE;
void DeleteViableChoiceStruct(void *vcs) {
delete (static_cast<VIABLE_CHOICE_STRUCT *>(vcs));
}
#define BestCertainty(Choices) \
(((VIABLE_CHOICE) first_node (Choices))->Certainty)
......@@ -66,10 +68,6 @@ typedef struct
#define BestFactor(Choices) \
(((VIABLE_CHOICE) first_node (Choices))->AdjustFactor)
#define AmbigThreshold(F1,F2) \
(((F2) - (F1)) * tesseract::Dict::kStopperAmbiguityThresholdGain - \
tesseract::Dict::kStopperAmbiguityThresholdOffset)
/**----------------------------------------------------------------------------
Private Code
----------------------------------------------------------------------------**/
......@@ -100,23 +98,72 @@ static void ExpandChoice(VIABLE_CHOICE Choice,
}
}
// Constructs a choice with room for `length` characters: allocates the
// per-character Blob array and the parallel segmentation_state array.
// The scalar fields are given defined placeholder values so that reading
// them before Init() is called is not undefined behavior; Init() supplies
// the real rating, certainty and adjust factor.
VIABLE_CHOICE_STRUCT::VIABLE_CHOICE_STRUCT(int length)
    : Length(length),
      Rating(0.0f),
      Certainty(0.0f),
      AdjustFactor(0.0f),
      ComposedFromCharFragments(false),
      Blob(new CHAR_CHOICE[length]),
      segmentation_state(new uinT8[length]) {
}
// Constructs an empty choice: zero length and no allocated arrays.
// All scalar fields are explicitly initialized so a default-constructed
// object never exposes indeterminate values.
VIABLE_CHOICE_STRUCT::VIABLE_CHOICE_STRUCT()
    : Length(0),
      Rating(0.0f),
      Certainty(0.0f),
      AdjustFactor(0.0f),
      ComposedFromCharFragments(false),
      Blob(NULL),
      segmentation_state(NULL) {
}
// Releases the two arrays owned by this choice. Both pointers may be NULL
// (default-constructed object); delete[] on NULL is a no-op.
VIABLE_CHOICE_STRUCT::~VIABLE_CHOICE_STRUCT() {
  delete[] segmentation_state;
  delete[] Blob;
}
void VIABLE_CHOICE_STRUCT::Init(
const WERD_CHOICE &word_choice,
const PIECES_STATE &pieces_state,
const float certainties[],
FLOAT32 adjust_factor) {
this->Rating = word_choice.rating();
this->Certainty = word_choice.certainty();
this->AdjustFactor = adjust_factor;
this->ComposedFromCharFragments = false;
ASSERT_HOST(this->Length == word_choice.length());
for (int i = 0, bw_idx = 0; i < word_choice.length(); i++, bw_idx++) {
int blob_width = pieces_state[bw_idx];
CHAR_CHOICE *blob_choice = &this->Blob[i];
blob_choice->Class = word_choice.unichar_id(i);
blob_choice->NumChunks = blob_width;
blob_choice->Certainty = certainties[i];
for (int f = 1; f < word_choice.fragment_length(i); ++f) {
blob_width = pieces_state[++bw_idx];
assert(blob_width > 0);
blob_choice->NumChunks += blob_width;
this->ComposedFromCharFragments = true;
}
this->segmentation_state[i] = blob_choice->NumChunks;
}
}
namespace tesseract {
// If the certainty of any chunk in Choice (item1) is not ambiguous with the
// corresponding chunk in the best choice (item2), frees Choice and
// returns true.
static int FreeBadChoice(void *item1, // VIABLE_CHOICE Choice,
void *item2) { // EXPANDED_CHOICE *BestChoice
int Dict::FreeBadChoice(
void *item1, // VIABLE_CHOICE Choice,
void *item2) { // EXPANDED_CHOICE *BestChoice
int i, j, Chunk;
FLOAT32 Threshold;
VIABLE_CHOICE Choice = reinterpret_cast<VIABLE_CHOICE>(item1);
EXPANDED_CHOICE *BestChoice = reinterpret_cast<EXPANDED_CHOICE *>(item2);
Threshold = AmbigThreshold(BestChoice->Choice->AdjustFactor,
Choice->AdjustFactor);
Threshold = StopperAmbigThreshold(BestChoice->Choice->AdjustFactor,
Choice->AdjustFactor);
for (i = 0, Chunk = 0; i < Choice->Length; i++) {
for (j = 0; j < Choice->Blob[i].NumChunks; j++, Chunk++){
for (j = 0; j < Choice->Blob[i].NumChunks; j++, Chunk++) {
if (Choice->Blob[i].Class != BestChoice->ChunkClass[Chunk] &&
Choice->Blob[i].Certainty - BestChoice->ChunkCertainty[Chunk] <
Threshold) {
memfree(Choice);
if (stopper_debug_level >= 2)
PrintViableChoice(stderr, "\nDiscarding bad choice: ", Choice);
delete Choice;
return true;
}
}
......@@ -124,11 +171,6 @@ static int FreeBadChoice(void *item1, // VIABLE_CHOICE Choice,
return false;
}
namespace tesseract {
const float Dict::kStopperAmbiguityThresholdGain = 8.0;
const float Dict::kStopperAmbiguityThresholdOffset = 1.5;
bool Dict::AcceptableChoice(BLOB_CHOICE_LIST_VECTOR *Choices,
WERD_CHOICE *BestChoice,
DANGERR *fixpt,
......@@ -158,7 +200,7 @@ bool Dict::AcceptableChoice(BLOB_CHOICE_LIST_VECTOR *Choices,
if (stopper_debug_level >= 1)
tprintf("\nStopper: %s (word=%c, case=%c)\n",
BestChoice->debug_string(getUnicharset()).string(),
BestChoice->debug_string().string(),
(is_valid_word ? 'y' : 'n'),
(is_case_ok ? 'y' : 'n'));
......@@ -198,7 +240,7 @@ bool Dict::AcceptableResult(const WERD_CHOICE &BestChoice) {
if (stopper_debug_level >= 1) {
tprintf("\nRejecter: %s (word=%c, case=%c, unambig=%c)\n",
BestChoice.debug_string(getUnicharset()).string(),
BestChoice.debug_string().string(),
(valid_word(BestChoice) ? 'y' : 'n'),
(case_ok(BestChoice, getUnicharset()) ? 'y' : 'n'),
((list_rest (best_choices_) != NIL_LIST) ? 'n' : 'y'));
......@@ -320,10 +362,16 @@ void Dict::FilterWordChoices() {
return;
// Compute certainties and class for each chunk in best choice.
ExpandChoice((VIABLE_CHOICE_STRUCT *)first_node(best_choices_), &BestChoice);
set_rest (best_choices_, delete_d(list_rest (best_choices_),
&BestChoice, FreeBadChoice));
VIABLE_CHOICE_STRUCT *best_choice =
(VIABLE_CHOICE_STRUCT *)first_node(best_choices_);
ExpandChoice(best_choice, &BestChoice);
if (stopper_debug_level >= 2)
PrintViableChoice(stderr, "\nFiltering against best choice: ", best_choice);
TessResultCallback2<int, void*, void*>* is_bad =
NewPermanentTessCallback(this, &Dict::FreeBadChoice);
set_rest(best_choices_, delete_d(list_rest(best_choices_),
&BestChoice, is_bad));
delete is_bad;
}
void Dict::FindClassifierErrors(FLOAT32 MinRating,
......@@ -371,15 +419,15 @@ void Dict::InitChoiceAccum() {
BLOB_WIDTH *BlobWidth, *End;
if (best_raw_choice_)
memfree(best_raw_choice_);
delete best_raw_choice_;
best_raw_choice_ = NULL;
if (best_choices_)
destroy_nodes(best_choices_, memfree);
destroy_nodes(best_choices_, DeleteViableChoiceStruct);
best_choices_ = NIL_LIST;
if (raw_choices_)
destroy_nodes(raw_choices_, memfree);
destroy_nodes(raw_choices_, DeleteViableChoiceStruct);
raw_choices_ = NIL_LIST;
EnableChoiceAccum();
......@@ -391,7 +439,7 @@ void Dict::InitChoiceAccum() {
}
void Dict::ClearBestChoiceAccum() {
if (best_choices_) destroy_nodes(best_choices_, memfree);
if (best_choices_) destroy_nodes(best_choices_, DeleteViableChoiceStruct);
best_choices_ = NIL_LIST;
}
......@@ -420,7 +468,6 @@ void Dict::LogNewChoice(FLOAT32 AdjustFactor,
const float Certainties[],
bool raw_choice,
WERD_CHOICE *WordChoice) {
VIABLE_CHOICE NewChoice;
LIST ChoicesList;
LIST Choices;
FLOAT32 Threshold;
......@@ -429,14 +476,15 @@ void Dict::LogNewChoice(FLOAT32 AdjustFactor,
return;
if (raw_choice) {
if (!best_raw_choice_)
best_raw_choice_ = NewViableChoice(*WordChoice, AdjustFactor, Certainties);
else if (WordChoice->rating() < best_raw_choice_->Rating) {
if (ChoiceSameAs(*WordChoice, best_raw_choice_))
FillViableChoice(*WordChoice, AdjustFactor, Certainties, true,
if (!best_raw_choice_) {
best_raw_choice_ =
NewViableChoice(*WordChoice, AdjustFactor, Certainties);
} else if (WordChoice->rating() < best_raw_choice_->Rating) {
if (ChoiceSameAs(*WordChoice, best_raw_choice_)) {
FillViableChoice(*WordChoice, AdjustFactor, Certainties,
best_raw_choice_);
else {
memfree(best_raw_choice_);
} else {
delete best_raw_choice_;
best_raw_choice_ =
NewViableChoice(*WordChoice, AdjustFactor, Certainties);
}
......@@ -449,16 +497,20 @@ void Dict::LogNewChoice(FLOAT32 AdjustFactor,
// Throw out obviously bad choices to save some work.
if (ChoicesList != NIL_LIST) {
Threshold = AmbigThreshold (BestFactor (ChoicesList), AdjustFactor);
if (Threshold > -kStopperAmbiguityThresholdOffset)
Threshold = -kStopperAmbiguityThresholdOffset;
Threshold = StopperAmbigThreshold(BestFactor(ChoicesList), AdjustFactor);
if (Threshold > -stopper_ambiguity_threshold_offset)
Threshold = -stopper_ambiguity_threshold_offset;
if (WordChoice->certainty() - BestCertainty (ChoicesList) < Threshold) {
// Set the rating of the word to be terrible, so that it does not
// get chosen as the best choice.
if (stopper_debug_level >= 2) {
tprintf("Discarding a choice with an overly low certainty"
" %.4f vs best choice certainty %.4f\n",
WordChoice->certainty(), BestCertainty(ChoicesList));
STRING bad_string;
WordChoice->string_and_lengths(&bad_string, NULL);
tprintf("Discarding choice \"%s\" with an overly low certainty"
" %.4f vs best choice certainty %.4f (Threshold: %.4f)\n",
bad_string.string(), WordChoice->certainty(),
BestCertainty(ChoicesList),
Threshold + BestCertainty(ChoicesList));
}
WordChoice->set_rating(WERD_CHOICE::kBadRating);
return;
......@@ -466,7 +518,7 @@ void Dict::LogNewChoice(FLOAT32 AdjustFactor,
}
// See if a choice with the same text string has already been found.
NewChoice = NULL;
VIABLE_CHOICE NewChoice = NULL;
Choices = ChoicesList;
iterate(Choices) {
......@@ -480,11 +532,10 @@ void Dict::LogNewChoice(FLOAT32 AdjustFactor,
}
if (NewChoice) {
FillViableChoice(*WordChoice, AdjustFactor, Certainties, true, NewChoice);
FillViableChoice(*WordChoice, AdjustFactor, Certainties, NewChoice);
ChoicesList = delete_d(ChoicesList, NewChoice, is_same_node);
}
else {
NewChoice = NewViableChoice (*WordChoice, AdjustFactor, Certainties);
} else {
NewChoice = NewViableChoice(*WordChoice, AdjustFactor, Certainties);
}
ChoicesList = s_adjoin (ChoicesList, NewChoice, CmpChoiceRatings);
......@@ -494,7 +545,7 @@ void Dict::LogNewChoice(FLOAT32 AdjustFactor,
if (count (ChoicesList) > tessedit_truncate_wordchoice_log) {
Choices =
(LIST) nth_cell (ChoicesList, tessedit_truncate_wordchoice_log);
destroy_nodes (list_rest (Choices), Efree);
destroy_nodes(list_rest (Choices), DeleteViableChoiceStruct);
set_rest(Choices, NIL_LIST);
}
......@@ -513,7 +564,7 @@ bool Dict::NoDangerousAmbig(WERD_CHOICE *best_choice,
bool *modified_blobs) {
if (stopper_debug_level > 2) {
tprintf("\nRunning NoDangerousAmbig() for %s\n",
best_choice->debug_string(getUnicharset()).string());
best_choice->debug_string().string());
}
// Construct BLOB_CHOICE_LIST_VECTOR with ambiguities
......@@ -549,8 +600,10 @@ bool Dict::NoDangerousAmbig(WERD_CHOICE *best_choice,
for (i = 0; i < best_choice->length(); ++i) {
BLOB_CHOICE_LIST *lst = new BLOB_CHOICE_LIST();
BLOB_CHOICE_IT lst_it(lst);
// TODO(rays/antonova) Should these BLOB_CHOICEs use real xheights
// or are these fake ones good enough?
lst_it.add_to_end(new BLOB_CHOICE(best_choice->unichar_id(i),
0.0, 0.0, -1, -1, -1));
0.0, 0.0, -1, -1, -1, 0, 1, false));
ambig_blob_choices.push_back(lst);
}
}
......@@ -630,7 +683,7 @@ bool Dict::NoDangerousAmbig(WERD_CHOICE *best_choice,
BLOB_CHOICE_IT bc_it(ambig_blob_choices[i+tmp_index]);
bc_it.add_to_end(new BLOB_CHOICE(
ambig_spec->correct_fragments[tmp_index], -1.0, 0.0,
-1, -1, -1));
-1, -1, -1, 0, 1, false));
}
}
spec_it.forward();
......@@ -650,7 +703,7 @@ bool Dict::NoDangerousAmbig(WERD_CHOICE *best_choice,
} // end searching AmbigSpec_LIST
} // end searching best_choice
} // end searching replace and dangerous ambigs
if (modified_best_choice) best_choice->populate_unichars(getUnicharset());
if (modified_best_choice) best_choice->populate_unichars();
// If any ambiguities were found permute the constructed ambig_blob_choices
// to see if an alternative dictionary word can be found.
if (ambigs_found) {
......@@ -666,7 +719,7 @@ bool Dict::NoDangerousAmbig(WERD_CHOICE *best_choice,
if (ambigs_found) {
if (stopper_debug_level >= 1) {
tprintf ("Stopper: Possible ambiguous word = %s\n",
alt_word->debug_string(getUnicharset()).string());
alt_word->debug_string().string());
}
if (fixpt != NULL) {
// Note: Currently character choices combined from fragments can only
......@@ -691,6 +744,10 @@ bool Dict::NoDangerousAmbig(WERD_CHOICE *best_choice,
}
delete alt_word;
}
if (output_ambig_words_file_ != NULL) {
fprintf(output_ambig_words_file_, "\n");
}
ambig_blob_choices.delete_data_pointers();
return !ambigs_found;
}
......@@ -714,7 +771,6 @@ void Dict::AddNewChunk(VIABLE_CHOICE Choice, int Blob) {
return;
}
}
mem_tidy (1);
cprintf ("AddNewChunk failed:Choice->Length=%d, LastChunk=%d, Blob=%d\n",
Choice->Length, LastChunk, Blob);
assert(false); // this should never get executed
......@@ -748,7 +804,7 @@ void Dict::ReplaceAmbig(int wrong_ngram_begin_index, int wrong_ngram_size,
for (i = 0; i < fraglen; ++i) {
if (fraglen > 1) {
STRING frag_str =
CHAR_FRAGMENT::to_string(temp_uch, i, fraglen);
CHAR_FRAGMENT::to_string(temp_uch, i, fraglen, false);
getUnicharset().unichar_insert(frag_str.string());
uch_id = getUnicharset().unichar_to_id(frag_str.string());
}
......@@ -756,7 +812,7 @@ void Dict::ReplaceAmbig(int wrong_ngram_begin_index, int wrong_ngram_size,
STRING correct_frag_uch =
CHAR_FRAGMENT::to_string(correct_ngram_str,
temp_blob_index - begin_blob_index,
num_blobs_to_replace);
num_blobs_to_replace, false);
getUnicharset().unichar_insert(correct_frag_uch.string());
UNICHAR_ID correct_frag_uch_id =
getUnicharset().unichar_to_id(correct_frag_uch.string());
......@@ -825,10 +881,9 @@ VIABLE_CHOICE Dict::NewViableChoice(const WERD_CHOICE &WordChoice,
const float Certainties[]) {
int Length = WordChoice.length();
assert (Length <= MAX_NUM_CHUNKS && Length > 0);
VIABLE_CHOICE NewChoice = (VIABLE_CHOICE) Emalloc (
sizeof (VIABLE_CHOICE_STRUCT) + (Length - 1) * sizeof (CHAR_CHOICE));
FillViableChoice(WordChoice, AdjustFactor, Certainties, false, NewChoice);
return (NewChoice);
VIABLE_CHOICE NewChoice = new VIABLE_CHOICE_STRUCT(Length);
FillViableChoice(WordChoice, AdjustFactor, Certainties, NewChoice);
return NewChoice;
}
void Dict::PrintViableChoice(FILE *File, const char *Label, VIABLE_CHOICE Choice) {
......@@ -864,35 +919,10 @@ void Dict::PrintViableChoice(FILE *File, const char *Label, VIABLE_CHOICE Choice
void Dict::FillViableChoice(const WERD_CHOICE &WordChoice,
FLOAT32 AdjustFactor, const float Certainties[],
bool SameString, VIABLE_CHOICE ViableChoice) {
CHAR_CHOICE *NewChar;
BLOB_WIDTH *BlobWidth;
int x;
ViableChoice->Rating = WordChoice.rating();
ViableChoice->Certainty = WordChoice.certainty();
ViableChoice->AdjustFactor = AdjustFactor;
ViableChoice->ComposedFromCharFragments = false;
if (!SameString) {
ViableChoice->Length = WordChoice.length();
}
for (x = 0,
NewChar = &(ViableChoice->Blob[0]),
BlobWidth = current_segmentation_;
x < WordChoice.length();
x++, NewChar++, Certainties++, BlobWidth++) {
if (!SameString) {
NewChar->Class = WordChoice.unichar_id(x);
}
NewChar->NumChunks = *BlobWidth;
NewChar->Certainty = *Certainties;
for (int i = 1; i < WordChoice.fragment_length(x); ++i) {
BlobWidth++;
assert(*BlobWidth > 0);
NewChar->NumChunks += *BlobWidth;
ViableChoice->ComposedFromCharFragments = true;
}
}
VIABLE_CHOICE ViableChoice) {
ViableChoice->Init(WordChoice, current_segmentation_, Certainties,
AdjustFactor);
}
bool Dict::StringSameAs(const WERD_CHOICE &WordChoice,
......
......@@ -27,6 +27,8 @@
#include "states.h"
#include "unichar.h"
class WERD_CHOICE;
typedef uinT8 BLOB_WIDTH;
struct DANGERR_INFO {
......@@ -50,13 +52,36 @@ struct CHAR_CHOICE {
float Certainty;
};
struct VIABLE_CHOICE_STRUCT {
class VIABLE_CHOICE_STRUCT {
public:
VIABLE_CHOICE_STRUCT();
explicit VIABLE_CHOICE_STRUCT(int length);
~VIABLE_CHOICE_STRUCT();
// Fill in the data with these values.
void Init(const WERD_CHOICE& word_choice,
const PIECES_STATE& pieces_state,
const float certainties[],
FLOAT32 adjust_factor);
int Length;
float Rating;
float Certainty;
FLOAT32 AdjustFactor;
int Length;
bool ComposedFromCharFragments;
CHAR_CHOICE Blob[1];
CHAR_CHOICE *Blob;
// segmentation_state: for each choice, how many consecutive blobs
// does it use?
uinT8 *segmentation_state;
private:
// Disallow assignment and copy construction
VIABLE_CHOICE_STRUCT(const VIABLE_CHOICE_STRUCT &other)
: Length(0), Blob(NULL), segmentation_state(NULL) {}
VIABLE_CHOICE_STRUCT &operator=(const VIABLE_CHOICE_STRUCT &other) {
return *this;
}
};
typedef VIABLE_CHOICE_STRUCT *VIABLE_CHOICE;
......
......@@ -40,6 +40,16 @@
namespace tesseract {
// Printable names for the Trie::RTLReversePolicy values. Each string
// spells out the identifier of the corresponding enum value.
const char kDoNotReverse[] = "RRP_DO_NO_REVERSE";
const char kReverseIfHasRTL[] = "RRP_REVERSE_IF_HAS_RTL";
const char kForceReverse[] = "RRP_FORCE_REVERSE";
// Lookup table indexed directly by a Trie::RTLReversePolicy value (see
// Trie::get_reverse_policy_name()). The entries must stay in the same
// order as the enum declaration.
const char * const RTLReversePolicyNames[] = {
kDoNotReverse,
kReverseIfHasRTL,
kForceReverse
};
const char Trie::kAlphaPatternUnicode[] = "\u2000";
const char Trie::kDigitPatternUnicode[] = "\u2001";
const char Trie::kAlphanumPatternUnicode[] = "\u2002";
......@@ -47,6 +57,10 @@ const char Trie::kPuncPatternUnicode[] = "\u2003";
const char Trie::kLowerPatternUnicode[] = "\u2004";
const char Trie::kUpperPatternUnicode[] = "\u2005";
// Returns the printable name of reverse_policy by using it as an index
// into RTLReversePolicyNames. No range check is performed, so the argument
// must be one of the declared RTLReversePolicy values.
const char *Trie::get_reverse_policy_name(RTLReversePolicy reverse_policy) {
  const char *const policy_name = RTLReversePolicyNames[reverse_policy];
  return policy_name;
}
// Reset the Trie to empty.
void Trie::clear() {
nodes_.delete_data_pointers();
......@@ -156,10 +170,15 @@ void Trie::add_word_ending(EDGE_RECORD *edge_ptr,
*edge_ptr |= (WERD_END_FLAG << flag_start_bit_);
}
void Trie::add_word_to_dawg(const WERD_CHOICE &word,
bool Trie::add_word_to_dawg(const WERD_CHOICE &word,
const GenericVector<bool> *repetitions) {
if (word.length() <= 0) return; // can't add empty words
if (word.length() <= 0) return false; // can't add empty words
if (repetitions != NULL) ASSERT_HOST(repetitions->size() == word.length());
// Make sure the word does not contain invalid unichar ids.
for (int i = 0; i < word.length(); ++i) {
if (word.unichar_id(i) < 0 ||
word.unichar_id(i) >= unicharset_size_) return false;
}
EDGE_RECORD *edge_ptr;
NODE_REF last_node = 0;
......@@ -233,6 +252,9 @@ void Trie::add_word_to_dawg(const WERD_CHOICE &word,
if (add_failed) {
tprintf("Re-initializing document dictionary...\n");
clear();
return false;
} else {
return true;
}
}
......@@ -244,7 +266,8 @@ NODE_REF Trie::new_dawg_node() {
}
bool Trie::read_word_list(const char *filename,
const UNICHARSET &unicharset) {
const UNICHARSET &unicharset,
Trie::RTLReversePolicy reverse_policy) {
FILE *word_file;
char string[CHARS_PER_LINE];
int word_count = 0;
......@@ -254,6 +277,11 @@ bool Trie::read_word_list(const char *filename,
while (fgets(string, CHARS_PER_LINE, word_file) != NULL) {
chomp_string(string); // remove newline
WERD_CHOICE word(string, unicharset);
if ((reverse_policy == RRP_REVERSE_IF_HAS_RTL &&
word.has_rtl_unichar_id()) ||
reverse_policy == RRP_FORCE_REVERSE) {
word.reverse_and_mirror_unichar_ids();
}
++word_count;
if (debug_level_ && word_count % 10000 == 0)
tprintf("Read %d words so far\n", word_count);
......@@ -290,6 +318,7 @@ void Trie::initialize_patterns(UNICHARSET *unicharset) {
unicharset->unichar_insert(kUpperPatternUnicode);
upper_pattern_ = unicharset->unichar_to_id(kUpperPatternUnicode);
initialized_patterns_ = true;
unicharset_size_ = unicharset->size();
}
void Trie::unichar_id_to_patterns(UNICHAR_ID unichar_id,
......@@ -351,7 +380,7 @@ bool Trie::read_pattern_list(const char *filename,
chomp_string(string); // remove newline
// Parse the pattern and construct a unichar id vector.
// Record the number of repetitions of each unichar in the parallel vector.
WERD_CHOICE word;
WERD_CHOICE word(&unicharset);
GenericVector<bool> repetitions_vec;
const char *str_ptr = string;
int step = unicharset.step(str_ptr);
......@@ -397,7 +426,7 @@ bool Trie::read_pattern_list(const char *filename,
// Insert the pattern into the trie.
if (debug_level_ > 2) {
tprintf("Inserting expanded user pattern %s\n",
word.debug_string(unicharset).string());
word.debug_string().string());
}
if (!this->word_in_dawg(word)) {
this->add_word_to_dawg(word, &repetitions_vec);
......
......@@ -61,6 +61,12 @@ namespace tesseract {
*/
class Trie : public Dawg {
public:
// Controls whether read_word_list() reverses and mirrors the unichar ids
// of a word before inserting it into the Trie. The values are also used
// as indices into the policy-name table, so their order matters.
enum RTLReversePolicy {
// Never reverse: words are inserted exactly as read.
RRP_DO_NO_REVERSE,
// Reverse only words for which WERD_CHOICE::has_rtl_unichar_id() is true.
RRP_REVERSE_IF_HAS_RTL,
// Reverse every word unconditionally.
RRP_FORCE_REVERSE,
};
// Minimum number of concrete characters at the beginning of user patterns.
static const int kSaneNumConcreteChars = 4;
// Various unicode whitespace characters are used to denote unichar patterns,
......@@ -73,6 +79,9 @@ class Trie : public Dawg {
static const char kLowerPatternUnicode[];
static const char kUpperPatternUnicode[];
static const char *get_reverse_policy_name(
RTLReversePolicy reverse_policy);
// max_num_edges argument allows limiting the amount of memory this
// Trie can consume (if a new word insert would cause the Trie to
// contain more edges than max_num_edges, all the edges are cleared
......@@ -86,7 +95,7 @@ class Trie : public Dawg {
new_dawg_node(); // need to allocate node 0
initialized_patterns_ = false;
}
~Trie() { nodes_.delete_data_pointers(); }
virtual ~Trie() { nodes_.delete_data_pointers(); }
// Reset the Trie to empty.
void clear();
......@@ -149,8 +158,11 @@ class Trie : public Dawg {
SquishedDawg *trie_to_dawg();
// Inserts the list of words from the given file into the Trie.
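// Depending on the given reverse policy, calls
// WERD_CHOICE::reverse_and_mirror_unichar_ids() on a word before inserting
// it into the Trie: always for RRP_FORCE_REVERSE, and only for words that
// contain an RTL unichar id for RRP_REVERSE_IF_HAS_RTL.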
bool read_word_list(const char *filename,
const UNICHARSET &unicharset);
const UNICHARSET &unicharset,
Trie::RTLReversePolicy reverse);
// Inserts the list of patterns from the given file into the Trie.
// The pattern list file should contain one pattern per line in UTF-8 format.
......@@ -225,10 +237,13 @@ class Trie : public Dawg {
// whether the unichar id with the corresponding index in the word is allowed
// to repeat an unlimited number of times. For each entry that is true, MARKER
// flag of the corresponding edge created for this unichar id is set to true).
void add_word_to_dawg(const WERD_CHOICE &word,
//
// Return true if add succeeded, false otherwise (e.g. when a word contained
// an invalid unichar id or the trie was getting too large and was cleared).
bool add_word_to_dawg(const WERD_CHOICE &word,
const GenericVector<bool> *repetitions);
void add_word_to_dawg(const WERD_CHOICE &word) {
add_word_to_dawg(word, NULL);
bool add_word_to_dawg(const WERD_CHOICE &word) {
return add_word_to_dawg(word, NULL);
}
protected:
......@@ -377,11 +392,11 @@ class Trie : public Dawg {
UNICHAR_ID character_class_to_pattern(char ch);
// Member variables
TRIE_NODES nodes_; ///< vector of nodes in the Trie
uinT64 num_edges_; ///< sum of all edges (forward and backward)
uinT64 max_num_edges_; ///< maximum number of edges allowed
uinT64 deref_direction_mask_; ///< mask for EDGE_REF to extract direction
uinT64 deref_node_index_mask_; ///< mask for EDGE_REF to extract node index
TRIE_NODES nodes_; // vector of nodes in the Trie
uinT64 num_edges_; // sum of all edges (forward and backward)
uinT64 max_num_edges_; // maximum number of edges allowed
uinT64 deref_direction_mask_; // mask for EDGE_REF to extract direction
uinT64 deref_node_index_mask_; // mask for EDGE_REF to extract node index
// Variables for translating character class codes denoted in user patterns
// file to the unichar ids used to represent them in a Trie.
bool initialized_patterns_;
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册