From 0d9fa6a040c9c5f24dc29c29b3f37fdc3707a4f4 Mon Sep 17 00:00:00 2001 From: theraysmith Date: Wed, 18 Jul 2007 01:01:50 +0000 Subject: [PATCH] Fixed portability problems with VC++ 6 and VC++ express. git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@83 d0cd1f9f-072b-0410-8dd7-cf729c803f20 --- ccmain/fixxht.cpp | 162 +++++++++++++++++++++++++----------------- ccutil/mfcpch.h | 4 +- cutil/danerror.cpp | 9 ++- dict/dawg.h | 125 +++++++++++++++++++++++++++----- training/training.cpp | 2 +- 5 files changed, 212 insertions(+), 90 deletions(-) diff --git a/ccmain/fixxht.cpp b/ccmain/fixxht.cpp index f1339dab..6dfb42b5 100644 --- a/ccmain/fixxht.cpp +++ b/ccmain/fixxht.cpp @@ -139,6 +139,7 @@ void re_estimate_x_ht( //improve for 1 word const char *word_str; INT16 i; + INT16 offset; STATS all_blobs_ht (0, 300); //every blob in word STATS x_ht (0, 300); //confirmed pts in wd @@ -174,8 +175,9 @@ void re_estimate_x_ht( //improve for 1 word Cycle blobs, allocating to one of the stats sets when possible. */ blob_it.set_to_list (word_res->outword->blob_list ()); - for (blob_it.mark_cycle_pt (), i = 0; - !blob_it.cycled_list (); blob_it.forward (), i++) { + for (blob_it.mark_cycle_pt (), i = 0, offset = 0; + !blob_it.cycled_list (); blob_it.forward (), + offset += word_res->best_choice->lengths()[i++]) { if (!dodgy_blob (blob_it.data ())) { blob_box = blob_it.data ()->bounding_box (); blob_ht_above_baseline = blob_box.top () - bln_baseline_offset; @@ -189,22 +191,22 @@ void re_estimate_x_ht( //improve for 1 word rej_blobs_max_area = blob_box.area (); } else { - if (STRING (chs_non_ambig_x_ht).contains (word_str[i])) + if (STRING (chs_non_ambig_x_ht).contains (word_str[offset])) x_ht.add (blob_ht_above_baseline, 1); - if (STRING (chs_non_ambig_caps_ht).contains (word_str[i])) + if (STRING (chs_non_ambig_caps_ht).contains (word_str[offset])) caps_ht.add (blob_ht_above_baseline, 1); - if (STRING (chs_ambig_caps_x).contains (word_str[i])) { + if (STRING (chs_ambig_caps_x).contains (word_str[offset])) { case_ambig.add (blob_ht_above_baseline, 1); - if (STRING (chs_x_ht).contains (word_str[i])) + if (STRING (chs_x_ht).contains (word_str[offset])) x_ht_ambigs++; else caps_ht_ambigs++; } - if (STRING (chs_bl_ambig_caps_x).contains (word_str[i])) { - if (STRING (chs_x_ht).contains (word_str[i])) { + if (STRING (chs_bl_ambig_caps_x).contains (word_str[offset])) { + if (STRING (chs_x_ht).contains (word_str[offset])) { /* confirm x_height provided > 15% total height below baseline */ if ((bln_baseline_offset - blob_box.bottom ()) / (float) blob_box.height () > 0.15) @@ -222,7 +224,7 @@ void re_estimate_x_ht( //improve for 1 word } est_caps_ht = estimate_from_stats (caps_ht); est_x_ht = estimate_from_stats (x_ht); - est_ambigs(word_res, case_ambig, &ambig_lc_x_est, &ambig_uc_caps_est); + est_ambigs(word_res, case_ambig, &ambig_lc_x_est, &ambig_uc_caps_est); max_blob_ht = all_blobs_ht.ile (0.9999); #ifndef SECURE_NAMES @@ -265,7 +267,7 @@ void re_estimate_x_ht( //improve for 1 word est_caps_ht = est_x_ht / x_ht_fraction_of_caps_ht; } if (case_ambig.get_total () > 0) - improve_estimate(word_res, est_x_ht, est_caps_ht, x_ht, caps_ht); + improve_estimate(word_res, est_x_ht, est_caps_ht, x_ht, caps_ht); est_caps_ht_certain = caps_ht.get_total () > 0; #ifndef SECURE_NAMES if (debug_x_ht_level >= 20) @@ -281,7 +283,7 @@ void re_estimate_x_ht( //improve for 1 word else est_x_ht = est_caps_ht * x_ht_fraction_of_caps_ht; if (ambig_lc_x_est + ambig_uc_caps_est > 0) - improve_estimate(word_res, est_x_ht, est_caps_ht, x_ht, caps_ht); + improve_estimate(word_res, est_x_ht, est_caps_ht, x_ht, caps_ht); est_x_ht_certain = x_ht.get_total () > 0; #ifndef SECURE_NAMES if (debug_x_ht_level >= 20) @@ -454,7 +456,7 @@ void re_estimate_x_ht( //improve for 1 word word_res->reject_map.print (debug_fp); } #endif - reject_ambigs(word_res); + reject_ambigs(word_res); if (debug_x_ht_level >= 2) { tprintf (" "); word_res->reject_map.print (debug_fp); @@ -495,19 +497,25 @@ void re_estimate_x_ht( //improve for 1 word * case of case ambiguous chars as required. *************************************************************************/ -void check_block_occ(WERD_RES *word_res) { +void check_block_occ(WERD_RES *word_res) { PBLOB_IT blob_it; STRING new_string; + STRING new_string_lengths(word_res->best_choice->lengths()); +// char new_string_lengths[word_res->best_choice->lengths().length() + 1]; REJMAP new_map = word_res->reject_map; WERD_CHOICE *new_choice; const char *word_str = word_res->best_choice->string ().string (); INT16 i; + INT16 offset; INT16 reject_count = 0; - char confirmed_char; + char confirmed_char[UNICHAR_LEN + 1]; + char temp_char[UNICHAR_LEN + 1]; float x_ht; float caps_ht; + new_string_lengths[0] = 0; + if (word_res->x_height > 0) x_ht = word_res->x_height * word_res->denorm.scale (); else @@ -520,24 +528,31 @@ void check_block_occ(WERD_RES *word_res) { blob_it.set_to_list (word_res->outword->blob_list ()); - for (blob_it.mark_cycle_pt (), i = 0; - !blob_it.cycled_list (); blob_it.forward (), i++) { - new_string += word_str[i]; //default copy + for (blob_it.mark_cycle_pt (), i = 0, offset = 0; + !blob_it.cycled_list (); blob_it.forward (), + offset += word_res->best_choice->lengths()[i++]) { + strncpy(temp_char, word_str + offset, + word_res->best_choice->lengths()[i]); //default copy + temp_char[word_res->best_choice->lengths()[i]] = '\0'; if (word_res->reject_map[i].accepted ()) { - confirmed_char = check_blob_occ (word_str[i], - blob_it.data ()->bounding_box (). - top () - bln_baseline_offset, x_ht, - caps_ht); + check_blob_occ (temp_char, + blob_it.data ()->bounding_box (). + top () - bln_baseline_offset, x_ht, + caps_ht, confirmed_char); - if (confirmed_char == '\0') { + if (strcmp(confirmed_char, "") == 0) { if (rej_use_check_block_occ) { new_map[i].setrej_xht_fixup (); reject_count++; } } else - new_string[i] = confirmed_char; + strcpy(temp_char, confirmed_char); } + new_string += temp_char; + new_string_lengths[i] = strlen(temp_char); + new_string_lengths[i + 1] = 0; + } if ((reject_count > 0) || (new_string != word_str)) { if (debug_x_ht_level >= 2) { @@ -548,9 +563,10 @@ void check_block_occ(WERD_RES *word_res) { tprintf ("\n"); } new_choice = new WERD_CHOICE (new_string.string (), - word_res->best_choice->rating (), - word_res->best_choice->certainty (), - word_res->best_choice->permuter ()); + new_string_lengths.string(), + word_res->best_choice->rating (), + word_res->best_choice->certainty (), + word_res->best_choice->permuter ()); delete word_res->best_choice; word_res->best_choice = new_choice; word_res->reject_map = new_map; @@ -562,13 +578,14 @@ void check_block_occ(WERD_RES *word_res) { * check_blob_occ() * * Checks blob for position relative to position above baseline - * Returns 0 for reject, or (possibly case shifted) confirmed char + * Return 0 for reject, or (possibly case shifted) confirmed char *************************************************************************/ -char check_blob_occ(char proposed_char, +void check_blob_occ(char* proposed_char, INT16 blob_ht_above_baseline, float x_ht, - float caps_ht) { + float caps_ht, + char* confirmed_char) { BOOL8 blob_definite_x_ht; BOOL8 blob_definite_caps_ht; float acceptable_variation; @@ -593,41 +610,51 @@ char check_blob_occ(char proposed_char, blob_definite_caps_ht = blob_ht_above_baseline >= caps_ht - acceptable_variation; - if (STRING (chs_ambig_caps_x).contains (proposed_char)) { + if (STRING (chs_ambig_caps_x).contains (*proposed_char)) { if ((!blob_definite_x_ht && !blob_definite_caps_ht) || - (proposed_char == '0' && !blob_definite_caps_ht) || - (proposed_char == 'o' && !blob_definite_x_ht)) - return '\0'; + ((strcmp(proposed_char, "0") == 0) && !blob_definite_caps_ht) || + ((strcmp(proposed_char, "o") == 0) && !blob_definite_x_ht)) { + strcpy(confirmed_char, ""); + return; + } else if (blob_definite_caps_ht && - STRING (chs_x_ht).contains (proposed_char)) { - if (x_ht_case_flip) + STRING (chs_x_ht).contains (*proposed_char)) { + if (x_ht_case_flip) { //flip to upper case - return (char) toupper (proposed_char); - else - return '\0'; + proposed_char[0] = (char) toupper (*proposed_char); + return; + } else { + strcpy(confirmed_char, ""); + return; + } } else if (blob_definite_x_ht && - !STRING (chs_x_ht).contains (proposed_char)) { - if (x_ht_case_flip) + !STRING (chs_x_ht).contains (*proposed_char)) { + if (x_ht_case_flip) { //flip to lower case - return (char) tolower (proposed_char); - else - return '\0'; + proposed_char[0] = (char) tolower (*proposed_char); + } else { + strcpy(confirmed_char, ""); + return; + } } } else - if ((STRING (chs_non_ambig_x_ht).contains (proposed_char) + if ((STRING (chs_non_ambig_x_ht).contains (*proposed_char) && !blob_definite_x_ht) - || (STRING (chs_non_ambig_caps_ht).contains (proposed_char) - && !blob_definite_caps_ht)) - return '\0'; - return proposed_char; + || (STRING (chs_non_ambig_caps_ht).contains (*proposed_char) + && !blob_definite_caps_ht)) { + strcpy(confirmed_char, ""); + return; + } + strcpy(confirmed_char, proposed_char); + return; } -float estimate_from_stats(STATS &stats) { +float estimate_from_stats(STATS &stats) { if (stats.get_total () <= 0) return 0.0; else if (stats.get_total () >= 3) @@ -647,8 +674,10 @@ void improve_estimate(WERD_RES *word_res, const char *word_str; INT16 i; + INT16 offset; BOX blob_box; //blob bounding box - char confirmed_char; + char confirmed_char[UNICHAR_LEN + 1]; + char temp_char[UNICHAR_LEN + 1]; float new_val; /* IMPROVE estimates here - if good estimates, and case ambig chars, @@ -658,17 +687,21 @@ void improve_estimate(WERD_RES *word_res, blob_it.set_to_list (word_res->outword->blob_list ()); word_str = word_res->best_choice->string ().string (); - for (blob_it.mark_cycle_pt (), i = 0; - !blob_it.cycled_list (); blob_it.forward (), i++) { - if ((STRING (chs_ambig_caps_x).contains (word_str[i])) && + for (blob_it.mark_cycle_pt (), i = 0, offset = 0; + !blob_it.cycled_list (); blob_it.forward (), + offset += word_res->best_choice->lengths()[i++]) { + if ((STRING (chs_ambig_caps_x).contains (word_str[offset])) && (!dodgy_blob (blob_it.data ()))) { blob_box = blob_it.data ()->bounding_box (); blob_ht_above_baseline = blob_box.top () - bln_baseline_offset; - confirmed_char = check_blob_occ (word_str[i], - blob_ht_above_baseline, - est_x_ht, est_caps_ht); - if (confirmed_char != '\0') - if (STRING (chs_x_ht).contains (confirmed_char)) + strncpy(temp_char, word_str + offset, + word_res->best_choice->lengths()[i]); + temp_char[word_res->best_choice->lengths()[i]] = '\0'; + check_blob_occ (temp_char, + blob_ht_above_baseline, + est_x_ht, est_caps_ht, confirmed_char); + if (strcmp(confirmed_char, "") != 0) + if (STRING (chs_x_ht).contains (*confirmed_char)) x_ht.add (blob_ht_above_baseline, 1); else caps_ht.add (blob_ht_above_baseline, 1); @@ -692,8 +725,7 @@ void reject_ambigs( //rej any accepted xht ambig chars while (*word_str != '\0') { if (STRING (chs_ambig_caps_x).contains (*word_str)) word->reject_map[i].setrej_xht_fixup (); - word_str++; - i++; + word_str += word->best_choice->lengths()[i++]; } } @@ -713,6 +745,7 @@ void est_ambigs( //xht ambig ht stats const char *word_str; INT16 i; + INT16 offset; float min; //min ambig ch ht float max; //max ambig ch ht float short_limit; // for lower case @@ -738,10 +771,11 @@ void est_ambigs( //xht ambig ht stats tall_limit = max - (max - min) * x_ht_variation; word_str = word_res->best_choice->string ().string (); blob_it.set_to_list (word_res->outword->blob_list ()); - for (blob_it.mark_cycle_pt (), i = 0; - !blob_it.cycled_list (); blob_it.forward (), i++) { + for (blob_it.mark_cycle_pt (), i = 0, offset = 0; + !blob_it.cycled_list (); blob_it.forward (), + offset += word_res->best_choice->lengths()[i++]) { if (word_res->reject_map[i].accepted () && - STRING (chs_ambig_caps_x).contains (word_str[i]) && + STRING (chs_ambig_caps_x).contains (word_str[offset]) && (!dodgy_blob (blob_it.data ()))) { blob_box = blob_it.data ()->bounding_box (); blob_ht_above_baseline = @@ -770,7 +804,7 @@ void est_ambigs( //xht ambig ht stats * to be misleading *************************************************************************/ -BOOL8 dodgy_blob(PBLOB *blob) { +BOOL8 dodgy_blob(PBLOB *blob) { OUTLINE_IT outline_it = blob->out_list (); INT16 highest_bottom = -MAX_INT16; INT16 lowest_top = MAX_INT16; diff --git a/ccutil/mfcpch.h b/ccutil/mfcpch.h index d9d21723..cd30936c 100644 --- a/ccutil/mfcpch.h +++ b/ccutil/mfcpch.h @@ -5,10 +5,10 @@ // For Unix and mac the file does nothing. It needs to be included in all cpp // files for compatibility with the PC pre-compiled header mechanism. #ifdef __MSW32__ -#ifdef __IPEREGDLL +#ifndef _AFXDLL #define WIN32_LEAN_AND_MEAN #define STRICT 1 -#include +//#include #include #else #define VC_EXTRALEAN // Exclude rarely-used stuff from Windows headers diff --git a/cutil/danerror.cpp b/cutil/danerror.cpp index 874ea169..2a2c95b1 100644 --- a/cutil/danerror.cpp +++ b/cutil/danerror.cpp @@ -44,7 +44,7 @@ static INT32 CurrentTrapDepth = 0; Public Code ----------------------------------------------------------------------------**/ /*---------------------------------------------------------------------------*/ -void ReleaseErrorTrap() { +void ReleaseErrorTrap() { /* ** Parameters: ** None @@ -69,7 +69,7 @@ void ReleaseErrorTrap() { /*---------------------------------------------------------------------------*/ -void DoError(int Error, const char *Message) { +void DoError(int Error, const char *Message) { /* ** Parameters: ** Error error number which is to be trapped @@ -100,14 +100,13 @@ void DoError(int Error, const char *Message) { There used to be a call to abort() here. I've changed it to call into the C++ error code to generate a meaningful status code */ - signal_termination_handler(Error); + signal_termination_handler(Error); } if (ProcTrapStack[CurrentTrapDepth - 1] != DO_NOTHING) (*ProcTrapStack[CurrentTrapDepth - 1]) (); longjmp (ErrorTrapStack[CurrentTrapDepth - 1], 1); - assert(FALSE); } /* DoError */ @@ -115,7 +114,7 @@ void DoError(int Error, const char *Message) { Private Code ----------------------------------------------------------------------------**/ /*---------------------------------------------------------------------------*/ -jmp_buf &PushErrorTrap(VOID_PROC Procedure) { +jmp_buf &PushErrorTrap(VOID_PROC Procedure) { /* ** Parameters: ** Procedure trap procedure to execute diff --git a/dict/dawg.h b/dict/dawg.h index 6e37cb60..a124e3de 100644 --- a/dict/dawg.h +++ b/dict/dawg.h @@ -35,21 +35,59 @@ /*---------------------------------------------------------------------- T y p e s ----------------------------------------------------------------------*/ -#define MAX_WERD_LENGTH (INT32) 40 -#define MAX_NODE_EDGES (INT32) 100 -#define LAST_FLAG (INT32) 1 -#define DIRECTION_FLAG (INT32) 2 -#define WERD_END_FLAG (INT32) 4 +/* #define MAX_WERD_LENGTH (INT32) 40 */ +/* #define MAX_NODE_EDGES_DISPLAY (INT32) 100 */ +/* #define LAST_FLAG (INT32) 1 */ +/* #define DIRECTION_FLAG (INT32) 2 */ +/* #define WERD_END_FLAG (INT32) 4 */ + +/* #define LETTER_START_BIT 0 */ +/* #define FLAG_START_BIT 8 */ +/* #define NEXT_EDGE_START_BIT 11 */ + +/* #define NO_EDGE (INT32) 0x001fffff */ + +/* #define NEXT_EDGE_MASK (INT32) 0xfffff800 */ +/* #define FLAGS_MASK (INT32) 0x00000700 */ +/* #define LETTER_MASK (INT32) 0x000000ff */ + +/* #define REFFORMAT "%d" */ + +/* typedef UINT32 EDGE_RECORD; */ +/* typedef EDGE_RECORD *EDGE_ARRAY; */ +/* typedef INT32 EDGE_REF; */ +/* typedef INT32 NODE_REF; */ + +#define MAX_WERD_LENGTH (INT64) 40 +#define MAX_NODE_EDGES_DISPLAY (INT64) 100 +#define LAST_FLAG (INT64) 1 +#define DIRECTION_FLAG (INT64) 2 +#define WERD_END_FLAG (INT64) 4 + +#define LETTER_START_BIT 0 +#define FLAG_START_BIT 8 +#define NEXT_EDGE_START_BIT 11 + +#ifdef __MSW32__ +#define NO_EDGE (INT64) 0x001fffffffffffffi64 +#define NEXT_EDGE_MASK (INT64) 0xfffffffffffff800i64 +#define FLAGS_MASK (INT64) 0x0000000000000700i64 +#define LETTER_MASK (INT64) 0x00000000000000ffi64 +#else +#define NO_EDGE (INT64) 0x001fffffffffffffll +#define NEXT_EDGE_MASK (INT64) 0xfffffffffffff800ll +#define FLAGS_MASK (INT64) 0x0000000000000700ll +#define LETTER_MASK (INT64) 0x00000000000000ffll +#endif -#define FLAG_START_BIT 21 -#define LETTER_START_BIT 24 +#define MAX_NUM_EDGES_IN_SQUISHED_DAWG_FILE 2000000 -#define NO_EDGE (INT32) 0x1fffff +#define REFFORMAT "%lld" -typedef UINT32 EDGE_RECORD; +typedef UINT64 EDGE_RECORD; typedef EDGE_RECORD *EDGE_ARRAY; -typedef INT32 EDGE_REF; -typedef INT32 NODE_REF; +typedef INT64 EDGE_REF; +typedef INT64 NODE_REF; /*--------------------------------------------------------------------- V a r i a b l e s @@ -60,6 +98,28 @@ extern INT32 debug; /*---------------------------------------------------------------------- M a c r o s ----------------------------------------------------------------------*/ +/********************************************************************** +* edge_of +* +* Access the edge that is indexed by the requested edge number. +**********************************************************************/ + +#define edge_of(edges,e) \ + ((edges)[e]) + +/********************************************************************** +* print_edge +* +* Print the contents of a single edge entry in the DAWG. +**********************************************************************/ + +#define print_edge(dawg,edge) \ + printf ("%7d : next = %7d, char = '%c', %s %s %s\n", \ + edge, next_node (dawg, edge), edge_letter (dawg, edge), \ + (forward_edge (dawg, edge) ? "FORWARD" : " "), \ + (last_edge (dawg, edge) ? "LAST" : " "), \ + (end_of_word (dawg, edge) ? "EOW" : "")) \ + /********************************************************************** * next_node * @@ -67,7 +127,7 @@ extern INT32 debug; **********************************************************************/ #define next_node(edges,e) \ -((edges)[e] & NO_EDGE) +(((edges)[e] & NEXT_EDGE_MASK) >> NEXT_EDGE_START_BIT) /********************************************************************** * set_next_edge @@ -76,8 +136,17 @@ extern INT32 debug; **********************************************************************/ #define set_next_edge(edges,e,value) \ -((edges)[e] = ((edges)[e] & (INT32) 0xffe00000) |\ - (value & NO_EDGE)) +((edges)[e] = ((edges)[e] & (~NEXT_EDGE_MASK)) |\ + ((value << NEXT_EDGE_START_BIT) & NEXT_EDGE_MASK)) + +/********************************************************************** +* empty_edge_spot +* +* Return TRUE if this edge spot in this location is unoccupied. +**********************************************************************/ + +#define empty_edge_spot(edges,e) \ + ((edges)[e] == NEXT_EDGE_MASK) /********************************************************************** * set_empty_edge @@ -86,7 +155,7 @@ extern INT32 debug; **********************************************************************/ #define set_empty_edge(edges,e) \ -((edges)[e] = NO_EDGE) +((edges)[e] = NEXT_EDGE_MASK) /********************************************************************** * clear_all_edges @@ -105,7 +174,16 @@ for (edge=0; edge> FLAG_START_BIT) /********************************************************************** * edge_letter @@ -114,7 +192,16 @@ for (edge=0; edge> LETTER_START_BIT) +((char)(((edges)[e] & LETTER_MASK) >> LETTER_START_BIT)) + +/********************************************************************** +* letter_of_edge +* +* The letter choice that corresponds to this edge in the DAWG. +**********************************************************************/ + +#define letter_of_edge(edge) \ + ((char)((edge & LETTER_MASK) >> LETTER_START_BIT)) /********************************************************************** * last_edge @@ -171,7 +258,9 @@ while (! last_edge (edges,e++)) * Check the case of this character in the character string to make * sure that there is not a problem with the case. **********************************************************************/ - +// TODO(tkielbus) Replace islalpha, islower & isupper by unicode versions. +// However the lengths information is not available at this point in the +// code. We will probably get rid of the dictionaries at some point anyway. #define case_is_okay(word,i) \ (i ? \ ((isupper(word[i]) && islower(word[i-1])) ? \ diff --git a/training/training.cpp b/training/training.cpp index 652581bf..a14f8b56 100644 --- a/training/training.cpp +++ b/training/training.cpp @@ -23,7 +23,7 @@ make_int_var (LearningDebugLevel, 0, MakeLearningDebugLevel, make_int_var (NormMethod, character, MakeNormMethod, 15, 10, SetNormMethod, "Normalization Method ...") -char *demodir; /*demo home directory */ +//char *demodir; /*demo home directory */ void cprintf( //Trace printf -- GitLab