diff --git a/ccmain/control.cpp b/ccmain/control.cpp index f2bb24ead7397592bb71d481a5b8d98314dd9bf9..0c47d9ab38aa2a0965e4f543eb4eafed1b9084d7 100644 --- a/ccmain/control.cpp +++ b/ccmain/control.cpp @@ -50,6 +50,7 @@ #include "notdll.h" #include "tordvars.h" #include "adaptmatch.h" +#include "globals.h" #define MIN_FONT_ROW_COUNT 8 #define MAX_XHEIGHT_DIFF 3 @@ -148,18 +149,9 @@ EXTERN double_VAR (test_pt_x, 99999.99, "xcoord"); EXTERN double_VAR (test_pt_y, 99999.99, "ycoord"); extern int MatcherDebugLevel; -extern "C" { extern int display_ratings; } +extern int display_ratings; extern int number_debug; extern int adjust_debug; -/* -extern "C" { - extern int MatcherDebugLevel; - extern int display_ratings; - extern int number_debug; - extern int adjust_debug; -// extern int LearningDebugLevel; - }; -*/ FILE *choice_file = NULL; //Choice file ptr CLISTIZEH (PBLOB) CLISTIZE (PBLOB) @@ -569,8 +561,8 @@ if (dopasses==1) return; ////changed by jetsoft //needed for dll to output memory structure - if ((dopasses==0 || dopasses==2) && monitor) - output_pass (page_res_it,true, target_word_box); + if ((dopasses == 0 || dopasses == 2) && (monitor || tessedit_write_unlv)) + output_pass(page_res_it, ocr_char_space() > 0, target_word_box); // end jetsoft } @@ -620,34 +612,33 @@ void classify_word_pass1( //recog one word tess_default_matcher, word->raw_choice, &blob_choices, word->outword); - /* Test for TESS screw up on word. Recog_word has already ensured that the choice list, outword blob lists and best_choice string are the same length. A TESS screw up is indicated by a blank filled or 0 length string. 
*/ - if ((word->best_choice->string ().length () == 0) || + if ((word->best_choice->lengths ().length () == 0) || (strspn (word->best_choice->string ().string (), " ") == word->best_choice->string ().length ())) { word->done = FALSE; //Try again on pass2 - adaption may help word->tess_failed = TRUE; - word->reject_map.initialise (word->best_choice->string ().length ()); + word->reject_map.initialise (word->best_choice->lengths ().length ()); word->reject_map.rej_word_tess_failure (); } else { word->tess_failed = FALSE; - if ((word->best_choice->string ().length () != + if ((word->best_choice->lengths ().length () != word->outword->blob_list ()->length ()) || - (word->best_choice->string ().length () != blob_choices.length ())) { + (word->best_choice->lengths ().length () != blob_choices.length ())) { tprintf ("ASSERT FAIL String:\"%s\"; Strlen=%d; #Blobs=%d; #Choices=%d\n", word->best_choice->string ().string (), - word->best_choice->string ().length (), + word->best_choice->lengths ().length (), word->outword->blob_list ()->length (), blob_choices.length ()); } - ASSERT_HOST (word->best_choice->string ().length () == + ASSERT_HOST (word->best_choice->lengths ().length () == word->outword->blob_list ()->length ()); - ASSERT_HOST (word->best_choice->string ().length () == + ASSERT_HOST (word->best_choice->lengths ().length () == blob_choices.length ()); /* @@ -664,12 +655,12 @@ void classify_word_pass1( //recog one word fix_rep_char(word); } else { - fix_quotes ((char *) word->best_choice->string ().string (), + fix_quotes (word->best_choice, //turn to double word->outword, &blob_choices); if (tessedit_fix_hyphens) //turn 2 to 1 - fix_hyphens ((char *) word->best_choice->string ().string (), word->outword, &blob_choices); + fix_hyphens (word->best_choice, word->outword, &blob_choices); record_certainty (word->best_choice->certainty (), 1); //accounting @@ -692,7 +683,7 @@ void classify_word_pass1( //recog one word rejmap = NULL; else { ASSERT_HOST 
(word->reject_map.length () == - word->best_choice->string ().length ()); + word->best_choice->lengths ().length ()); for (index = 0; index < word->reject_map.length (); index++) { if (adapt_ok || word->reject_map[index].accepted ()) @@ -704,7 +695,9 @@ void classify_word_pass1( //recog one word } //adapt to it - tess_adapter (word->outword, &word->denorm, word->best_choice->string ().string (), word->raw_choice->string ().string (), rejmap); + tess_adapter (word->outword, &word->denorm, + *word->best_choice, + *word->raw_choice, rejmap); } if (tessedit_enable_doc_dict) @@ -712,10 +705,12 @@ void classify_word_pass1( //recog one word set_word_fonts(word, &blob_choices); } } +#if 0 if (tessedit_print_text) { write_cooked_text (bln_word, word->best_choice->string (), word->done, FALSE, stdout); } +#endif delete bln_word; blob_choices.deep_clear (); } @@ -898,10 +893,12 @@ void classify_word_pass2( //word to do #endif set_global_subloc_code(SUBLOC_NORM); +#if 0 if (tessedit_print_text) { write_cooked_text (word->outword, word->best_choice->string (), word->done, done_this_pass, stdout); } +#endif check_debug_pt (word, 50); } @@ -971,18 +968,18 @@ void match_word_pass2( //recog one word // tprintf("Empty word produced\n"); } else { - if ((word->best_choice->string ().length () != + if ((word->best_choice->lengths ().length () != word->outword->blob_list ()->length ()) || - (word->best_choice->string ().length () != blob_choices.length ())) { + (word->best_choice->lengths ().length () != blob_choices.length ())) { tprintf ("ASSERT FAIL String:\"%s\"; Strlen=%d; #Blobs=%d; #Choices=%d\n", word->best_choice->string ().string (), - word->best_choice->string ().length (), + word->best_choice->lengths ().length (), word->outword->blob_list ()->length (), blob_choices.length ()); } - ASSERT_HOST (word->best_choice->string ().length () == + ASSERT_HOST (word->best_choice->lengths ().length () == word->outword->blob_list ()->length ()); - ASSERT_HOST (word->best_choice->string 
().length () == + ASSERT_HOST (word->best_choice->lengths ().length () == blob_choices.length ()); word->tess_failed = FALSE; @@ -990,29 +987,29 @@ void match_word_pass2( //recog one word fix_rep_char(word); } else { - fix_quotes ((char *) word->best_choice->string ().string (), + fix_quotes (word->best_choice, word->outword, &blob_choices); if (tessedit_fix_hyphens) - fix_hyphens ((char *) word->best_choice->string ().string (), + fix_hyphens (word->best_choice, word->outword, &blob_choices); /* Dont trust fix_quotes! - though I think I've fixed the bug */ - if ((word->best_choice->string ().length () != - word->outword->blob_list ()->length ()) || - (word->best_choice->string ().length () != - blob_choices.length ())) { + if ((word->best_choice->lengths ().length () != + word->outword->blob_list ()->length ()) || + (word->best_choice->lengths ().length () != + blob_choices.length ())) { #ifndef SECURE_NAMES tprintf ("POST FIX_QUOTES FAIL String:\"%s\"; Strlen=%d; #Blobs=%d; #Choices=%d\n", - word->best_choice->string ().string (), - word->best_choice->string ().length (), - word->outword->blob_list ()->length (), - blob_choices.length ()); + word->best_choice->string ().string (), + word->best_choice->lengths ().length (), + word->outword->blob_list ()->length (), + blob_choices.length ()); #endif } - ASSERT_HOST (word->best_choice->string ().length () == + ASSERT_HOST (word->best_choice->lengths ().length () == word->outword->blob_list ()->length ()); - ASSERT_HOST (word->best_choice->string ().length () == + ASSERT_HOST (word->best_choice->lengths ().length () == blob_choices.length ()); word->tess_accepted = tess_acceptable_word (word->best_choice, @@ -1039,7 +1036,7 @@ void fix_rep_char( //Repeated char word ) { struct REP_CH { - char ch; + char ch[UNICHAR_LEN + 1]; int count; }; @@ -1048,19 +1045,25 @@ void fix_rep_char( //Repeated char word int rep_ch_count = 0; //how many unique chs const char *word_str; //the repeated chs int i, j; + int offset; int total 
= 0; int max = 0; - char maxch = ' '; //Most common char + char *maxch = NULL; //Most common char word_str = word->best_choice->string ().string (); - word_len = strlen (word_str); + word_len = word->best_choice->lengths ().length ();; rep_ch = (REP_CH *) alloc_mem (word_len * sizeof (REP_CH)); - for (i = 0; i < word_len; i++) { - for (j = 0; j < rep_ch_count && rep_ch[j].ch != word_str[i]; j++); + for (i = 0, offset = 0; i < word_len; + offset += word->best_choice->lengths()[i++]) { + for (j = 0; j < rep_ch_count && + strncmp(rep_ch[j].ch, word_str + offset, + word->best_choice->lengths()[i]) != 0; j++); if (j < rep_ch_count) rep_ch[j].count++; else { - rep_ch[rep_ch_count].ch = word_str[i]; + strncpy(rep_ch[rep_ch_count].ch, word_str + offset, + word->best_choice->lengths()[i]); + rep_ch[rep_ch_count].ch[word->best_choice->lengths()[i]] = '\0'; rep_ch[rep_ch_count].count = 1; rep_ch_count++; } @@ -1068,7 +1071,7 @@ void fix_rep_char( //Repeated char word for (j = 0; j < rep_ch_count; j++) { total += rep_ch[j].count; - if ((rep_ch[j].count > max) && (rep_ch[j].ch != ' ')) { + if ((rep_ch[j].count > max) && (*rep_ch[j].ch != ' ')) { max = rep_ch[j].count; maxch = rep_ch[j].ch; } @@ -1078,26 +1081,47 @@ void fix_rep_char( //Repeated char word free_mem(rep_ch); word->reject_map.initialise (word_len); - for (i = 0; i < word_len; i++) { - if (word_str[i] != maxch) + for (i = 0, offset = 0; i < word_len; + offset += word->best_choice->lengths()[i++]) { + if (strncmp(word_str + offset, maxch, + word->best_choice->lengths()[i]) != 0) //rej unrecognised blobs word->reject_map[i].setrej_bad_repetition (); } word->done = TRUE; } +// TODO(tkielbus) Decide between keeping this behavior here or modifying the +// training data. + +// Utility function for fix_quotes +// Return true if the next character in the string (given the UTF8 length in +// bytes) is a quote character. 
+static int is_simple_quote(const char* signed_str, int length) { + const unsigned char* str = reinterpret_cast<const unsigned char*>(signed_str); + //standard 1 byte quotes + return (length == 1 && (*str == '\'' || *str == '`')) || + //utf8 3 bytes curved quotes + (length == 3 && ((*str == 0xe2 && + *(str + 1) == 0x80 && + *(str + 2) == 0x98) || + (*str == 0xe2 && + *(str + 1) == 0x80 && + *(str + 2) == 0x99))); +} /********************************************************************** * fix_quotes * * Change pairs of quotes to double quotes. **********************************************************************/ - void fix_quotes( //make double quotes - char *string, //string to fix + WERD_CHOICE *choice, //choice to fix WERD *word, //word to do //char choices BLOB_CHOICE_LIST_CLIST *blob_choices) { - char *ptr; //string ptr + char *str = (char *) choice->string().string();//string ptr + int i; + int offset; //blobs PBLOB_IT blob_it = word->blob_list (); //choices @@ -1105,12 +1129,20 @@ void fix_quotes( //make double quotes BLOB_CHOICE_IT it1; //first choices BLOB_CHOICE_IT it2; //second choices - for (ptr = string; - *ptr != '\0'; ptr++, blob_it.forward (), choice_it.forward ()) { - if ((*ptr == '\'' || *ptr == '`') - && (*(ptr + 1) == '\'' || *(ptr + 1) == '`')) { - *ptr = '"'; //turn to double - strcpy (ptr + 1, ptr + 2); //shuffle up + for (i = 0, offset = 0; str[offset] != '\0'; + offset += choice->lengths()[i++], + blob_it.forward (), choice_it.forward ()) { + if (str[offset + choice->lengths()[i]] != '\0' && + is_simple_quote(str + offset, choice->lengths()[i]) && + is_simple_quote(str + offset + choice->lengths()[i], + choice->lengths()[i + 1])) { + str[offset] = '"'; //turn to double + strcpy (str + offset + 1, + str + offset + choice->lengths()[i] + + choice->lengths()[i + 1]); //shuffle up + choice->lengths()[i] = 1; + strcpy ((char*) choice->lengths().string() + i + 1, + choice->lengths().string() + i + 2); merge_blobs (blob_it.data (), blob_it.data_relative (1)); 
blob_it.forward (); delete blob_it.extract (); //get rid of spare @@ -1138,12 +1170,13 @@ void fix_quotes( //make double quotes * Change pairs of hyphens to a single hyphen if the bounding boxes touch * Typically a long dash which has been segmented. **********************************************************************/ - void fix_hyphens( //crunch double hyphens - char *string, //string to fix + WERD_CHOICE *choice, //choice to fix WERD *word, //word to do //char choices BLOB_CHOICE_LIST_CLIST *blob_choices) { - char *ptr; //string ptr + char *str = (char *) choice->string().string();//string ptr + int i; + int offset; //blobs PBLOB_IT blob_it = word->blob_list (); //choices @@ -1151,14 +1184,20 @@ void fix_hyphens( //crunch double hyphens BLOB_CHOICE_IT it1; //first choices BLOB_CHOICE_IT it2; //second choices - for (ptr = string; - *ptr != '\0'; ptr++, blob_it.forward (), choice_it.forward ()) { - if ((*ptr == '-' || *ptr == '~') && - (*(ptr + 1) == '-' || *(ptr + 1) == '~') && + for (i = 0, offset = 0; str[offset] != '\0'; + offset += choice->lengths()[i++], + blob_it.forward (), choice_it.forward ()) { + if ((str[offset] == '-' || str[offset] == '~') && + (str[offset + choice->lengths()[i]] == '-' || + str[offset + choice->lengths()[i]] == '~') && (blob_it.data ()->bounding_box ().right () >= blob_it.data_relative (1)->bounding_box ().left ())) { - *ptr = '-'; //turn to single hyphen - strcpy (ptr + 1, ptr + 2); //shuffle up + str[offset] = '-'; //turn to single hyphen + strcpy (str + offset + choice->lengths()[i], + str + offset + choice->lengths()[i] + + choice->lengths()[i + 1]); //shuffle up + strcpy ((char*) choice->lengths().string() + i + 1, + choice->lengths().string() + i + 2); merge_blobs (blob_it.data (), blob_it.data_relative (1)); blob_it.forward (); delete blob_it.extract (); //get rid of spare @@ -1249,11 +1288,9 @@ void choice_dump_tester( //dump chars in word it.set_to_list (ratings); for (it.mark_cycle_pt (); !it.cycled_list (); it.forward 
()) { blob_choice = it.data (); - if ((blob_choice->char_class () >= '!') && - (blob_choice->char_class () <= '~')) - fprintf (choice_file, "\t%c\t%f\t%f", - blob_choice->char_class (), - blob_choice->rating (), blob_choice->certainty ()); + fprintf (choice_file, "\t%s\t%f\t%f", + blob_choice->unichar (), + blob_choice->rating (), blob_choice->certainty ()); } fprintf (choice_file, "\n"); } @@ -1290,33 +1327,37 @@ WERD *make_bln_copy(WERD *src_word, ROW *row, float x_height, DENORM *denorm) { } -ACCEPTABLE_WERD_TYPE acceptable_word_string(const char *s) { +ACCEPTABLE_WERD_TYPE acceptable_word_string(const char *s, + const char *lengths) { int i = 0; + int offset = 0; int leading_punct_count; int upper_count = 0; int hyphen_pos = -1; ACCEPTABLE_WERD_TYPE word_type = AC_UNACCEPTABLE; - if (strlen (s) > 20) + if (strlen (lengths) > 20) return word_type; /* Single Leading punctuation char*/ - if ((s[i] != '\0') && (STRING (chs_leading_punct).contains (s[i]))) - i++; + if ((s[offset] != '\0') && (STRING (chs_leading_punct).contains (s[offset]))) + offset += lengths[i++]; leading_punct_count = i; /* Initial cap */ - while (isupper (s[i])) { - i++; + while ((s[offset] != '\0') && + unicharset.get_isupper(s + offset, lengths[i])) { + offset += lengths[i++]; upper_count++; } if (upper_count > 1) word_type = AC_UPPER_CASE; else { /* Lower case word, possibly with an initial cap */ - while (islower (s[i])) { - i++; + while ((s[offset] != '\0') && + unicharset.get_islower (s + offset, lengths[i])) { + offset += lengths[i++]; } if (i - leading_punct_count < quality_min_initial_alphas_reqd) goto not_a_word; @@ -1324,11 +1365,13 @@ ACCEPTABLE_WERD_TYPE acceptable_word_string(const char *s) { Allow a single hyphen in a lower case word - dont trust upper case - I've seen several cases of "H" -> "I-I" */ - if (s[i] == '-') { - hyphen_pos = i++; - if (s[i] != '\0') { - while (islower (s[i])) { - i++; + if (lengths[i] == 1 && s[offset] == '-') { + hyphen_pos = i; + offset += 
lengths[i++]; + if (s[offset] != '\0') { + while ((s[offset] != '\0') && + unicharset.get_islower(s + offset, lengths[i])) { + offset += lengths[i++]; } if (i < hyphen_pos + 3) goto not_a_word; @@ -1336,8 +1379,11 @@ ACCEPTABLE_WERD_TYPE acceptable_word_string(const char *s) { } else { /* Allow "'s" in NON hyphenated lower case words */ - if ((s[i] == '\'') && (s[i + 1] == 's')) - i += 2; + if (lengths[i] == 1 && (s[offset] == '\'') && + lengths[i + 1] == 1 && (s[offset + lengths[i]] == 's')) { + offset += lengths[i++]; + offset += lengths[i++]; + } } if (upper_count > 0) word_type = AC_INITIAL_CAP; @@ -1346,13 +1392,15 @@ ACCEPTABLE_WERD_TYPE acceptable_word_string(const char *s) { } /* Up to two different, constrained trailing punctuation chars */ - if ((s[i] != '\0') && (STRING (chs_trailing_punct1).contains (s[i]))) - i++; - if ((s[i] != '\0') && - (s[i - 1] != s[i]) && (STRING (chs_trailing_punct2).contains (s[i]))) - i++; - - if (s[i] != '\0') + if (lengths[i] == 1 && (s[offset] != '\0') && + (STRING (chs_trailing_punct1).contains (s[offset]))) + offset += lengths[i++]; + if (lengths[i] == 1 && (s[offset] != '\0') && i > 0 && + (s[offset - lengths[i - 1]] != s[offset]) && + (STRING (chs_trailing_punct2).contains (s[offset]))) + offset += lengths[i++]; + + if (s[offset] != '\0') word_type = AC_UNACCEPTABLE; not_a_word: @@ -1360,17 +1408,26 @@ ACCEPTABLE_WERD_TYPE acceptable_word_string(const char *s) { if (word_type == AC_UNACCEPTABLE) { /* Look for abbreviation string */ i = 0; - if (isupper (s[0])) { + offset = 0; + if (s[0] != '\0' && unicharset.get_isupper (s, lengths[0])) { word_type = AC_UC_ABBREV; - while ((s[i] != '\0') && isupper (s[i]) && (s[i + 1] == '.')) - i += 2; + while ((s[offset] != '\0') && + unicharset.get_isupper(s + offset, lengths[i]) && + (lengths[i + 1] == 1 && s[offset + lengths[i]] == '.')) { + offset += lengths[i++]; + offset += lengths[i++]; + } } - else if (islower (s[0])) { + else if (s[0] != '\0' && unicharset.get_islower (s, 
lengths[0])) { word_type = AC_LC_ABBREV; - while ((s[i] != '\0') && islower (s[i]) && (s[i + 1] == '.')) - i += 2; + while ((s[offset] != '\0') && + unicharset.get_islower(s + offset, lengths[i]) && + (lengths[i + 1] == 1 && s[offset + lengths[i]] == '.')) { + offset += lengths[i++]; + offset += lengths[i++]; + } } - if (s[i] != '\0') + if (s[offset] != '\0') word_type = AC_UNACCEPTABLE; } @@ -1478,7 +1535,8 @@ void set_word_fonts( //good chars in word WERD_RES *word, //word to adapt to //detailed results BLOB_CHOICE_LIST_CLIST *blob_choices) { INT32 index; //char index - char choice_char; //char from word + INT32 offset; //char offset + char choice_char[UNICHAR_LEN + 1]; //char from word INT8 config; //font of char //character iterator BLOB_CHOICE_LIST_C_IT char_it = blob_choices; @@ -1517,16 +1575,19 @@ void set_word_fonts( //good chars in word word->italic = 0; word->bold = 0; - for (char_it.mark_cycle_pt (), index = 0; - !char_it.cycled_list (); char_it.forward (), index++) { - choice_char = word->best_choice->string ()[index]; + for (char_it.mark_cycle_pt (), index = 0, offset = 0; + !char_it.cycled_list (); char_it.forward (), + offset += word->best_choice->lengths()[index++]) { + strncpy(choice_char, word->best_choice->string ().string() + offset, + word->best_choice->lengths()[index]); + choice_char[word->best_choice->lengths()[index]] = '\0'; choice_it.set_to_list (char_it.data ()); for (choice_it.mark_cycle_pt (); !choice_it.cycled_list (); - choice_it.forward ()) { - if (choice_it.data ()->char_class () == choice_char) { + choice_it.forward ()) { + if (strcmp(choice_it.data ()->unichar (), choice_char) == 0) { config = choice_it.data ()->config (); if (tessedit_debug_fonts) - tprintf ("%c(%d=%d%c%c)", + tprintf ("%s(%d=%d%c%c)", choice_char, config, (config & 31) >> 2, config & 2 ? 'N' : 'B', config & 1 ? 
'N' : 'I'); if (config != -1) { diff --git a/ccmain/tfacep.h b/ccmain/tfacep.h index 6bcdd2ff1a02eafc18b219381711f1d58f1fe9dc..9013414ee10c14f57e5e859a5d934e38cc4cb4e8 100644 --- a/ccmain/tfacep.h +++ b/ccmain/tfacep.h @@ -46,66 +46,8 @@ typedef void (*TESS_TESTER) (TBLOB *, BOOL8, char *, INT32, LIST); typedef LIST (*TESS_MATCHER) (TBLOB *, TBLOB *, TBLOB *, void *, TEXTROW *); -extern "C" -{ - /* - int start_recog( //Real main in C - int argc, - char *argv[]); - void program_editup2( //afterforking part - int argc, - char** argv); - - int end_recog( //Real main in C - int argc, - char *argv[]); - void set_interactive_pass(); - void set_pass1(); - void set_pass2(); - //ARRAY cc_recog(TWERD*,TESS_CHOICE*,TESS_CHOICE*,TESS_TESTER, - // TESS_TESTER);*/ - //void wo_learn_blob(TBLOB*,TEXTROW*,char*,INT32); - //LIST AdaptiveClassifier(TBLOB*,TBLOB*,TEXTROW*); - //void LearnBlob(TBLOB*,TEXTROW*,char*,INT32); - //TWERD *newword(); - //TBLOB *newblob(); - //TESSLINE *newoutline(); - //EDGEPT *newedgept(); - //void oldedgept(EDGEPT*); - //void destroy_nodes(void*,void (*)(void*)); - //TESS_LIST *append_choice(TESS_LIST*,char*,double,double,char); - //void fix_quotes (char*); - //void record_certainty(double,int); - //int AcceptableResult(A_CHOICE*,A_CHOICE*); - //int AdaptableWord(TWERD*,const char*,const char*); - //void delete_word(TWERD*); - //void free_blob(TBLOB*); - //void add_document_word(A_CHOICE*); - //void AdaptToWord(TWERD*,TEXTROW*,const char*,const char*,const char*); - //void SaveBadWord(const char*,double); - //void free_choice(TESS_CHOICE*); - //TWERD *newword(); - //TBLOB *newblob(); - //void free_blob( //free a blob - // TBLOB *blob); //blob to free - - //int dict_word( const char* ); - - //extern int tess_cn_matching; - //extern int tess_bn_matching; - //extern int last_word_on_line; - extern TEXTROW normalized_row; - //extern TESS_MATCHER blob_matchers[]; - //extern FILE *rawfile; - //extern FILE *textfile; - //extern int character_count; - //extern int 
word_count; - //extern int enable_assoc; - //extern int chop_enable; - //extern int permute_only_top; - extern int display_ratings; - -}; +extern TEXTROW normalized_row; +extern int display_ratings; #if 0 #define strsave(s) \ diff --git a/ccutil/tessopt.cpp b/ccutil/tessopt.cpp index 6046710ad07df356a42d6c4b8c4e039163c2d542..cd980df2b550d62acc69bf8dce427c22575c5b4c 100644 --- a/ccutil/tessopt.cpp +++ b/ccutil/tessopt.cpp @@ -23,8 +23,8 @@ #include "tessopt.h" #include "notdll.h" //must be last include -int optind; -char *optarg; +int tessoptind; +char *tessoptarg; /********************************************************************** * tessopt @@ -37,22 +37,22 @@ INT32 argc, //arg count char *argv[], //args const char *arglist //string of arg chars ) { - char *arg; //arg char + const char *arg; //arg char - if (optind == 0) - optind = 1; - if (optind < argc && argv[optind][0] == '-') { - arg = strchr (arglist, argv[optind][1]); + if (tessoptind == 0) + tessoptind = 1; + if (tessoptind < argc && argv[tessoptind][0] == '-') { + arg = strchr (arglist, argv[tessoptind][1]); if (arg == NULL || *arg == ':') return '?'; //dud option - optind++; - optarg = argv[optind]; + tessoptind++; + tessoptarg = argv[tessoptind]; if (arg[1] == ':') { - if (argv[optind - 1][2] != '\0') + if (argv[tessoptind - 1][2] != '\0') //immediately after - optarg = argv[optind - 1] + 2; + tessoptarg = argv[tessoptind - 1] + 2; else - optind++; + tessoptind++; } return *arg; } diff --git a/ccutil/tessopt.h b/ccutil/tessopt.h index 9cdd956794a9ef981baffca53528688649349901..149873c74a88bed4d50e6556837330b56d5dc779 100644 --- a/ccutil/tessopt.h +++ b/ccutil/tessopt.h @@ -20,8 +20,8 @@ #include "host.h" #include "notdll.h" //must be last include -extern int optind; -extern char *optarg; +extern int tessoptind; +extern char *tessoptarg; int tessopt ( //parse args INT32 argc, //arg count diff --git a/classify/extern.h b/classify/extern.h index 
cfae28c72a055b670ef5f5615dcfcd68db6ce625..2cdc78670eed6635da1124c73d18eae6a5ddb5ff 100644 --- a/classify/extern.h +++ b/classify/extern.h @@ -28,7 +28,7 @@ */ #ifdef __cplusplus -#define EXTERN extern "C" +#define EXTERN extern #else #define EXTERN extern #endif diff --git a/cutil/globals.h b/cutil/globals.h index f737a5dffe0ca446ce5ff518b2a3ec42eab80f4a..97abb05a7db01441e30be5fac94567adf72d3830 100644 --- a/cutil/globals.h +++ b/cutil/globals.h @@ -39,14 +39,12 @@ extern TBLOB *pageblobs; /*first blob on page */ extern TEXTBLOCK *pageblocks; /*first block on page */ /*class definitions */ -extern char classes[CLASSIZE][CLASSLENGTH]; +/* extern char classes[CLASSIZE][CLASSLENGTH]; */ extern int resolution; /*scanner res in dpi */ extern int acts[MAXPROC]; /*action flags */ extern int debugs[MAXPROC]; /*debug flags */ extern int plots[MAXPROC]; /*plot flags */ extern int corners[4]; /*corners of scan window */ -extern int optind; /*option index */ -extern char *optarg; /*option argument */ /*image file name */ extern char imagefile[FILENAMESIZE]; /* main directory */ diff --git a/training/cnTraining.cpp b/training/cnTraining.cpp index 8933adfbaffd8266f84a1a4beef1ccd95c0b0b98..9e57286eb53ba9428af330e5345986a57fddda18 100644 --- a/training/cnTraining.cpp +++ b/training/cnTraining.cpp @@ -37,6 +37,7 @@ #include #include #include +#include "unichar.h" #define MAXNAMESIZE 80 #define MAX_NUM_SAMPLES 10000 @@ -219,21 +220,34 @@ int main ( ParseArguments (argc, argv); while ((PageName = GetNextFilename()) != NULL) { - printf ("\nReading %s ...", PageName); + printf ("Reading %s ...\n", PageName); TrainingPage = Efopen (PageName, "r"); ReadTrainingSamples (TrainingPage, &CharList); fclose (TrainingPage); //WriteTrainingSamples (Directory, CharList); } + printf("Clustering ...\n"); pCharList = CharList; iterate(pCharList) { - //Cluster - CharSample = (LABELEDLIST) first_node (pCharList); - printf ("\nClustering %s ...", CharSample->Label); - Clusterer = 
SetUpForClustering(CharSample); - ProtoList = ClusterSamples(Clusterer, &Config); - AddToNormProtosList(&NormProtoList, ProtoList, CharSample->Label); + //Cluster + CharSample = (LABELEDLIST) first_node (pCharList); + //printf ("\nClustering %s ...", CharSample->Label); + Clusterer = SetUpForClustering(CharSample); + float SavedMinSamples = Config.MinSamples; + while (Config.MinSamples > 0.001) { + ProtoList = ClusterSamples(Clusterer, &Config); + if (NumberOfProtos(ProtoList, 1, 0) > 0) + break; + else { + Config.MinSamples *= 0.95; + printf("0 significant protos for %s." + " Retrying clustering with MinSamples = %f%%\n", + CharSample->Label, Config.MinSamples); + } + } + Config.MinSamples = SavedMinSamples; + AddToNormProtosList(&NormProtoList, ProtoList, CharSample->Label); } FreeTrainingSamples (CharList); WriteNormProtos (Directory, NormProtoList, Clusterer); @@ -262,7 +276,7 @@ void ParseArguments( ** ShowSignificantProtos flag controlling proto display ** ShowInsignificantProtos flag controlling proto display ** Config current clustering parameters -** optarg, optind defined by tessopt sys call +** tessoptarg, tessoptind defined by tessopt sys call ** Argc, Argv global copies of argc and argv ** Operation: ** This routine parses the command line arguments that were @@ -287,7 +301,6 @@ void ParseArguments( int Option; int ParametersRead; BOOL8 Error; - extern char *optarg; Error = FALSE; Argc = argc; @@ -297,48 +310,48 @@ void ParseArguments( switch ( Option ) { case 'n': - sscanf(optarg,"%d", &ParametersRead); + sscanf(tessoptarg,"%d", &ParametersRead); ShowInsignificantProtos = ParametersRead; break; case 'p': - sscanf(optarg,"%d", &ParametersRead); + sscanf(tessoptarg,"%d", &ParametersRead); ShowSignificantProtos = ParametersRead; break; case 'd': ShowAllSamples = FALSE; break; case 'C': - ParametersRead = sscanf( optarg, "%lf", &(Config.Confidence) ); + ParametersRead = sscanf( tessoptarg, "%lf", &(Config.Confidence) ); if ( ParametersRead != 1 ) Error = 
TRUE; else if ( Config.Confidence > 1 ) Config.Confidence = 1; else if ( Config.Confidence < 0 ) Config.Confidence = 0; break; case 'I': - ParametersRead = sscanf( optarg, "%f", &(Config.Independence) ); + ParametersRead = sscanf( tessoptarg, "%f", &(Config.Independence) ); if ( ParametersRead != 1 ) Error = TRUE; else if ( Config.Independence > 1 ) Config.Independence = 1; else if ( Config.Independence < 0 ) Config.Independence = 0; break; case 'M': - ParametersRead = sscanf( optarg, "%f", &(Config.MinSamples) ); + ParametersRead = sscanf( tessoptarg, "%f", &(Config.MinSamples) ); if ( ParametersRead != 1 ) Error = TRUE; else if ( Config.MinSamples > 1 ) Config.MinSamples = 1; else if ( Config.MinSamples < 0 ) Config.MinSamples = 0; break; case 'B': - ParametersRead = sscanf( optarg, "%f", &(Config.MaxIllegal) ); + ParametersRead = sscanf( tessoptarg, "%f", &(Config.MaxIllegal) ); if ( ParametersRead != 1 ) Error = TRUE; else if ( Config.MaxIllegal > 1 ) Config.MaxIllegal = 1; else if ( Config.MaxIllegal < 0 ) Config.MaxIllegal = 0; break; case 'R': - ParametersRead = sscanf( optarg, "%f", &RoundingAccuracy ); + ParametersRead = sscanf( tessoptarg, "%f", &RoundingAccuracy ); if ( ParametersRead != 1 ) Error = TRUE; else if ( RoundingAccuracy > 0.01 ) RoundingAccuracy = 0.01; else if ( RoundingAccuracy < 0.0 ) RoundingAccuracy = 0.0; break; case 'S': - switch ( optarg[0] ) + switch ( tessoptarg[0] ) { case 's': Config.ProtoStyle = spherical; break; case 'e': Config.ProtoStyle = elliptical; break; @@ -348,10 +361,10 @@ void ParseArguments( } break; case 'D': - Directory = optarg; + Directory = tessoptarg; break; case 'N': - if (sscanf (optarg, "%d", &MaxNumSamples) != 1 || + if (sscanf (tessoptarg, "%d", &MaxNumSamples) != 1 || MaxNumSamples <= 0) Error = TRUE; break; @@ -375,7 +388,7 @@ char *GetNextFilename () /* ** Parameters: none ** Globals: -** optind defined by tessopt sys call +** tessoptind defined by tessopt sys call ** Argc, Argv global copies of argc and 
argv ** Operation: ** This routine returns the next command line argument. If @@ -388,8 +401,8 @@ char *GetNextFilename () */ { - if (optind < Argc) - return (Argv [optind++]); + if (tessoptind < Argc) + return (Argv [tessoptind++]); else return (NULL); @@ -417,32 +430,32 @@ void ReadTrainingSamples ( */ { - char CharName[MAXNAMESIZE]; + char unichar[UNICHAR_LEN + 1]; LABELEDLIST CharSample; FEATURE_SET FeatureSamples; - CHAR_DESC CharDesc; - int Type, i; - - while (fscanf (File, "%s %s", FontName, CharName) == 2) { - CharSample = FindList (*TrainingSamples, CharName); - if (CharSample == NULL) { - CharSample = NewLabeledList (CharName); - *TrainingSamples = push (*TrainingSamples, CharSample); - } - CharDesc = ReadCharDescription (File); - Type = ShortNameToFeatureType(PROGRAM_FEATURE_TYPE); - FeatureSamples = FeaturesOfType(CharDesc, Type); - for (int feature = 0; feature < FeatureSamples->NumFeatures; ++feature) { - FEATURE f = FeatureSamples->Features[feature]; - for (int dim =0; dim < f->Type->NumParams; ++dim) - f->Params[dim] += UniformRandomNumber(-MINSD, MINSD); - } - CharSample->List = push (CharSample->List, FeatureSamples); - for (i = 0; i < NumFeatureSetsIn (CharDesc); i++) - if (Type != i) - FreeFeatureSet (FeaturesOfType (CharDesc, i)); - free (CharDesc); - } + CHAR_DESC CharDesc; + int Type, i; + + while (fscanf (File, "%s %s", FontName, unichar) == 2) { + CharSample = FindList (*TrainingSamples, unichar); + if (CharSample == NULL) { + CharSample = NewLabeledList (unichar); + *TrainingSamples = push (*TrainingSamples, CharSample); + } + CharDesc = ReadCharDescription (File); + Type = ShortNameToFeatureType(PROGRAM_FEATURE_TYPE); + FeatureSamples = FeaturesOfType(CharDesc, Type); + for (int feature = 0; feature < FeatureSamples->NumFeatures; ++feature) { + FEATURE f = FeatureSamples->Features[feature]; + for (int dim =0; dim < f->Type->NumParams; ++dim) + f->Params[dim] += UniformRandomNumber(-MINSD, MINSD); + } + CharSample->List = push 
(CharSample->List, FeatureSamples); + for (i = 0; i < NumFeatureSetsIn (CharDesc); i++) + if (Type != i) + FreeFeatureSet (FeaturesOfType (CharDesc, i)); + free (CharDesc); + } } // ReadTrainingSamples /*---------------------------------------------------------------------------*/ @@ -606,7 +619,6 @@ void WriteNormProtos ( char Filename[MAXNAMESIZE]; LABELEDLIST LabeledProto; int N; - char Label; strcpy (Filename, ""); if (Directory != NULL) @@ -623,9 +635,17 @@ void WriteNormProtos ( { LabeledProto = (LABELEDLIST) first_node (LabeledProtoList); N = NumberOfProtos(LabeledProto->List, - ShowSignificantProtos, ShowInsignificantProtos); - Label = NameToChar(LabeledProto->Label); - fprintf(File, "\n%c %d\n", Label, N); + ShowSignificantProtos, ShowInsignificantProtos); + if (N < 1) { + printf ("\nError! Not enough protos for %s: %d protos" + " (%d significant protos" + ", %d insignificant protos)\n", + LabeledProto->Label, N, + NumberOfProtos(LabeledProto->List, 1, 0), + NumberOfProtos(LabeledProto->List, 0, 1)); + exit(1); + } + fprintf(File, "\n%s %d\n", LabeledProto->Label, N); WriteProtos(File, Clusterer->SampleSize, LabeledProto->List, ShowSignificantProtos, ShowInsignificantProtos); } diff --git a/training/mfTraining.cpp b/training/mfTraining.cpp index c956774bae2eb249d5fb81a2ea3c669aa483db72..0e72ad5e25f39b1f12ed5bf908ec2df8ebf4785d 100644 --- a/training/mfTraining.cpp +++ b/training/mfTraining.cpp @@ -44,6 +44,9 @@ #include "intproto.h" #include "variables.h" #include "freelist.h" +#include "efio.h" +#include "danerror.h" +#include "globals.h" #include #include @@ -73,7 +76,6 @@ typedef MERGE_CLASS_NODE* MERGE_CLASS; #define round(x,frag)(floor(x/frag+.5)*frag) - /**---------------------------------------------------------------------------- Public Function Prototypes ----------------------------------------------------------------------------**/ @@ -164,21 +166,7 @@ void Normalize ( void SetUpForFloat2Int( LIST LabeledClassList); -void 
WritePFFMTable(INT_TEMPLATES Templates, const char* filename) { - FILE* fp = Efopen(filename, "wb"); - /* then write out each class */ - for (int i = 0; i < NumClassesIn (Templates); i++) { - int MaxLength = 0; - INT_CLASS Class = ClassForIndex (Templates, i); - for (int ConfigId = 0; ConfigId < NumIntConfigsIn (Class); ConfigId++) { - if (LengthForConfigId (Class, ConfigId) > MaxLength) - MaxLength = LengthForConfigId (Class, ConfigId); - } - fprintf(fp, "%c %d\n", ClassIdForIndex(Templates, i), MaxLength); - } - fclose(fp); -} - +void WritePFFMTable(INT_TEMPLATES Templates, const char* filename); //--------------Global Data Definitions and Declarations-------------- static char FontName[MAXNAMESIZE]; @@ -200,6 +188,9 @@ static CLUSTERCONFIG Config = static FLOAT32 RoundingAccuracy = 0.0; +// The unicharset used during mftraining +static UNICHARSET unicharset_mftraining; + /*---------------------------------------------------------------------------- Public Code -----------------------------------------------------------------------------*/ @@ -260,12 +251,17 @@ int main ( LIST pCharList, pProtoList; char Filename[MAXNAMESIZE]; + // Clean the unichar set + unicharset_mftraining.clear(); + // Space character needed to represent NIL classification + unicharset_mftraining.unichar_insert(" "); + ParseArguments (argc, argv); InitFastTrainerVars (); InitSubfeatureVars (); while ((PageName = GetNextFilename()) != NULL) { - printf ("\nReading %s ...", PageName); + printf ("Reading %s ...\n", PageName); TrainingPage = Efopen (PageName, "r"); CharList = ReadTrainingSamples (TrainingPage); fclose (TrainingPage); @@ -275,7 +271,7 @@ int main ( { //Cluster CharSample = (LABELEDLIST) first_node (pCharList); - printf ("\nClustering %s ...", CharSample->Label); +// printf ("\nClustering %s ...", CharSample->Label); Clusterer = SetUpForClustering(CharSample); ProtoList = ClusterSamples(Clusterer, &Config); //WriteClusteredTrainingSamples (Directory, ProtoList, Clusterer, 
CharSample); @@ -320,14 +316,13 @@ int main ( FreeProtoList (&ProtoList); } FreeTrainingSamples (CharList); - printf ("\n"); } //WriteMergedTrainingSamples(Directory,ClassList); WriteMicrofeat(Directory, ClassList); InitIntProtoVars (); InitPrototypes (); SetUpForFloat2Int(ClassList); - IntTemplates = CreateIntTemplates(TrainingData); + IntTemplates = CreateIntTemplates(TrainingData, unicharset_mftraining); strcpy (Filename, ""); if (Directory != NULL) { @@ -340,11 +335,18 @@ int main ( #else OutFile = Efopen (Filename, "wb"); #endif - WriteIntTemplates(OutFile, IntTemplates); + WriteIntTemplates(OutFile, IntTemplates, unicharset_mftraining); fclose (OutFile); - // Now create pffmtable. - WritePFFMTable(IntTemplates, "pffmtable"); - printf ("\nDone!\n"); /**/ + strcpy (Filename, ""); + if (Directory != NULL) + { + strcat (Filename, Directory); + strcat (Filename, "/"); + } + strcat (Filename, "pffmtable"); + // Now create pffmtable. + WritePFFMTable(IntTemplates, Filename); + printf ("Done!\n"); /**/ FreeLabeledClassList (ClassList); return 0; } /* main */ @@ -367,7 +369,7 @@ char **argv) ** ShowSignificantProtos flag controlling proto display ** ShowInsignificantProtos flag controlling proto display ** Config current clustering parameters -** optarg, optind defined by tessopt sys call +** tessoptarg, tessoptind defined by tessopt sys call ** Argc, Argv global copies of argc and argv ** Operation: ** This routine parses the command line arguments that were @@ -392,7 +394,6 @@ char **argv) int Option; int ParametersRead; BOOL8 Error; - extern char *optarg; Error = FALSE; Argc = argc; @@ -411,37 +412,37 @@ char **argv) ShowAllSamples = FALSE; break; case 'C': - ParametersRead = sscanf( optarg, "%lf", &(Config.Confidence) ); + ParametersRead = sscanf( tessoptarg, "%lf", &(Config.Confidence) ); if ( ParametersRead != 1 ) Error = TRUE; else if ( Config.Confidence > 1 ) Config.Confidence = 1; else if ( Config.Confidence < 0 ) Config.Confidence = 0; break; case 'I': - 
ParametersRead = sscanf( optarg, "%f", &(Config.Independence) ); + ParametersRead = sscanf( tessoptarg, "%f", &(Config.Independence) ); if ( ParametersRead != 1 ) Error = TRUE; else if ( Config.Independence > 1 ) Config.Independence = 1; else if ( Config.Independence < 0 ) Config.Independence = 0; break; case 'M': - ParametersRead = sscanf( optarg, "%f", &(Config.MinSamples) ); + ParametersRead = sscanf( tessoptarg, "%f", &(Config.MinSamples) ); if ( ParametersRead != 1 ) Error = TRUE; else if ( Config.MinSamples > 1 ) Config.MinSamples = 1; else if ( Config.MinSamples < 0 ) Config.MinSamples = 0; break; case 'B': - ParametersRead = sscanf( optarg, "%f", &(Config.MaxIllegal) ); + ParametersRead = sscanf( tessoptarg, "%f", &(Config.MaxIllegal) ); if ( ParametersRead != 1 ) Error = TRUE; else if ( Config.MaxIllegal > 1 ) Config.MaxIllegal = 1; else if ( Config.MaxIllegal < 0 ) Config.MaxIllegal = 0; break; case 'R': - ParametersRead = sscanf( optarg, "%f", &RoundingAccuracy ); + ParametersRead = sscanf( tessoptarg, "%f", &RoundingAccuracy ); if ( ParametersRead != 1 ) Error = TRUE; else if ( RoundingAccuracy > 0.01 ) RoundingAccuracy = 0.01; else if ( RoundingAccuracy < 0.0 ) RoundingAccuracy = 0.0; break; case 'S': - switch ( optarg[0] ) + switch ( tessoptarg[0] ) { case 's': Config.ProtoStyle = spherical; break; case 'e': Config.ProtoStyle = elliptical; break; @@ -451,10 +452,10 @@ char **argv) } break; case 'D': - Directory = optarg; + Directory = tessoptarg; break; case 'N': - if (sscanf (optarg, "%d", &MaxNumSamples) != 1 || + if (sscanf (tessoptarg, "%d", &MaxNumSamples) != 1 || MaxNumSamples <= 0) Error = TRUE; break; @@ -478,7 +479,7 @@ char *GetNextFilename () /* ** Parameters: none ** Globals: -** optind defined by tessopt sys call +** tessoptind defined by tessopt sys call ** Argc, Argv global copies of argc and argv ** Operation: ** This routine returns the next command line argument. 
If @@ -491,8 +492,8 @@ char *GetNextFilename () */ { - if (optind < Argc) - return (Argv [optind++]); + if (tessoptind < Argc) + return (Argv [tessoptind++]); else return (NULL); @@ -519,33 +520,41 @@ LIST ReadTrainingSamples ( */ { - char CharName[MAXNAMESIZE]; - LABELEDLIST CharSample; - FEATURE_SET FeatureSamples; + char unichar[UNICHAR_LEN + 1]; + LABELEDLIST CharSample; + FEATURE_SET FeatureSamples; LIST TrainingSamples = NIL; CHAR_DESC CharDesc; int Type, i; - while (fscanf (File, "%s %s", FontName, CharName) == 2) { - CharSample = FindList (TrainingSamples, CharName); + while (fscanf (File, "%s %s", FontName, unichar) == 2) { + if (!unicharset_mftraining.contains_unichar(unichar)) { + unicharset_mftraining.unichar_insert(unichar); + if (unicharset_mftraining.size() > MAX_NUM_CLASSES) { + cprintf("Error: Size of unicharset of mftraining is " + "greater than MAX_NUM_CLASSES\n"); + exit(1); + } + } + CharSample = FindList (TrainingSamples, unichar); if (CharSample == NULL) { - CharSample = NewLabeledList (CharName); + CharSample = NewLabeledList (unichar); TrainingSamples = push (TrainingSamples, CharSample); } CharDesc = ReadCharDescription (File); Type = ShortNameToFeatureType(PROGRAM_FEATURE_TYPE); FeatureSamples = FeaturesOfType(CharDesc, Type); - for (int feature = 0; feature < FeatureSamples->NumFeatures; ++feature) { - FEATURE f = FeatureSamples->Features[feature]; - for (int dim =0; dim < f->Type->NumParams; ++dim) - f->Params[dim] += UniformRandomNumber(-MINSD, MINSD); - } + for (int feature = 0; feature < FeatureSamples->NumFeatures; ++feature) { + FEATURE f = FeatureSamples->Features[feature]; + for (int dim =0; dim < f->Type->NumParams; ++dim) + f->Params[dim] += UniformRandomNumber(-MINSD, MINSD); + } CharSample->List = push (CharSample->List, FeatureSamples); for (i = 0; i < NumFeatureSetsIn (CharDesc); i++) - if (Type != i) - FreeFeatureSet (FeaturesOfType (CharDesc, i)); + if (Type != i) + FreeFeatureSet (FeaturesOfType (CharDesc, i)); free 
(CharDesc); - } + } return (TrainingSamples); } /* ReadTrainingSamples */ @@ -843,7 +852,7 @@ void WriteProtos( int i; PROTO Proto; - fprintf(File, "%c\n", NameToChar(MergeClass->Label)); + fprintf(File, "%s\n", MergeClass->Label); fprintf(File, "%d\n", NumProtosIn(MergeClass->Class)); for(i=0; i < NumProtosIn(MergeClass->Class); i++) { @@ -900,7 +909,7 @@ void FreeTrainingSamples ( LIST FeatureList; - printf ("\nFreeTrainingSamples..."); +// printf ("FreeTrainingSamples...\n"); iterate (CharList) /* iterate thru all of the fonts */ { CharSample = (LABELEDLIST) first_node (CharList); @@ -1161,12 +1170,13 @@ void SetUpForFloat2Int( BIT_VECTOR NewConfig; BIT_VECTOR OldConfig; - printf("Float2Int ..."); +// printf("Float2Int ...\n"); iterate(LabeledClassList) { MergeClass = (MERGE_CLASS) first_node (LabeledClassList); - Class = &TrainingData[NameToChar(MergeClass->Label)]; + Class = &TrainingData[unicharset_mftraining.unichar_to_id( + MergeClass->Label)]; NumProtos = NumProtosIn(MergeClass->Class); NumConfigs = NumConfigsIn(MergeClass->Class); @@ -1204,3 +1214,20 @@ void SetUpForFloat2Int( } } } // SetUpForFloat2Int + +/*--------------------------------------------------------------------------*/ +void WritePFFMTable(INT_TEMPLATES Templates, const char* filename) { + FILE* fp = Efopen(filename, "wb"); + /* then write out each class */ + for (int i = 0; i < NumClassesIn (Templates); i++) { + int MaxLength = 0; + INT_CLASS Class = ClassForIndex (Templates, i); + for (int ConfigId = 0; ConfigId < NumIntConfigsIn (Class); ConfigId++) { + if (LengthForConfigId (Class, ConfigId) > MaxLength) + MaxLength = LengthForConfigId (Class, ConfigId); + } + fprintf(fp, "%s %d\n", unicharset_mftraining.id_to_unichar( + ClassIdForIndex(Templates, i)), MaxLength); + } + fclose(fp); +} // WritePFFMTable diff --git a/training/unicharset_extractor.cpp b/training/unicharset_extractor.cpp index f5a6b4aed54392bf195d9254c1643f136fcde57d..eabd82ba6dcbe720536f4dd52cb16fde2580745b 100644 --- 
a/training/unicharset_extractor.cpp +++ b/training/unicharset_extractor.cpp @@ -52,8 +52,8 @@ int main(int argc, char** argv) { while ((option = tessopt(argc, argv, "D" )) != EOF) { switch (option) { case 'D': - output_directory = optarg; - ++optind; + output_directory = tessoptarg; + ++tessoptind; break; } } @@ -64,12 +64,12 @@ int main(int argc, char** argv) { unicharset_file_name += kUnicharsetFileName; // Load box files - for (; optind < argc; ++optind) { - printf("Extracting unicharset from %s\n", argv[optind]); + for (; tessoptind < argc; ++tessoptind) { + printf("Extracting unicharset from %s\n", argv[tessoptind]); - FILE* box_file = fopen(argv[optind], "r"); + FILE* box_file = fopen(argv[tessoptind], "r"); if (box_file == NULL) { - printf("Cannot open box file %s\n", argv[optind]); + printf("Cannot open box file %s\n", argv[tessoptind]); return -1; }