提交 f382fb56 编写于 作者: T theraysmith

Fixed various internationalization issues, mostly for training

git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@106 d0cd1f9f-072b-0410-8dd7-cf729c803f20
上级 100942d7
...@@ -31,6 +31,7 @@ what measures we are interested in. ...@@ -31,6 +31,7 @@ what measures we are interested in.
#include <assert.h> #include <assert.h>
#include <errno.h> #include <errno.h>
#endif #endif
#include "boxread.h"
#include "mainblk.h" #include "mainblk.h"
#include "genblob.h" #include "genblob.h"
#include "fixxht.h" #include "fixxht.h"
...@@ -207,43 +208,20 @@ void clear_any_old_text( //remove correct text ...@@ -207,43 +208,20 @@ void clear_any_old_text( //remove correct text
BOOL8 read_next_box(FILE* box_file, // BOOL8 read_next_box(FILE* box_file, //
BOX *box, BOX *box,
UNICHAR_ID *uch_id) { UNICHAR_ID *uch_id) {
char buff[256]; //boxfile read buffer int x_min;
char *buffptr = buff; int y_min;
STRING box_filename; int x_max;
static INT16 line = 0; int y_max;
INT32 x_min; char uch[kBufSize];
INT32 y_min;
INT32 x_max; while (read_next_box(box_file, uch, &x_min, &y_min, &x_max, &y_max)) {
INT32 y_max;
INT32 count = 0;
char uch[256];
while (!feof (box_file)) {
fgets (buff, sizeof (buff) - 1, box_file);
line++;
buffptr = buff;
const unsigned char *ubuf = reinterpret_cast<const unsigned char*>(buffptr);
if (ubuf[0] == 0xef && ubuf[1] == 0xbb && ubuf[2] == 0xbf)
buffptr += 3; // Skip unicode file designation.
/* Check for blank lines in box file */
while (isspace (*buffptr))
buffptr++;
if (*buffptr != '\0') {
count =
sscanf (buffptr,
"%s " INT32FORMAT " " INT32FORMAT " " INT32FORMAT " "
INT32FORMAT, uch, &x_min, &y_min, &x_max, &y_max);
if (count != 5) {
tprintf ("Box file format error on line %i ignored\n", line);
}
else {
if (!unicharset_boxes.contains_unichar(uch)) if (!unicharset_boxes.contains_unichar(uch))
{ {
unicharset_boxes.unichar_insert(uch); unicharset_boxes.unichar_insert(uch);
if (unicharset_boxes.size() > MAX_NUM_CLASSES) { if (unicharset_boxes.size() > MAX_NUM_CLASSES) {
tprintf("Error: Size of unicharset of boxes is \ tprintf("Error: Size of unicharset of boxes is "
greater than MAX_NUM_CLASSES\n"); "greater than MAX_NUM_CLASSES (%d)\n",
MAX_NUM_CLASSES);
exit(1); exit(1);
} }
} }
...@@ -251,8 +229,6 @@ greater than MAX_NUM_CLASSES\n"); ...@@ -251,8 +229,6 @@ greater than MAX_NUM_CLASSES\n");
*box = BOX (ICOORD (x_min, y_min), ICOORD (x_max, y_max)); *box = BOX (ICOORD (x_min, y_min), ICOORD (x_max, y_max));
return TRUE; //read a box ok return TRUE; //read a box ok
} }
}
}
return FALSE; //EOF return FALSE; //EOF
} }
......
...@@ -857,7 +857,7 @@ void write_shm_text( //write output ...@@ -857,7 +857,7 @@ void write_shm_text( //write output
lineend ? OCR_NL_NEWLINE : OCR_NL_NONE); lineend ? OCR_NL_NEWLINE : OCR_NL_NONE);
} else { } else {
for (int suboffset = 0; suboffset < text_lengths[index]; ++suboffset) for (int suboffset = 0; suboffset < text_lengths[index]; ++suboffset)
ocr_append_char (text[offset + suboffset], ocr_append_char (static_cast<unsigned char>(text[offset+suboffset]),
blob_box.left (), blob_box.right (), blob_box.left (), blob_box.right (),
page_image.get_ysize () - 1 - blob_box.top (), page_image.get_ysize () - 1 - blob_box.top (),
page_image.get_ysize () - 1 - blob_box.bottom (), page_image.get_ysize () - 1 - blob_box.bottom (),
......
/**********************************************************************
* File: boxread.cpp
* Description: Read data from a box file.
* Author: Ray Smith
* Created: Fri Aug 24 17:47:23 PDT 2007
*
* (C) Copyright 2007, Google Inc.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#include "mfcpch.h"
#include <string.h>
#include "boxread.h"
#include "unichar.h"
#include "tprintf.h"
// Reads lines from box_file until one parses as a valid box entry of the
// form "<utf8 string> <x_min> <y_min> <x_max> <y_max>".
//
// box_file: open box file to read from. It is closed by this function once
//           no further line can be read.
// utf8_str: receives the utf-8 string of the box. The token is staged in a
//           kBufSize local buffer before being copied here.
// x_min, y_min, x_max, y_max: receive the four coordinates of the box.
//
// Returns true when a box was read, false when the file is exhausted.
// Lines that fail to parse, contain invalid utf-8 or carry a string longer
// than UNICHAR_LEN bytes are reported through tprintf and skipped.
// The line counter is static so that it persists across calls; it is reset
// to zero when the end of the file is reached.
bool read_next_box(FILE* box_file, char* utf8_str,
                   int* x_min, int* y_min, int* x_max, int* y_max) {
  static int line = 0;
  int count = 0;
  char buff[kBufSize];  // boxfile read buffer
  char uch[kBufSize];
  char *buffptr = buff;

  // Drive the loop from the result of fgets rather than from feof: feof only
  // becomes true after a read has already failed, so testing it first would
  // parse a buffer that fgets never filled, and a read error (which does not
  // set the eof flag) would spin forever.
  while (fgets(buff, sizeof(buff) - 1, box_file) != NULL) {
    line++;

    buffptr = buff;
    const unsigned char *ubuf = reinterpret_cast<const unsigned char*>(buffptr);
    if (ubuf[0] == 0xef && ubuf[1] == 0xbb && ubuf[2] == 0xbf)
      buffptr += 3;  // Skip unicode file designation.
    // Skip leading white space, including the line terminator, so that a
    // blank line ends up at the NUL and is ignored silently instead of being
    // reported as a format error.
    while (*buffptr == ' ' || *buffptr == '\t' ||
           *buffptr == '\r' || *buffptr == '\n')
      buffptr++;
    if (*buffptr != '\0') {
      count = sscanf(buffptr, "%s " INT32FORMAT " " INT32FORMAT " "
                     INT32FORMAT " " INT32FORMAT,
                     uch, x_min, y_min, x_max, y_max);
      if (count == 5) {
        // Validate UTF8 by making unichars with it.
        int used = 0;
        int uch_len = static_cast<int>(strlen(uch));
        while (used < uch_len) {
          UNICHAR ch(uch + used, uch_len - used);
          int new_used = ch.utf8_len();
          if (new_used == 0) {
            // Arguments follow the order of the format: byte, line, column.
            // The byte is widened through unsigned char so that values above
            // 0x7f are not sign-extended when printed with %x.
            tprintf("Bad utf-8 char starting with 0x%x at line %d, col %d, \n",
                    static_cast<unsigned char>(uch[used]), line, used + 1);
            count = 0;
            break;
          }
          used += new_used;
        }
        if (uch_len > UNICHAR_LEN) {
          tprintf("utf-8 string too long at line %d\n", line);
          count = 0;
        }
      }
      if (count != 5) {
        tprintf("Box file format error on line %i ignored\n", line);
      } else {
        strcpy(utf8_str, uch);
        return true;  // read a box ok
      }
    }
  }
  fclose(box_file);
  line = 0;
  return false;  // EOF
}
/**********************************************************************
* File: boxread.h
* Description: Read data from a box file.
* Author: Ray Smith
* Created: Fri Aug 24 17:47:23 PDT 2007
*
* (C) Copyright 2007, Google Inc.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#ifndef THIRD_PARTY_TESSERACT_CCUTIL_BOXREAD_H__
#define THIRD_PARTY_TESSERACT_CCUTIL_BOXREAD_H__
#include <stdio.h>
const int kBufSize = 256;
// read_next_box factors out the code to interpret a line of a box
// file so that applybox and unicharset_extractor interpret it the same way.
// This function returns the next valid box file utf8 string and coords
// and returns true, or false on eof (and closes the file).
// It ignores the utf8 file signature, checks for valid utf-8 and allows
// space or tab between fields.
// utf8_str must be at least kBufSize in length.
bool read_next_box(FILE* box_file, char* utf8_str,
int* x_min, int* y_min, int* x_max, int* y_max);
#endif // THIRD_PARTY_TESSERACT_CCUTIL_BOXREAD_H__
...@@ -134,8 +134,8 @@ int UNICHAR::utf8_step(const char* utf8_str) { ...@@ -134,8 +134,8 @@ int UNICHAR::utf8_step(const char* utf8_str) {
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4,0,0,0,0,0,0,0,0 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4,0,0,0,0,0,0,0,0
}; };
......
...@@ -24,7 +24,7 @@ ...@@ -24,7 +24,7 @@
// Maximum number of characters that can be stored in a UNICHAR. Must be // Maximum number of characters that can be stored in a UNICHAR. Must be
// at least 4. Must not exceed 31 without changing the coding of length. // at least 4. Must not exceed 31 without changing the coding of length.
#define UNICHAR_LEN 4 #define UNICHAR_LEN 8
// A UNICHAR_ID is the unique id of a unichar. // A UNICHAR_ID is the unique id of a unichar.
typedef int UNICHAR_ID; typedef int UNICHAR_ID;
......
...@@ -123,8 +123,11 @@ INT32 def_letter_is_okay(EDGE_ARRAY dawg, ...@@ -123,8 +123,11 @@ INT32 def_letter_is_okay(EDGE_ARRAY dawg,
const char *ptr; const char *ptr;
for (ptr = word; *ptr != '\0';) { for (ptr = word; *ptr != '\0';) {
word_single_lengths += UNICHAR::utf8_step(ptr); int step = UNICHAR::utf8_step(ptr);
ptr += UNICHAR::utf8_step(ptr); if (step == 0)
return FALSE;
word_single_lengths += step;
ptr += step;
} }
if (*node == NO_EDGE) { /* Trailing punctuation */ if (*node == NO_EDGE) { /* Trailing punctuation */
...@@ -175,9 +178,10 @@ INT32 def_letter_is_okay(EDGE_ARRAY dawg, ...@@ -175,9 +178,10 @@ INT32 def_letter_is_okay(EDGE_ARRAY dawg,
if (case_sensative || case_is_okay (dummy_word, char_index)) { if (case_sensative || case_is_okay (dummy_word, char_index)) {
//next_node (dawg, edge); //next_node (dawg, edge);
*node = next_node(dawg, edge); *node = next_node(dawg, edge);
if (*node == 0)
*node = NO_EDGE;
return (TRUE); return (TRUE);
} } else {
else {
return (FALSE); return (FALSE);
} }
} }
......
...@@ -43,7 +43,7 @@ ...@@ -43,7 +43,7 @@
----------------------------------------------------------------------*/ ----------------------------------------------------------------------*/
#define FREQ_WERD 1.0 #define FREQ_WERD 1.0
#define GOOD_WERD 1.1 #define GOOD_WERD 1.1
#define OK_WERD 1.25 #define OK_WERD 1.3125
#define MAX_FREQ_EDGES 1500 #define MAX_FREQ_EDGES 1500
#define NO_RATING -1 #define NO_RATING -1
......
...@@ -24,14 +24,63 @@ ...@@ -24,14 +24,63 @@
// unichar per line. // unichar per line.
#include <stdio.h> #include <stdio.h>
/*
** Include automatically generated configuration file if running autoconf
*/
#ifdef HAVE_CONFIG_H
#include "config_auto.h"
#endif
#if defined(HAVE_WCHAR_T) || defined(__MSW32__) || defined(GOOGLE3)
#include <wchar.h>
#include <wctype.h>
#define USING_WCTYPE
#endif
#include "unichar.h" #include "unichar.h"
#include "unicharset.h" #include "unicharset.h"
#include "strngs.h" #include "strngs.h"
#include "boxread.h"
#include "tessopt.h" #include "tessopt.h"
static const char* const kUnicharsetFileName = "unicharset"; static const char* const kUnicharsetFileName = "unicharset";
// Set character properties using wctype if we have it.
// Contributed by piggy@gmail.com.
// Modified by Ray to use UNICHAR for unicode conversion
// and to check for wctype using autoconf/presence of windows.
//
// unicharset: the set whose entry for c_string gets its property flags set.
// c_string:   utf-8 string of the unichar; it is looked up in unicharset
//             with unichar_to_id.
//
// Every code point found in c_string contributes its properties to the same
// id. Scanning stops at the first invalid utf-8 sequence. When USING_WCTYPE
// is not defined the function does nothing.
void set_properties(UNICHARSET *unicharset, const char* const c_string) {
#ifdef USING_WCTYPE
  UNICHAR_ID id;
  int wc;

  // Convert the string to a unichar id.
  id = unicharset->unichar_to_id(c_string);

  int step = 0;
  int len = strlen(c_string);
  for (int offset = 0; offset < len; offset += step) {
    step = UNICHAR::utf8_step(c_string + offset);
    if (step == 0)
      break;  // Invalid utf-8.

    // Get the next Unicode code point in the string.
    UNICHAR ch(c_string + offset, step);
    wc = ch.first_uni();

    /* Copy the properties. */
    if (iswalpha(wc)) {
      unicharset->set_isalpha(id, 1);
      if (iswlower(wc))
        unicharset->set_islower(id, 1);
      // Upper case needs its own predicate: testing iswlower here would flag
      // every lower-case letter as upper case and never flag a real one.
      if (iswupper(wc))
        unicharset->set_isupper(id, 1);
    }
    if (iswdigit(wc))
      unicharset->set_isdigit(id, 1);
  }
#endif
}
int main(int argc, char** argv) { int main(int argc, char** argv) {
int option; int option;
const char* output_directory = "."; const char* output_directory = ".";
...@@ -73,18 +122,12 @@ int main(int argc, char** argv) { ...@@ -73,18 +122,12 @@ int main(int argc, char** argv) {
return -1; return -1;
} }
while (!feof(box_file)) {
int x_min, y_min, x_max, y_max; int x_min, y_min, x_max, y_max;
char buffer[256]; char c_string[kBufSize];
char c_string[256]; while (read_next_box(box_file, c_string, &x_min, &y_min, &x_max, &y_max)) {
fgets(buffer, sizeof (buffer), box_file);
sscanf(buffer, "%s %d %d %d %d",
c_string, &x_min, &y_min, &x_max, &y_max);
unicharset.unichar_insert(c_string); unicharset.unichar_insert(c_string);
set_properties(&unicharset, c_string);
} }
fclose(box_file);
} }
// Write unicharset file // Write unicharset file
......
...@@ -96,6 +96,11 @@ void best_first_search(CHUNKS_RECORD *chunks_record, ...@@ -96,6 +96,11 @@ void best_first_search(CHUNKS_RECORD *chunks_record,
save_best_state(chunks_record); save_best_state(chunks_record);
#endif #endif
start_recording(); start_recording();
FLOAT32 worst_priority = 2.0f * prioritize_state(chunks_record,
the_search,
best_state);
if (worst_priority < worst_state)
worst_priority = worst_state;
guided_state = *state; guided_state = *state;
do { do {
...@@ -119,7 +124,7 @@ void best_first_search(CHUNKS_RECORD *chunks_record, ...@@ -119,7 +124,7 @@ void best_first_search(CHUNKS_RECORD *chunks_record,
break; break;
} }
expand_node(chunks_record, the_search); expand_node(worst_priority, chunks_record, the_search);
} }
free_state (the_search->this_state); free_state (the_search->this_state);
...@@ -372,7 +377,8 @@ CHOICES_LIST rebuild_current_state(TBLOB *blobs, ...@@ -372,7 +377,8 @@ CHOICES_LIST rebuild_current_state(TBLOB *blobs,
* each one has not already been visited. If not add it to the priority * each one has not already been visited. If not add it to the priority
* queue. * queue.
**********************************************************************/ **********************************************************************/
void expand_node(CHUNKS_RECORD *chunks_record, SEARCH_RECORD *the_search) { void expand_node(FLOAT32 worst_priority,
CHUNKS_RECORD *chunks_record, SEARCH_RECORD *the_search) {
STATE old_state; STATE old_state;
int x; int x;
int mask = 1 << (the_search->num_joints - 1 - 32); int mask = 1 << (the_search->num_joints - 1 - 32);
...@@ -383,8 +389,8 @@ void expand_node(CHUNKS_RECORD *chunks_record, SEARCH_RECORD *the_search) { ...@@ -383,8 +389,8 @@ void expand_node(CHUNKS_RECORD *chunks_record, SEARCH_RECORD *the_search) {
for (x = the_search->num_joints; x > 32; x--) { for (x = the_search->num_joints; x > 32; x--) {
the_search->this_state->part1 = mask ^ old_state.part1; the_search->this_state->part1 = mask ^ old_state.part1;
if (!hash_lookup (the_search->closed_states, the_search->this_state)) if (!hash_lookup (the_search->closed_states, the_search->this_state))
push_queue (the_search->open_states, push_queue (the_search->open_states, the_search->this_state,
the_search->this_state, worst_priority,
prioritize_state (chunks_record, the_search, &old_state)); prioritize_state (chunks_record, the_search, &old_state));
mask >>= 1; mask >>= 1;
} }
...@@ -399,8 +405,8 @@ void expand_node(CHUNKS_RECORD *chunks_record, SEARCH_RECORD *the_search) { ...@@ -399,8 +405,8 @@ void expand_node(CHUNKS_RECORD *chunks_record, SEARCH_RECORD *the_search) {
while (x--) { while (x--) {
the_search->this_state->part2 = mask ^ old_state.part2; the_search->this_state->part2 = mask ^ old_state.part2;
if (!hash_lookup (the_search->closed_states, the_search->this_state)) if (!hash_lookup (the_search->closed_states, the_search->this_state))
push_queue (the_search->open_states, push_queue (the_search->open_states, the_search->this_state,
the_search->this_state, worst_priority,
prioritize_state (chunks_record, the_search, &old_state)); prioritize_state (chunks_record, the_search, &old_state));
mask >>= 1; mask >>= 1;
} }
...@@ -472,10 +478,11 @@ STATE *pop_queue(HEAP *queue) { ...@@ -472,10 +478,11 @@ STATE *pop_queue(HEAP *queue) {
* *
* Add this state into the priority queue. * Add this state into the priority queue.
**********************************************************************/ **********************************************************************/
void push_queue(HEAP *queue, STATE *state, FLOAT32 priority) { void push_queue(HEAP *queue, STATE *state, FLOAT32 worst_priority,
FLOAT32 priority) {
HEAPENTRY entry; HEAPENTRY entry;
if (SizeOfHeap (queue) < MaxSizeOfHeap (queue) && priority < worst_state) { if (SizeOfHeap (queue) < MaxSizeOfHeap (queue) && priority < worst_priority) {
entry.Data = (char *) new_state (state); entry.Data = (char *) new_state (state);
num_pushed++; num_pushed++;
entry.Key = priority; entry.Key = priority;
......
...@@ -112,7 +112,9 @@ CHOICES_LIST rebuild_current_state(TBLOB *blobs, ...@@ -112,7 +112,9 @@ CHOICES_LIST rebuild_current_state(TBLOB *blobs,
CHOICES_LIST old_choices, CHOICES_LIST old_choices,
int fx); int fx);
void expand_node(CHUNKS_RECORD *chunks_record, SEARCH_RECORD *the_search); void expand_node(FLOAT32 worst_priority,
CHUNKS_RECORD *chunks_record,
SEARCH_RECORD *the_search);
SEARCH_RECORD *new_search(CHUNKS_RECORD *chunks_record, SEARCH_RECORD *new_search(CHUNKS_RECORD *chunks_record,
int num_joints, int num_joints,
...@@ -122,7 +124,8 @@ SEARCH_RECORD *new_search(CHUNKS_RECORD *chunks_record, ...@@ -122,7 +124,8 @@ SEARCH_RECORD *new_search(CHUNKS_RECORD *chunks_record,
STATE *pop_queue(HEAP *queue); STATE *pop_queue(HEAP *queue);
void push_queue(HEAP *queue, STATE *state, FLOAT32 priority); void push_queue(HEAP *queue, STATE *state,
FLOAT32 worst_priority, FLOAT32 priority);
void replace_char_widths(CHUNKS_RECORD *chunks_record, SEARCH_STATE state); void replace_char_widths(CHUNKS_RECORD *chunks_record, SEARCH_STATE state);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册