提交 f382fb56 编写于 作者: T theraysmith

Fixed various internationalization issues, mostly for training

git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@106 d0cd1f9f-072b-0410-8dd7-cf729c803f20
上级 100942d7
......@@ -31,6 +31,7 @@ what measures we are interested in.
#include <assert.h>
#include <errno.h>
#endif
#include "boxread.h"
#include "mainblk.h"
#include "genblob.h"
#include "fixxht.h"
......@@ -207,51 +208,26 @@ void clear_any_old_text( //remove correct text
BOOL8 read_next_box(FILE* box_file, //
BOX *box,
UNICHAR_ID *uch_id) {
char buff[256]; //boxfile read buffer
char *buffptr = buff;
STRING box_filename;
static INT16 line = 0;
INT32 x_min;
INT32 y_min;
INT32 x_max;
INT32 y_max;
INT32 count = 0;
char uch[256];
while (!feof (box_file)) {
fgets (buff, sizeof (buff) - 1, box_file);
line++;
buffptr = buff;
const unsigned char *ubuf = reinterpret_cast<const unsigned char*>(buffptr);
if (ubuf[0] == 0xef && ubuf[1] == 0xbb && ubuf[2] == 0xbf)
buffptr += 3; // Skip unicode file designation.
/* Check for blank lines in box file */
while (isspace (*buffptr))
buffptr++;
if (*buffptr != '\0') {
count =
sscanf (buffptr,
"%s " INT32FORMAT " " INT32FORMAT " " INT32FORMAT " "
INT32FORMAT, uch, &x_min, &y_min, &x_max, &y_max);
if (count != 5) {
tprintf ("Box file format error on line %i ignored\n", line);
}
else {
if (!unicharset_boxes.contains_unichar(uch))
{
unicharset_boxes.unichar_insert(uch);
if (unicharset_boxes.size() > MAX_NUM_CLASSES) {
tprintf("Error: Size of unicharset of boxes is \
greater than MAX_NUM_CLASSES\n");
exit(1);
}
}
*uch_id = unicharset_boxes.unichar_to_id(uch);
*box = BOX (ICOORD (x_min, y_min), ICOORD (x_max, y_max));
return TRUE; //read a box ok
int x_min;
int y_min;
int x_max;
int y_max;
char uch[kBufSize];
while (read_next_box(box_file, uch, &x_min, &y_min, &x_max, &y_max)) {
if (!unicharset_boxes.contains_unichar(uch))
{
unicharset_boxes.unichar_insert(uch);
if (unicharset_boxes.size() > MAX_NUM_CLASSES) {
tprintf("Error: Size of unicharset of boxes is "
"greater than MAX_NUM_CLASSES (%d)\n",
MAX_NUM_CLASSES);
exit(1);
}
}
*uch_id = unicharset_boxes.unichar_to_id(uch);
*box = BOX (ICOORD (x_min, y_min), ICOORD (x_max, y_max));
return TRUE; //read a box ok
}
return FALSE; //EOF
}
......
......@@ -857,7 +857,7 @@ void write_shm_text( //write output
lineend ? OCR_NL_NEWLINE : OCR_NL_NONE);
} else {
for (int suboffset = 0; suboffset < text_lengths[index]; ++suboffset)
ocr_append_char (text[offset + suboffset],
ocr_append_char (static_cast<unsigned char>(text[offset+suboffset]),
blob_box.left (), blob_box.right (),
page_image.get_ysize () - 1 - blob_box.top (),
page_image.get_ysize () - 1 - blob_box.bottom (),
......
/**********************************************************************
* File: boxread.cpp
* Description: Read data from a box file.
* Author: Ray Smith
* Created: Fri Aug 24 17:47:23 PDT 2007
*
* (C) Copyright 2007, Google Inc.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#include "mfcpch.h"
#include <string.h>
#include "boxread.h"
#include "unichar.h"
#include "tprintf.h"
bool read_next_box(FILE* box_file, char* utf8_str,
int* x_min, int* y_min, int* x_max, int* y_max) {
static int line = 0;
int count = 0;
char buff[kBufSize]; //boxfile read buffer
char uch[kBufSize];
char *buffptr = buff;
while (!feof(box_file)) {
fgets(buff, sizeof(buff) - 1, box_file);
line++;
buffptr = buff;
const unsigned char *ubuf = reinterpret_cast<const unsigned char*>(buffptr);
if (ubuf[0] == 0xef && ubuf[1] == 0xbb && ubuf[2] == 0xbf)
buffptr += 3; // Skip unicode file designation.
/* Check for blank lines in box file */
while (*buffptr == ' ' || *buffptr == '\t')
buffptr++;
if (*buffptr != '\0') {
count = sscanf(buffptr, "%s " INT32FORMAT " " INT32FORMAT " "
INT32FORMAT " " INT32FORMAT,
uch, x_min, y_min, x_max, y_max);
if (count == 5) {
// Validate UTF8 by making unichars with it.
int used = 0;
int uch_len = strlen(uch);
while (used < uch_len) {
UNICHAR ch(uch + used, uch_len - used);
int new_used = ch.utf8_len();
if (new_used == 0) {
tprintf("Bad utf-8 char starting with 0x%x at line %d, col %d, \n",
uch[used], used + 1, line);
count = 0;
break;
}
used += new_used;
}
if (uch_len > UNICHAR_LEN) {
tprintf("utf-8 string too long at line %d\n", line);
count = 0;
}
}
if (count != 5) {
tprintf("Box file format error on line %i ignored\n", line);
} else {
strcpy(utf8_str, uch);
return true; //read a box ok
}
}
}
fclose(box_file);
line = 0;
return false; //EOF
}
/**********************************************************************
* File: boxread.cpp
* Description: Read data from a box file.
* Author: Ray Smith
* Created: Fri Aug 24 17:47:23 PDT 2007
*
* (C) Copyright 2007, Google Inc.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#ifndef THIRD_PARTY_TESSERACT_CCUTIL_BOXREAD_H__
#define THIRD_PARTY_TESSERACT_CCUTIL_BOXREAD_H__
#include <stdio.h>
// Size in bytes of the buffers used to read a box file line, and the
// minimum size of the utf8_str buffer passed to read_next_box.
const int kBufSize = 256;
// read_next_box factors out the code to interpret a line of a box
// file so that applybox and unicharset_extractor interpret it the same way.
// This function returns the next valid box file utf8 string and coords
// and returns true, or false on eof (and closes the file).
// It ignores the utf-8 file signature, checks for valid utf-8 and allows
// space or tab between fields.
// utf8_str must be at least kBufSize in length.
bool read_next_box(FILE* box_file, char* utf8_str,
int* x_min, int* y_min, int* x_max, int* y_max);
#endif // THIRD_PARTY_TESSERACT_CCUTIL_BOXREAD_H__
......@@ -134,8 +134,8 @@ int UNICHAR::utf8_step(const char* utf8_str) {
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4,0,0,0,0,0,0,0,0
};
......
......@@ -24,7 +24,7 @@
// Maximum number of characters that can be stored in a UNICHAR. Must be
// at least 4. Must not exceed 31 without changing the coding of length.
#define UNICHAR_LEN 4
#define UNICHAR_LEN 8
// A UNICHAR_ID is the unique id of a unichar.
typedef int UNICHAR_ID;
......
......@@ -123,8 +123,11 @@ INT32 def_letter_is_okay(EDGE_ARRAY dawg,
const char *ptr;
for (ptr = word; *ptr != '\0';) {
word_single_lengths += UNICHAR::utf8_step(ptr);
ptr += UNICHAR::utf8_step(ptr);
int step = UNICHAR::utf8_step(ptr);
if (step == 0)
return FALSE;
word_single_lengths += step;
ptr += step;
}
if (*node == NO_EDGE) { /* Trailing punctuation */
......@@ -174,10 +177,11 @@ INT32 def_letter_is_okay(EDGE_ARRAY dawg,
if (edge != NO_EDGE) { /* Normal edge in DAWG */
if (case_sensative || case_is_okay (dummy_word, char_index)) {
//next_node (dawg, edge);
*node = next_node(dawg, edge);
*node = next_node(dawg, edge);
if (*node == 0)
*node = NO_EDGE;
return (TRUE);
}
else {
} else {
return (FALSE);
}
}
......
......@@ -43,7 +43,7 @@
----------------------------------------------------------------------*/
#define FREQ_WERD 1.0
#define GOOD_WERD 1.1
#define OK_WERD 1.25
#define OK_WERD 1.3125
#define MAX_FREQ_EDGES 1500
#define NO_RATING -1
......
......@@ -24,14 +24,63 @@
// unichar per line.
#include <stdio.h>
/*
** Include automatically generated configuration file if running autoconf
*/
#ifdef HAVE_CONFIG_H
#include "config_auto.h"
#endif
#if defined(HAVE_WCHAR_T) || defined(__MSW32__) || defined(GOOGLE3)
#include <wchar.h>
#include <wctype.h>
#define USING_WCTYPE
#endif
#include "unichar.h"
#include "unicharset.h"
#include "strngs.h"
#include "boxread.h"
#include "tessopt.h"
static const char* const kUnicharsetFileName = "unicharset";
// Set character properties using wctype if we have it.
// Contributed by piggy@gmail.com.
// Modified by Ray to use UNICHAR for unicode conversion
// and to check for wctype using autoconf/presence of windows.
// Looks up the unichar id of c_string in unicharset and, for every Unicode
// code point in c_string, sets the isalpha/islower/isupper/isdigit
// properties of that id according to the wctype classification functions.
// Scanning stops at the first invalid utf-8 sequence.
// The function body is empty unless USING_WCTYPE is defined.
void set_properties(UNICHARSET *unicharset, const char* const c_string) {
#ifdef USING_WCTYPE
  // Convert the string to a unichar id.
  const UNICHAR_ID id = unicharset->unichar_to_id(c_string);
  const int len = strlen(c_string);
  int step = 0;
  for (int offset = 0; offset < len; offset += step) {
    step = UNICHAR::utf8_step(c_string + offset);
    if (step == 0)
      break;  // Invalid utf-8.
    // Get the next Unicode code point in the string.
    UNICHAR ch(c_string + offset, step);
    const int wc = ch.first_uni();
    /* Copy the properties. */
    if (iswalpha(wc)) {
      unicharset->set_isalpha(id, 1);
      if (iswlower(wc))
        unicharset->set_islower(id, 1);
      // Upper case must be tested with iswupper; testing iswlower here
      // would mark every lower-case letter as upper case and never mark a
      // real upper-case letter.
      if (iswupper(wc))
        unicharset->set_isupper(id, 1);
    }
    if (iswdigit(wc))
      unicharset->set_isdigit(id, 1);
  }
#endif
}
int main(int argc, char** argv) {
int option;
const char* output_directory = ".";
......@@ -73,18 +122,12 @@ int main(int argc, char** argv) {
return -1;
}
while (!feof(box_file)) {
int x_min, y_min, x_max, y_max;
char buffer[256];
char c_string[256];
fgets(buffer, sizeof (buffer), box_file);
sscanf(buffer, "%s %d %d %d %d",
c_string, &x_min, &y_min, &x_max, &y_max);
int x_min, y_min, x_max, y_max;
char c_string[kBufSize];
while (read_next_box(box_file, c_string, &x_min, &y_min, &x_max, &y_max)) {
unicharset.unichar_insert(c_string);
set_properties(&unicharset, c_string);
}
fclose(box_file);
}
// Write unicharset file
......
......@@ -41,7 +41,7 @@
#include "structures.h"
#include "wordclass.h"
void call_caller();
void call_caller();
/*----------------------------------------------------------------------
V a r i a b l e s
......@@ -65,9 +65,9 @@ make_float_var (worst_state, 1, make_worst_state,
* Create and initialize references to debug variables that control
* operations in this file.
**********************************************************************/
void init_bestfirst_vars() {
make_seg_states();
make_worst_state();
void init_bestfirst_vars() {
make_seg_states();
make_worst_state();
}
......@@ -93,9 +93,14 @@ void best_first_search(CHUNKS_RECORD *chunks_record,
best_choice, raw_choice, state);
#ifndef GRAPHICS_DISABLED
save_best_state(chunks_record);
save_best_state(chunks_record);
#endif
start_recording();
start_recording();
FLOAT32 worst_priority = 2.0f * prioritize_state(chunks_record,
the_search,
best_state);
if (worst_priority < worst_state)
worst_priority = worst_state;
guided_state = *state;
do {
......@@ -109,7 +114,7 @@ void best_first_search(CHUNKS_RECORD *chunks_record,
guided_state = *(the_search->this_state);
keep_going =
evaluate_state(chunks_record, the_search, fixpt, best_state, pass);
evaluate_state(chunks_record, the_search, fixpt, best_state, pass);
hash_add (the_search->closed_states, the_search->this_state);
......@@ -119,7 +124,7 @@ void best_first_search(CHUNKS_RECORD *chunks_record,
break;
}
expand_node(chunks_record, the_search);
expand_node(worst_priority, chunks_record, the_search);
}
free_state (the_search->this_state);
......@@ -130,8 +135,8 @@ void best_first_search(CHUNKS_RECORD *chunks_record,
state->part1 = the_search->best_state->part1;
state->part2 = the_search->best_state->part2;
stop_recording();
delete_search(the_search);
stop_recording();
delete_search(the_search);
}
......@@ -141,7 +146,7 @@ void best_first_search(CHUNKS_RECORD *chunks_record,
* Return the width of several of the chunks (if they were joined to-
* gether.
**********************************************************************/
int chunks_width(WIDTH_RECORD *width_record, int start_chunk, int last_chunk) {
int chunks_width(WIDTH_RECORD *width_record, int start_chunk, int last_chunk) {
int result = 0;
int x;
......@@ -157,7 +162,7 @@ int chunks_width(WIDTH_RECORD *width_record, int start_chunk, int last_chunk) {
*
* Terminate the current search and free all the memory involved.
**********************************************************************/
void delete_search(SEARCH_RECORD *the_search) {
void delete_search(SEARCH_RECORD *the_search) {
float closeness;
closeness = (the_search->num_joints ?
......@@ -174,7 +179,7 @@ void delete_search(SEARCH_RECORD *the_search) {
free_hash_table (the_search->closed_states);
FreeHeapData (the_search->open_states, (void_dest) free_state);
memfree(the_search);
memfree(the_search);
}
......@@ -204,7 +209,7 @@ CHOICES_LIST evaluate_chunks(CHUNKS_RECORD *chunks_record,
y = x + search_state[i];
if (blob_skip) {
array_free(char_choices);
array_free(char_choices);
return (NULL);
} /* Process one square */
/* Classify if needed */
......@@ -216,7 +221,7 @@ CHOICES_LIST evaluate_chunks(CHUNKS_RECORD *chunks_record,
this_state, best_state, pass, i - 1);
if (this_choice == NIL) {
array_free(char_choices);
array_free(char_choices);
return (NULL);
}
/* Add permuted ratings */
......@@ -256,7 +261,7 @@ INT16 evaluate_state(CHUNKS_RECORD *chunks_record,
chunk_groups = bin_to_chunks (the_search->this_state,
the_search->num_joints);
bin_to_pieces (the_search->this_state, the_search->num_joints, widths);
LogNewSegmentation(widths);
LogNewSegmentation(widths);
rating_limit = class_probability (the_search->best_choice);
......@@ -270,14 +275,14 @@ INT16 evaluate_state(CHUNKS_RECORD *chunks_record,
if (AcceptableChoice (char_choices, the_search->best_choice,
the_search->raw_choice, fixpt))
keep_going = FALSE;
array_free(char_choices);
array_free(char_choices);
}
#ifndef GRAPHICS_DISABLED
if (display_segmentations) {
display_segmentation (chunks_record->chunks, chunk_groups);
if (display_segmentations > 1)
window_wait(segm_window);
window_wait(segm_window);
}
#endif
......@@ -285,12 +290,12 @@ INT16 evaluate_state(CHUNKS_RECORD *chunks_record,
the_search->before_best = the_search->num_states;
the_search->best_state->part1 = the_search->this_state->part1;
the_search->best_state->part2 = the_search->this_state->part2;
replace_char_widths(chunks_record, chunk_groups);
replace_char_widths(chunks_record, chunk_groups);
}
else if (char_choices != NULL)
fixpt->index = -1;
memfree(chunk_groups);
memfree(chunk_groups);
return (keep_going);
}
......@@ -337,7 +342,7 @@ CHOICES_LIST rebuild_current_state(TBLOB *blobs,
array_value (old_choices, x) = NULL;
}
else {
join_pieces(blobs, seam_list, x, y);
join_pieces(blobs, seam_list, x, y);
for (blob = blobs, blobindex = 0, p_blob = NULL; blobindex < x;
blobindex++) {
p_blob = blob;
......@@ -358,8 +363,8 @@ CHOICES_LIST rebuild_current_state(TBLOB *blobs,
x = y - search_state[i];
}
memfree(search_state);
free_all_choices(old_choices, x);
memfree(search_state);
free_all_choices(old_choices, x);
return (char_choices);
}
......@@ -372,7 +377,8 @@ CHOICES_LIST rebuild_current_state(TBLOB *blobs,
* each one has not already been visited. If not add it to the priority
* queue.
**********************************************************************/
void expand_node(CHUNKS_RECORD *chunks_record, SEARCH_RECORD *the_search) {
void expand_node(FLOAT32 worst_priority,
CHUNKS_RECORD *chunks_record, SEARCH_RECORD *the_search) {
STATE old_state;
int x;
int mask = 1 << (the_search->num_joints - 1 - 32);
......@@ -383,9 +389,9 @@ void expand_node(CHUNKS_RECORD *chunks_record, SEARCH_RECORD *the_search) {
for (x = the_search->num_joints; x > 32; x--) {
the_search->this_state->part1 = mask ^ old_state.part1;
if (!hash_lookup (the_search->closed_states, the_search->this_state))
push_queue (the_search->open_states,
the_search->this_state,
prioritize_state (chunks_record, the_search, &old_state));
push_queue (the_search->open_states, the_search->this_state,
worst_priority,
prioritize_state (chunks_record, the_search, &old_state));
mask >>= 1;
}
......@@ -399,9 +405,9 @@ void expand_node(CHUNKS_RECORD *chunks_record, SEARCH_RECORD *the_search) {
while (x--) {
the_search->this_state->part2 = mask ^ old_state.part2;
if (!hash_lookup (the_search->closed_states, the_search->this_state))
push_queue (the_search->open_states,
the_search->this_state,
prioritize_state (chunks_record, the_search, &old_state));
push_queue (the_search->open_states, the_search->this_state,
worst_priority,
prioritize_state (chunks_record, the_search, &old_state));
mask >>= 1;
}
}
......@@ -449,7 +455,7 @@ SEARCH_RECORD *new_search(CHUNKS_RECORD *chunks_record,
* Get this state from the priority queue. It should be the state that
* has the greatest urgency to be evaluated.
**********************************************************************/
STATE *pop_queue(HEAP *queue) {
STATE *pop_queue(HEAP *queue) {
HEAPENTRY entry;
if (GetTopOfHeap (queue, &entry) == OK) {
......@@ -472,14 +478,15 @@ STATE *pop_queue(HEAP *queue) {
*
* Add this state into the priority queue.
**********************************************************************/
void push_queue(HEAP *queue, STATE *state, FLOAT32 priority) {
void push_queue(HEAP *queue, STATE *state, FLOAT32 worst_priority,
FLOAT32 priority) {
HEAPENTRY entry;
if (SizeOfHeap (queue) < MaxSizeOfHeap (queue) && priority < worst_state) {
if (SizeOfHeap (queue) < MaxSizeOfHeap (queue) && priority < worst_priority) {
entry.Data = (char *) new_state (state);
num_pushed++;
entry.Key = priority;
HeapStore(queue, &entry);
HeapStore(queue, &entry);
}
}
......@@ -490,7 +497,7 @@ void push_queue(HEAP *queue, STATE *state, FLOAT32 priority) {
* Replace the value of the char_width field in the chunks_record with
* the updated width measurements from the last_segmentation.
**********************************************************************/
void replace_char_widths(CHUNKS_RECORD *chunks_record, SEARCH_STATE state) {
void replace_char_widths(CHUNKS_RECORD *chunks_record, SEARCH_STATE state) {
WIDTH_RECORD *width_record;
int num_blobs;
int i;
......
......@@ -80,7 +80,7 @@ extern int num_popped;
/*----------------------------------------------------------------------
F u n c t i o n s
----------------------------------------------------------------------*/
void init_bestfirst_vars();
void init_bestfirst_vars();
void best_first_search(CHUNKS_RECORD *chunks_record,
A_CHOICE *best_choice,
......@@ -90,9 +90,9 @@ void best_first_search(CHUNKS_RECORD *chunks_record,
STATE *best_state,
INT32 pass);
int chunks_width(WIDTH_RECORD *width_record, int start_chunk, int last_chunk);
int chunks_width(WIDTH_RECORD *width_record, int start_chunk, int last_chunk);
void delete_search(SEARCH_RECORD *the_search);
void delete_search(SEARCH_RECORD *the_search);
CHOICES_LIST evaluate_chunks(CHUNKS_RECORD *chunks_record,
SEARCH_STATE search_state,
......@@ -112,7 +112,9 @@ CHOICES_LIST rebuild_current_state(TBLOB *blobs,
CHOICES_LIST old_choices,
int fx);
void expand_node(CHUNKS_RECORD *chunks_record, SEARCH_RECORD *the_search);
void expand_node(FLOAT32 worst_priority,
CHUNKS_RECORD *chunks_record,
SEARCH_RECORD *the_search);
SEARCH_RECORD *new_search(CHUNKS_RECORD *chunks_record,
int num_joints,
......@@ -120,11 +122,12 @@ SEARCH_RECORD *new_search(CHUNKS_RECORD *chunks_record,
A_CHOICE *raw_choice,
STATE *state);
STATE *pop_queue(HEAP *queue);
STATE *pop_queue(HEAP *queue);
void push_queue(HEAP *queue, STATE *state, FLOAT32 priority);
void push_queue(HEAP *queue, STATE *state,
FLOAT32 worst_priority, FLOAT32 priority);
void replace_char_widths(CHUNKS_RECORD *chunks_record, SEARCH_STATE state);
void replace_char_widths(CHUNKS_RECORD *chunks_record, SEARCH_STATE state);
/*
#if defined(__STDC__) || defined(__cplusplus)
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册