提交 f382fb56 编写于 作者: T theraysmith

Fixed various internationalization issues, mostly for training

git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@106 d0cd1f9f-072b-0410-8dd7-cf729c803f20
上级 100942d7
......@@ -31,6 +31,7 @@ what measures we are interested in.
#include <assert.h>
#include <errno.h>
#endif
#include "boxread.h"
#include "mainblk.h"
#include "genblob.h"
#include "fixxht.h"
......@@ -207,43 +208,20 @@ void clear_any_old_text( //remove correct text
BOOL8 read_next_box(FILE* box_file, //
BOX *box,
UNICHAR_ID *uch_id) {
char buff[256]; //boxfile read buffer
char *buffptr = buff;
STRING box_filename;
static INT16 line = 0;
INT32 x_min;
INT32 y_min;
INT32 x_max;
INT32 y_max;
INT32 count = 0;
char uch[256];
while (!feof (box_file)) {
fgets (buff, sizeof (buff) - 1, box_file);
line++;
buffptr = buff;
const unsigned char *ubuf = reinterpret_cast<const unsigned char*>(buffptr);
if (ubuf[0] == 0xef && ubuf[1] == 0xbb && ubuf[2] == 0xbf)
buffptr += 3; // Skip unicode file designation.
/* Check for blank lines in box file */
while (isspace (*buffptr))
buffptr++;
if (*buffptr != '\0') {
count =
sscanf (buffptr,
"%s " INT32FORMAT " " INT32FORMAT " " INT32FORMAT " "
INT32FORMAT, uch, &x_min, &y_min, &x_max, &y_max);
if (count != 5) {
tprintf ("Box file format error on line %i ignored\n", line);
}
else {
int x_min;
int y_min;
int x_max;
int y_max;
char uch[kBufSize];
while (read_next_box(box_file, uch, &x_min, &y_min, &x_max, &y_max)) {
if (!unicharset_boxes.contains_unichar(uch))
{
unicharset_boxes.unichar_insert(uch);
if (unicharset_boxes.size() > MAX_NUM_CLASSES) {
tprintf("Error: Size of unicharset of boxes is \
greater than MAX_NUM_CLASSES\n");
tprintf("Error: Size of unicharset of boxes is "
"greater than MAX_NUM_CLASSES (%d)\n",
MAX_NUM_CLASSES);
exit(1);
}
}
......@@ -251,8 +229,6 @@ greater than MAX_NUM_CLASSES\n");
*box = BOX (ICOORD (x_min, y_min), ICOORD (x_max, y_max));
return TRUE; //read a box ok
}
}
}
return FALSE; //EOF
}
......
......@@ -857,7 +857,7 @@ void write_shm_text( //write output
lineend ? OCR_NL_NEWLINE : OCR_NL_NONE);
} else {
for (int suboffset = 0; suboffset < text_lengths[index]; ++suboffset)
ocr_append_char (text[offset + suboffset],
ocr_append_char (static_cast<unsigned char>(text[offset+suboffset]),
blob_box.left (), blob_box.right (),
page_image.get_ysize () - 1 - blob_box.top (),
page_image.get_ysize () - 1 - blob_box.bottom (),
......
/**********************************************************************
* File: boxread.cpp
* Description: Read data from a box file.
* Author: Ray Smith
* Created: Fri Aug 24 17:47:23 PDT 2007
*
* (C) Copyright 2007, Google Inc.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#include "mfcpch.h"
#include <string.h>
#include "boxread.h"
#include "unichar.h"
#include "tprintf.h"
bool read_next_box(FILE* box_file, char* utf8_str,
int* x_min, int* y_min, int* x_max, int* y_max) {
static int line = 0;
int count = 0;
char buff[kBufSize]; //boxfile read buffer
char uch[kBufSize];
char *buffptr = buff;
while (!feof(box_file)) {
fgets(buff, sizeof(buff) - 1, box_file);
line++;
buffptr = buff;
const unsigned char *ubuf = reinterpret_cast<const unsigned char*>(buffptr);
if (ubuf[0] == 0xef && ubuf[1] == 0xbb && ubuf[2] == 0xbf)
buffptr += 3; // Skip unicode file designation.
/* Check for blank lines in box file */
while (*buffptr == ' ' || *buffptr == '\t')
buffptr++;
if (*buffptr != '\0') {
count = sscanf(buffptr, "%s " INT32FORMAT " " INT32FORMAT " "
INT32FORMAT " " INT32FORMAT,
uch, x_min, y_min, x_max, y_max);
if (count == 5) {
// Validate UTF8 by making unichars with it.
int used = 0;
int uch_len = strlen(uch);
while (used < uch_len) {
UNICHAR ch(uch + used, uch_len - used);
int new_used = ch.utf8_len();
if (new_used == 0) {
tprintf("Bad utf-8 char starting with 0x%x at line %d, col %d, \n",
uch[used], used + 1, line);
count = 0;
break;
}
used += new_used;
}
if (uch_len > UNICHAR_LEN) {
tprintf("utf-8 string too long at line %d\n", line);
count = 0;
}
}
if (count != 5) {
tprintf("Box file format error on line %i ignored\n", line);
} else {
strcpy(utf8_str, uch);
return true; //read a box ok
}
}
}
fclose(box_file);
line = 0;
return false; //EOF
}
/**********************************************************************
* File: boxread.cpp
* Description: Read data from a box file.
* Author: Ray Smith
* Created: Fri Aug 24 17:47:23 PDT 2007
*
* (C) Copyright 2007, Google Inc.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#ifndef THIRD_PARTY_TESSERACT_CCUTIL_BOXREAD_H__
#define THIRD_PARTY_TESSERACT_CCUTIL_BOXREAD_H__
#include <stdio.h>
// Minimum length of the utf8_str buffer that callers pass to read_next_box.
const int kBufSize = 256;
// read_next_box factors out the code to interpret a line of a box
// file so that applybox and unicharset_extractor interpret it the same way.
// This function returns the next valid box file utf-8 string and coords
// and returns true, or false on eof (and closes the file).
// It ignores the utf-8 file signature, checks for valid utf-8 and allows
// space or tab between fields.
// utf8_str must be at least kBufSize in length.
bool read_next_box(FILE* box_file, char* utf8_str,
int* x_min, int* y_min, int* x_max, int* y_max);
#endif // THIRD_PARTY_TESSERACT_CCUTIL_BOXREAD_H__
......@@ -134,8 +134,8 @@ int UNICHAR::utf8_step(const char* utf8_str) {
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4,0,0,0,0,0,0,0,0
};
......
......@@ -24,7 +24,7 @@
// Maximum number of characters that can be stored in a UNICHAR. Must be
// at least 4. Must not exceed 31 without changing the coding of length.
#define UNICHAR_LEN 4
#define UNICHAR_LEN 8
// A UNICHAR_ID is the unique id of a unichar.
typedef int UNICHAR_ID;
......
......@@ -123,8 +123,11 @@ INT32 def_letter_is_okay(EDGE_ARRAY dawg,
const char *ptr;
for (ptr = word; *ptr != '\0';) {
word_single_lengths += UNICHAR::utf8_step(ptr);
ptr += UNICHAR::utf8_step(ptr);
int step = UNICHAR::utf8_step(ptr);
if (step == 0)
return FALSE;
word_single_lengths += step;
ptr += step;
}
if (*node == NO_EDGE) { /* Trailing punctuation */
......@@ -175,9 +178,10 @@ INT32 def_letter_is_okay(EDGE_ARRAY dawg,
if (case_sensative || case_is_okay (dummy_word, char_index)) {
//next_node (dawg, edge);
*node = next_node(dawg, edge);
if (*node == 0)
*node = NO_EDGE;
return (TRUE);
}
else {
} else {
return (FALSE);
}
}
......
......@@ -43,7 +43,7 @@
----------------------------------------------------------------------*/
#define FREQ_WERD 1.0
#define GOOD_WERD 1.1
#define OK_WERD 1.25
#define OK_WERD 1.3125
#define MAX_FREQ_EDGES 1500
#define NO_RATING -1
......
......@@ -24,14 +24,63 @@
// unichar per line.
#include <stdio.h>
/*
** Include automatically generated configuration file if running autoconf
*/
#ifdef HAVE_CONFIG_H
#include "config_auto.h"
#endif
#if defined(HAVE_WCHAR_T) || defined(__MSW32__) || defined(GOOGLE3)
#include <wchar.h>
#include <wctype.h>
#define USING_WCTYPE
#endif
#include "unichar.h"
#include "unicharset.h"
#include "strngs.h"
#include "boxread.h"
#include "tessopt.h"
static const char* const kUnicharsetFileName = "unicharset";
// Set character properties using wctype if we have it.
// Contributed by piggy@gmail.com.
// Modified by Ray to use UNICHAR for unicode conversion
// and to check for wctype using autoconf/presence of windows.
//
// Looks up the id of c_string in unicharset, then walks the utf-8 string one
// code point at a time and sets the alpha/lower/upper/digit flags on that id
// according to the wctype classification of each code point. Scanning stops
// at the first invalid utf-8 sequence. When USING_WCTYPE is not defined the
// function does nothing.
void set_properties(UNICHARSET *unicharset, const char* const c_string) {
#ifdef USING_WCTYPE
  // Convert the string to a unichar id.
  UNICHAR_ID id = unicharset->unichar_to_id(c_string);
  int len = strlen(c_string);
  int step = 0;
  for (int offset = 0; offset < len; offset += step) {
    step = UNICHAR::utf8_step(c_string + offset);
    if (step == 0)
      break;  // Invalid utf-8.
    // Get the next Unicode code point in the string.
    UNICHAR ch(c_string + offset, step);
    int wc = ch.first_uni();
    /* Copy the properties. */
    if (iswalpha(wc)) {
      unicharset->set_isalpha(id, 1);
      if (iswlower(wc))
        unicharset->set_islower(id, 1);
      // Upper case must be tested with iswupper; testing iswlower here
      // would flag lower-case letters as upper and miss real upper case.
      if (iswupper(wc))
        unicharset->set_isupper(id, 1);
    }
    if (iswdigit(wc))
      unicharset->set_isdigit(id, 1);
  }
#endif
}
int main(int argc, char** argv) {
int option;
const char* output_directory = ".";
......@@ -73,18 +122,12 @@ int main(int argc, char** argv) {
return -1;
}
while (!feof(box_file)) {
int x_min, y_min, x_max, y_max;
char buffer[256];
char c_string[256];
fgets(buffer, sizeof (buffer), box_file);
sscanf(buffer, "%s %d %d %d %d",
c_string, &x_min, &y_min, &x_max, &y_max);
char c_string[kBufSize];
while (read_next_box(box_file, c_string, &x_min, &y_min, &x_max, &y_max)) {
unicharset.unichar_insert(c_string);
set_properties(&unicharset, c_string);
}
fclose(box_file);
}
// Write unicharset file
......
......@@ -96,6 +96,11 @@ void best_first_search(CHUNKS_RECORD *chunks_record,
save_best_state(chunks_record);
#endif
start_recording();
FLOAT32 worst_priority = 2.0f * prioritize_state(chunks_record,
the_search,
best_state);
if (worst_priority < worst_state)
worst_priority = worst_state;
guided_state = *state;
do {
......@@ -119,7 +124,7 @@ void best_first_search(CHUNKS_RECORD *chunks_record,
break;
}
expand_node(chunks_record, the_search);
expand_node(worst_priority, chunks_record, the_search);
}
free_state (the_search->this_state);
......@@ -372,7 +377,8 @@ CHOICES_LIST rebuild_current_state(TBLOB *blobs,
* each one has not already been visited. If not add it to the priority
* queue.
**********************************************************************/
void expand_node(CHUNKS_RECORD *chunks_record, SEARCH_RECORD *the_search) {
void expand_node(FLOAT32 worst_priority,
CHUNKS_RECORD *chunks_record, SEARCH_RECORD *the_search) {
STATE old_state;
int x;
int mask = 1 << (the_search->num_joints - 1 - 32);
......@@ -383,8 +389,8 @@ void expand_node(CHUNKS_RECORD *chunks_record, SEARCH_RECORD *the_search) {
for (x = the_search->num_joints; x > 32; x--) {
the_search->this_state->part1 = mask ^ old_state.part1;
if (!hash_lookup (the_search->closed_states, the_search->this_state))
push_queue (the_search->open_states,
the_search->this_state,
push_queue (the_search->open_states, the_search->this_state,
worst_priority,
prioritize_state (chunks_record, the_search, &old_state));
mask >>= 1;
}
......@@ -399,8 +405,8 @@ void expand_node(CHUNKS_RECORD *chunks_record, SEARCH_RECORD *the_search) {
while (x--) {
the_search->this_state->part2 = mask ^ old_state.part2;
if (!hash_lookup (the_search->closed_states, the_search->this_state))
push_queue (the_search->open_states,
the_search->this_state,
push_queue (the_search->open_states, the_search->this_state,
worst_priority,
prioritize_state (chunks_record, the_search, &old_state));
mask >>= 1;
}
......@@ -472,10 +478,11 @@ STATE *pop_queue(HEAP *queue) {
*
* Add this state into the priority queue.
**********************************************************************/
void push_queue(HEAP *queue, STATE *state, FLOAT32 priority) {
void push_queue(HEAP *queue, STATE *state, FLOAT32 worst_priority,
FLOAT32 priority) {
HEAPENTRY entry;
if (SizeOfHeap (queue) < MaxSizeOfHeap (queue) && priority < worst_state) {
if (SizeOfHeap (queue) < MaxSizeOfHeap (queue) && priority < worst_priority) {
entry.Data = (char *) new_state (state);
num_pushed++;
entry.Key = priority;
......
......@@ -112,7 +112,9 @@ CHOICES_LIST rebuild_current_state(TBLOB *blobs,
CHOICES_LIST old_choices,
int fx);
void expand_node(CHUNKS_RECORD *chunks_record, SEARCH_RECORD *the_search);
void expand_node(FLOAT32 worst_priority,
CHUNKS_RECORD *chunks_record,
SEARCH_RECORD *the_search);
SEARCH_RECORD *new_search(CHUNKS_RECORD *chunks_record,
int num_joints,
......@@ -122,7 +124,8 @@ SEARCH_RECORD *new_search(CHUNKS_RECORD *chunks_record,
STATE *pop_queue(HEAP *queue);
void push_queue(HEAP *queue, STATE *state, FLOAT32 priority);
void push_queue(HEAP *queue, STATE *state,
FLOAT32 worst_priority, FLOAT32 priority);
void replace_char_widths(CHUNKS_RECORD *chunks_record, SEARCH_STATE state);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册