Fixed various internationalization issues, mostly for training

git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@106 d0cd1f9f-072b-0410-8dd7-cf729c803f20

Fixed various internationalization issues, mostly for training
git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@106 d0cd1f9f-072b-0410-8dd7-cf729c803f20
f382fb56 · theraysmith · 100942d7 · f382fb56 · f382fb56 · f382fb56
11 changed file
--- a/ccmain/applybox.cpp
+++ b/ccmain/applybox.cpp
@@ -31,6 +31,7 @@ what measures we are interested in.
 #include <assert.h>
 #include <errno.h>
 #endif
+#include "boxread.h"
 #include "mainblk.h"
 #include "genblob.h"
 #include "fixxht.h"
@@ -207,43 +208,20 @@ void clear_any_old_text(                        //remove correct text
 BOOL8 read_next_box(FILE* box_file,  //
                    BOX *box,
                    UNICHAR_ID *uch_id) {
-  char buff[256];                //boxfile read buffer
-  char *buffptr = buff;
-  STRING box_filename;
-  static INT16 line = 0;
-  INT32 x_min;
-  INT32 y_min;
-  INT32 x_max;
-  INT32 y_max;
-  INT32 count = 0;
-  char uch[256];
-
-  while (!feof (box_file)) {
-    fgets (buff, sizeof (buff) - 1, box_file);
-    line++;
-
-    buffptr = buff;
-    const unsigned char *ubuf = reinterpret_cast<const unsigned char*>(buffptr);
-    if (ubuf[0] == 0xef && ubuf[1] == 0xbb && ubuf[2] == 0xbf)
-      buffptr += 3;  // Skip unicode file designation.
-    /* Check for blank lines in box file */
-    while (isspace (*buffptr))
-      buffptr++;
-    if (*buffptr != '\0') {
-      count =
-        sscanf (buffptr,
-        "%s " INT32FORMAT " " INT32FORMAT " " INT32FORMAT " "
-        INT32FORMAT, uch, &x_min, &y_min, &x_max, &y_max);
-      if (count != 5) {
-        tprintf ("Box file format error on line %i ignored\n", line);
-      }
-      else {
+  int x_min;
+  int y_min;
+  int x_max;
+  int y_max;
+  char uch[kBufSize];
+
+  while (read_next_box(box_file, uch, &x_min, &y_min, &x_max, &y_max)) {
    if (!unicharset_boxes.contains_unichar(uch))
    {
      unicharset_boxes.unichar_insert(uch);
      if (unicharset_boxes.size() > MAX_NUM_CLASSES) {
-            tprintf("Error: Size of unicharset of boxes is \
-greater than MAX_NUM_CLASSES\n");
+        tprintf("Error: Size of unicharset of boxes is "
+                "greater than MAX_NUM_CLASSES (%d)\n",
+                MAX_NUM_CLASSES);
        exit(1);
      }
    }
@@ -251,8 +229,6 @@ greater than MAX_NUM_CLASSES\n");
    *box = BOX (ICOORD (x_min, y_min), ICOORD (x_max, y_max));
    return TRUE;             //read a box ok
  }
-    }
-  }
  return FALSE;                  //EOF
 }


--- a/ccmain/output.cpp
+++ b/ccmain/output.cpp
@@ -857,7 +857,7 @@ void write_shm_text(                    //write output
                         lineend ? OCR_NL_NEWLINE : OCR_NL_NONE);
        } else {
          for (int suboffset = 0; suboffset < text_lengths[index]; ++suboffset)
-            ocr_append_char (text[offset + suboffset],
+            ocr_append_char (static_cast<unsigned char>(text[offset+suboffset]),
                             blob_box.left (), blob_box.right (),
                             page_image.get_ysize () - 1 - blob_box.top (),
                             page_image.get_ysize () - 1 - blob_box.bottom (),

--- a/ccutil/boxread.cpp
+++ b/ccutil/boxread.cpp
+/**********************************************************************
+ * File:        boxread.cpp
+ * Description: Read data from a box file.
+ * Author:		Ray Smith
+ * Created:		Fri Aug 24 17:47:23 PDT 2007
+ *
+ * (C) Copyright 2007, Google Inc.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+#include "mfcpch.h"
+#include <string.h>
+#include "boxread.h"
+#include "unichar.h"
+#include "tprintf.h"
+
+// Reads lines from box_file until one parses as a valid box entry.
+// A valid line has the form "<utf8> <x_min> <y_min> <x_max> <y_max>", where
+// <utf8> is well-formed utf-8 of at most UNICHAR_LEN bytes. A utf-8 file
+// signature at the start of a line and leading spaces/tabs are skipped.
+// Blank lines are skipped silently; malformed lines are reported through
+// tprintf and skipped.
+// On success the text is copied into utf8_str (callers provide kBufSize
+// bytes), the coordinates are stored through the int pointers and true is
+// returned. Note that sscanf writes the coordinates before the utf-8 check,
+// so a rejected line may still overwrite them.
+// At end of file, or on a read error, box_file is closed, the line counter
+// is reset to 0 and false is returned.
+// The line counter is a function-local static, so interleaving reads of two
+// box files produces wrong line numbers in the messages.
+bool read_next_box(FILE* box_file, char* utf8_str,
+                   int* x_min, int* y_min, int* x_max, int* y_max) {
+  static int line = 0;
+  char buff[kBufSize];                //boxfile read buffer
+  // uch is as large as buff, so the unbounded %s below cannot overflow it.
+  char uch[kBufSize];
+
+  // Loop on the result of fgets rather than on feof(): feof() only becomes
+  // true after a read has already failed, and a failed fgets leaves buff
+  // holding the previous line, which would then be returned a second time.
+  // Testing fgets also terminates the loop on a read error.
+  while (fgets(buff, sizeof(buff), box_file) != NULL) {
+    line++;
+
+    char *buffptr = buff;
+    const unsigned char *ubuf = reinterpret_cast<const unsigned char*>(buffptr);
+    if (ubuf[0] == 0xef && ubuf[1] == 0xbb && ubuf[2] == 0xbf)
+      buffptr += 3;  // Skip unicode file designation.
+    while (*buffptr == ' ' || *buffptr == '\t')
+      buffptr++;
+    // Only an optional line ending is left: a blank line, not a format error.
+    if (*buffptr == '\0' || *buffptr == '\n' || *buffptr == '\r')
+      continue;
+
+    int count = sscanf(buffptr, "%s " INT32FORMAT " " INT32FORMAT " "
+                       INT32FORMAT " " INT32FORMAT,
+                       uch, x_min, y_min, x_max, y_max);
+    if (count == 5) {
+      // Validate UTF8 by making unichars with it.
+      int used = 0;
+      int uch_len = strlen(uch);
+      while (used < uch_len) {
+        UNICHAR ch(uch + used, uch_len - used);
+        int new_used = ch.utf8_len();
+        if (new_used == 0) {
+          // Print the byte as unsigned so values >= 0x80 are not
+          // sign-extended, and pass the line before the column to match
+          // the order of the format string.
+          tprintf("Bad utf-8 char starting with 0x%x at line %d, col %d, \n",
+                  static_cast<unsigned char>(uch[used]), line, used + 1);
+          count = 0;
+          break;
+        }
+        used += new_used;
+      }
+      if (uch_len > UNICHAR_LEN) {
+        tprintf("utf-8 string too long at line %d\n", line);
+        count = 0;
+      }
+    }
+    if (count != 5) {
+      tprintf("Box file format error on line %i ignored\n", line);
+      continue;
+    }
+    // uch_len <= UNICHAR_LEN was checked above, so this copy is short.
+    strcpy(utf8_str, uch);
+    return true;             //read a box ok
+  }
+  fclose(box_file);
+  line = 0;
+  return false;                  //EOF
+}
--- a/ccutil/boxread.h
+++ b/ccutil/boxread.h
+/**********************************************************************
+ * File:        boxread.h
+ * Description: Read data from a box file.
+ * Author:		Ray Smith
+ * Created:		Fri Aug 24 17:47:23 PDT 2007
+ *
+ * (C) Copyright 2007, Google Inc.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+#ifndef THIRD_PARTY_TESSERACT_CCUTIL_BOXREAD_H__
+#define THIRD_PARTY_TESSERACT_CCUTIL_BOXREAD_H__
+
+#include <stdio.h>
+
+const int kBufSize = 256;
+// read_next_box factors out the code to interpret a line of a box
+// file so that applybox and unicharset_extractor interpret the same way.
+// This function returns the next valid box file utf8 string and coords
+// and returns true, or false on eof (and closes the file).
+// It ignores the utf8 file signature, checks for valid utf-8 and allows
+// space or tab between fields.
+// utf8_str must be at least kBufSize in length.
+bool read_next_box(FILE* box_file, char* utf8_str,
+                   int* x_min, int* y_min, int* x_max, int* y_max);
+
+#endif  // THIRD_PARTY_TESSERACT_CCUTIL_BOXREAD_H__
+
--- a/ccutil/unichar.cpp
+++ b/ccutil/unichar.cpp
@@ -134,8 +134,8 @@ int UNICHAR::utf8_step(const char* utf8_str) {
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4,0,0,0,0,0,0,0,0
  };

--- a/ccutil/unichar.h
+++ b/ccutil/unichar.h
@@ -24,7 +24,7 @@

 // Maximum number of characters that can be stored in a UNICHAR. Must be
 // at least 4. Must not exceed 31 without changing the coding of length.
-#define UNICHAR_LEN 4
+#define UNICHAR_LEN 8

 // A UNICHAR_ID is the unique id of a unichar.
 typedef int UNICHAR_ID;

--- a/dict/dawg.cpp
+++ b/dict/dawg.cpp
@@ -123,8 +123,11 @@ INT32 def_letter_is_okay(EDGE_ARRAY dawg,
  const char *ptr;

  for (ptr = word; *ptr != '\0';) {
-    word_single_lengths += UNICHAR::utf8_step(ptr);
-    ptr += UNICHAR::utf8_step(ptr);
+    int step = UNICHAR::utf8_step(ptr);
+    if (step == 0)
+      return FALSE;
+    word_single_lengths += step;
+    ptr += step;
  }

  if (*node == NO_EDGE) {        /* Trailing punctuation */
@@ -175,9 +178,10 @@ INT32 def_letter_is_okay(EDGE_ARRAY dawg,
    if (case_sensative || case_is_okay (dummy_word, char_index)) {
                                 //next_node (dawg, edge);
      *node = next_node(dawg, edge);
+      if (*node == 0)
+        *node = NO_EDGE;
      return (TRUE);
-    }
-    else {
+    } else {
      return (FALSE);
    }
  }

--- a/dict/permdawg.cpp
+++ b/dict/permdawg.cpp
@@ -43,7 +43,7 @@
 ----------------------------------------------------------------------*/
 #define FREQ_WERD     1.0
 #define GOOD_WERD     1.1
-#define OK_WERD       1.25
+#define OK_WERD       1.3125
 #define MAX_FREQ_EDGES    1500
 #define NO_RATING              -1


--- a/training/unicharset_extractor.cpp
+++ b/training/unicharset_extractor.cpp
@@ -24,14 +24,63 @@
 // unichar per line.

 #include <stdio.h>
+/*
+** Include automatically generated configuration file if running autoconf
+*/
+#ifdef HAVE_CONFIG_H
+#include "config_auto.h"
+#endif
+#if defined(HAVE_WCHAR_T) || defined(__MSW32__) || defined(GOOGLE3)
+#include <wchar.h>
+#include <wctype.h>
+#define USING_WCTYPE
+#endif

 #include "unichar.h"
 #include "unicharset.h"
 #include "strngs.h"
+#include "boxread.h"
 #include "tessopt.h"

 static const char* const kUnicharsetFileName = "unicharset";

+// Set character properties using wctype if we have it.
+// Contributed by piggy@gmail.com.
+// Modified by Ray to use UNICHAR for unicode conversion
+// and to check for wctype using autoconf/presence of windows.
+// Every code point of c_string may call set_isalpha/islower/isupper/isdigit
+// with 1 on the string's unichar id. No-op when USING_WCTYPE is undefined.
+void set_properties(UNICHARSET *unicharset, const char* const c_string) {
+#ifdef USING_WCTYPE
+  // Convert the string to a unichar id.
+  UNICHAR_ID id = unicharset->unichar_to_id(c_string);
+
+  int step = 0;
+  int len = strlen(c_string);
+  for (int offset = 0; offset < len; offset += step) {
+    step = UNICHAR::utf8_step(c_string + offset);
+    if (step == 0)
+      break; // Invalid utf-8.
+
+    // Get the next Unicode code point in the string.
+    UNICHAR ch(c_string + offset, step);
+    int wc = ch.first_uni();
+
+    /* Copy the properties. */
+    if (iswalpha(wc)) {
+      unicharset->set_isalpha(id, 1);
+      if (iswlower(wc))
+        unicharset->set_islower(id, 1);
+      // Must be iswupper: a second iswlower test flagged lower case as upper.
+      if (iswupper(wc))
+        unicharset->set_isupper(id, 1);
+    }
+    if (iswdigit(wc))
+      unicharset->set_isdigit(id, 1);
+  }
+#endif
+}
+
 int main(int argc, char** argv) {
  int option;
  const char* output_directory = ".";
@@ -73,18 +122,12 @@ int main(int argc, char** argv) {
      return -1;
    }

-    while (!feof(box_file)) {
    int x_min, y_min, x_max, y_max;
-      char buffer[256];
-      char c_string[256];
-
-      fgets(buffer, sizeof (buffer), box_file);
-      sscanf(buffer, "%s %d %d %d %d",
-             c_string, &x_min, &y_min, &x_max, &y_max);
-
+    char c_string[kBufSize];
+    while (read_next_box(box_file, c_string, &x_min, &y_min, &x_max, &y_max)) {
      unicharset.unichar_insert(c_string);
+      set_properties(&unicharset, c_string);
    }
-    fclose(box_file);
  }

  // Write unicharset file

--- a/wordrec/bestfirst.cpp
+++ b/wordrec/bestfirst.cpp
@@ -96,6 +96,11 @@ void best_first_search(CHUNKS_RECORD *chunks_record,
  save_best_state(chunks_record);
 #endif
  start_recording();
+  FLOAT32 worst_priority = 2.0f * prioritize_state(chunks_record,
+                                                   the_search,
+                                                   best_state);
+  if (worst_priority < worst_state)
+    worst_priority = worst_state;

  guided_state = *state;
  do {
@@ -119,7 +124,7 @@ void best_first_search(CHUNKS_RECORD *chunks_record,
        break;
      }

-      expand_node(chunks_record, the_search); 
+      expand_node(worst_priority, chunks_record, the_search);
    }

    free_state (the_search->this_state);
@@ -372,7 +377,8 @@ CHOICES_LIST rebuild_current_state(TBLOB *blobs,
 * each one has not already been visited.  If not add it to the priority
 * queue.
 **********************************************************************/
-void expand_node(CHUNKS_RECORD *chunks_record, SEARCH_RECORD *the_search) { 
+void expand_node(FLOAT32 worst_priority,
+                 CHUNKS_RECORD *chunks_record, SEARCH_RECORD *the_search) {
  STATE old_state;
  int x;
  int mask = 1 << (the_search->num_joints - 1 - 32);
@@ -383,8 +389,8 @@ void expand_node(CHUNKS_RECORD *chunks_record, SEARCH_RECORD *the_search) {
  for (x = the_search->num_joints; x > 32; x--) {
    the_search->this_state->part1 = mask ^ old_state.part1;
    if (!hash_lookup (the_search->closed_states, the_search->this_state))
-      push_queue (the_search->open_states,
-        the_search->this_state,
+      push_queue (the_search->open_states, the_search->this_state,
+                  worst_priority,
                  prioritize_state (chunks_record, the_search, &old_state));
    mask >>= 1;
  }
@@ -399,8 +405,8 @@ void expand_node(CHUNKS_RECORD *chunks_record, SEARCH_RECORD *the_search) {
  while (x--) {
    the_search->this_state->part2 = mask ^ old_state.part2;
    if (!hash_lookup (the_search->closed_states, the_search->this_state))
-      push_queue (the_search->open_states,
-        the_search->this_state,
+      push_queue (the_search->open_states, the_search->this_state,
+                  worst_priority,
                  prioritize_state (chunks_record, the_search, &old_state));
    mask >>= 1;
  }
@@ -472,10 +478,11 @@ STATE *pop_queue(HEAP *queue) {
 *
 * Add this state into the priority queue.
 **********************************************************************/
-void push_queue(HEAP *queue, STATE *state, FLOAT32 priority) { 
+void push_queue(HEAP *queue, STATE *state, FLOAT32 worst_priority,
+                FLOAT32 priority) {
  HEAPENTRY entry;

-  if (SizeOfHeap (queue) < MaxSizeOfHeap (queue) && priority < worst_state) {
+  if (SizeOfHeap (queue) < MaxSizeOfHeap (queue) && priority < worst_priority) {
    entry.Data = (char *) new_state (state);
    num_pushed++;
    entry.Key = priority;

--- a/wordrec/bestfirst.h
+++ b/wordrec/bestfirst.h
@@ -112,7 +112,9 @@ CHOICES_LIST rebuild_current_state(TBLOB *blobs,
                                   CHOICES_LIST old_choices,
                                   int fx);

-void expand_node(CHUNKS_RECORD *chunks_record, SEARCH_RECORD *the_search); 
+void expand_node(FLOAT32 worst_priority,
+                 CHUNKS_RECORD *chunks_record,
+                 SEARCH_RECORD *the_search);

 SEARCH_RECORD *new_search(CHUNKS_RECORD *chunks_record,
                          int num_joints,
@@ -122,7 +124,8 @@ SEARCH_RECORD *new_search(CHUNKS_RECORD *chunks_record,

 STATE *pop_queue(HEAP *queue);

-void push_queue(HEAP *queue, STATE *state, FLOAT32 priority); 
+void push_queue(HEAP *queue, STATE *state,
+                FLOAT32 worst_priority, FLOAT32 priority);

 void replace_char_widths(CHUNKS_RECORD *chunks_record, SEARCH_STATE state);