tfacepp.cpp 17.9 KB
Newer Older
T
tmbdev 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66
/**********************************************************************
 * File:        tfacepp.cpp  (Formerly tface++.c)
 * Description: C++ side of the C/C++ Tess/Editor interface.
 * Author:					Ray Smith
 * Created:					Thu Apr 23 15:39:23 BST 1992
 *
 * (C) Copyright 1992, Hewlett-Packard Ltd.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 *
 **********************************************************************/

#include "mfcpch.h"
#ifdef __UNIX__
#include          <assert.h>
#endif
#include          "errcode.h"
#include          "tessarray.h"
//#include                                                      "fxtop.h"
#include          "werd.h"
#include          "tfacep.h"
#include          "tstruct.h"
#include          "tfacepp.h"
#include          "tessvars.h"
#include          "reject.h"

#define EXTERN

EXTERN BOOL_VAR (tessedit_override_permuter, TRUE, "According to dict_word");

static POLY_MATCHER tess_matcher;//current matcher
static POLY_TESTER tess_tester;  //current tester
static POLY_TESTER tess_trainer; //current trainer
static DENORM *tess_denorm;      //current denorm
static WERD *tess_word;          //current word

#define MAX_UNDIVIDED_LENGTH 24
/**********************************************************************
 * recog_word
 *
 * Convert the word to tess form and pass it to the tess segmenter.
 * Convert the output back to editor form.
 **********************************************************************/
WERD_CHOICE *recog_word(                           //recog one word
                        WERD *word,                //word to do
                        DENORM *denorm,            //de-normaliser
                        POLY_MATCHER matcher,      //matcher function
                        POLY_TESTER tester,        //tester function
                        POLY_TESTER trainer,       //trainer function
                        BOOL8 testing,             //true if answer driven
                        WERD_CHOICE *&raw_choice,  //raw result
                                                   //list of blob lists
                        BLOB_CHOICE_LIST_CLIST *blob_choices,
                        WERD *&outword             //bln word output
                       ) {
  // Top-level entry point: produces the best choice (return value), the raw
  // choice (raw_choice) and the output word (outword) for one input word,
  // then checks that their lengths agree and optionally re-labels the
  // permuter from a plain dictionary lookup.
  WERD_CHOICE *word_choice;      //best choice, returned to the caller

  if (!word->blob_list ()->empty ()) {
    word_choice = recog_word_recursive (word, denorm, matcher, tester,
                                        trainer, testing, raw_choice,
                                        blob_choices, outword);
  }
  else {
    // Nothing to recognize: fabricate empty best/raw choices
    // (rating 10.0, certainty -1.0) and a plain poly copy of the word.
    char empty_lengths[] = {0};
    word_choice = new WERD_CHOICE ("", empty_lengths,
                                   10.0f, -1.0f, TOP_CHOICE_PERM);
    raw_choice = new WERD_CHOICE ("", empty_lengths,
                                  10.0f, -1.0f, TOP_CHOICE_PERM);
    outword = word->poly_copy (denorm->row ()->x_height ());
  }

  // Report a mismatch between characters, blobs and choice lists before
  // the hard assertions below stop the program.
  const INT32 char_count = word_choice->lengths ().length ();
  const INT32 blob_count = outword->blob_list ()->length ();
  const INT32 choice_list_count = blob_choices->length ();
  if (char_count != blob_count || char_count != choice_list_count) {
    tprintf
      ("recog_word ASSERT FAIL String:\"%s\"; Strlen=%d; #Blobs=%d; #Choices=%d\n",
      word_choice->string ().string (), char_count,
      blob_count, choice_list_count);
  }
  ASSERT_HOST (word_choice->lengths ().length () ==
    outword->blob_list ()->length ());
  ASSERT_HOST (word_choice->lengths ().length () == blob_choices->length ());

  /* Copy any reject blobs into the outword */
  outword->rej_blob_list ()->deep_copy (word->rej_blob_list ());

  if (tessedit_override_permuter) {
    /* Override the permuter type if a straight dictionary check disagrees. */
    const UINT8 initial_perm = word_choice->permuter ();
    const bool already_dawg = initial_perm == SYSTEM_DAWG_PERM ||
                              initial_perm == FREQ_DAWG_PERM ||
                              initial_perm == USER_DAWG_PERM;
    if (!already_dawg) {
      const UINT8 dict_perm = dict_word (word_choice->string ().string ());
      const bool dict_is_dawg = dict_perm == SYSTEM_DAWG_PERM ||
                                dict_perm == FREQ_DAWG_PERM ||
                                dict_perm == USER_DAWG_PERM;
      // alpha_count is consulted only when the dictionary reported a dawg
      // permuter, exactly as the short-circuit evaluation requires.
      if (dict_is_dawg &&
          alpha_count (word_choice->string ().string (),
                       word_choice->lengths ().string ()) > 0) {
        word_choice->set_permuter (dict_perm);  //Use dict perm
      }
    }
    if (tessedit_rejection_debug && initial_perm != word_choice->permuter ()) {
      tprintf ("Permuter Type Flipped from %d to %d\n",
        initial_perm, word_choice->permuter ());
    }
  }
  assert ((word_choice == NULL) == (raw_choice == NULL));
  return word_choice;
}


/**********************************************************************
 * recog_word_recursive
 *
 * Recognize one word. A word with more than MAX_UNDIVIDED_LENGTH blobs
 * is handed to split_and_recog_word. Otherwise the word is converted to
 * tess form and passed to the tess segmenter, and the output is
 * converted back to editor form.
 **********************************************************************/

WERD_CHOICE *recog_word_recursive(                           //recog one word
                                  WERD *word,                //word to do
                                  DENORM *denorm,            //de-normaliser
                                  POLY_MATCHER matcher,      //matcher function
                                  POLY_TESTER tester,        //tester function
                                  POLY_TESTER trainer,       //trainer function
                                  BOOL8 testing,             //true if answer driven
                                  WERD_CHOICE *&raw_choice,  //raw result
                                                             //list of blob lists
                                  BLOB_CHOICE_LIST_CLIST *blob_choices,
                                  WERD *&outword             //bln word output
                                 ) {
  // Publish the callbacks and context in the file-level statics; the
  // call_* functions in this file read them.
  tess_matcher = matcher;
  tess_tester = testing ? tester : NULL;
  tess_trainer = testing ? trainer : NULL;
  tess_denorm = denorm;
  tess_word = word;

  // Over-long words are not recognized in one piece.
  if (word->blob_list ()->length () > MAX_UNDIVIDED_LENGTH) {
    return split_and_recog_word (word, denorm, matcher, tester, trainer,
                                 testing, raw_choice, blob_choices,
                                 outword);
  }

  STRING text;                   //string converted from tess
  STRING text_lengths;           //per-character lengths of text
  ARRAY ratings;                 //tess results
  A_CHOICE best;                 //best word from tess
  A_CHOICE raw;                  //raw result from tess
  BLOB_CHOICE_LIST_C_IT list_it; //iterator over blob_choices

  last_word_on_line = word->flag (W_EOL) ? TRUE : FALSE;

  // Remember how many lists the caller already had, so only the lists
  // added for this word are compared with the blob count.
  const INT32 lists_before = blob_choices->length ();

  TWERD *tess_form = make_tess_word (word, NULL);
  ratings = cc_recog (tess_form, &best, &raw,
                      testing && tester != NULL,
                      testing && trainer != NULL);

  // Convert back to editor form; fall back to a poly copy on failure.
  outword = make_ed_word (tess_form, word);
  if (outword == NULL) {
    outword = word->poly_copy (denorm->row ()->x_height ());
  }
  delete_word (tess_form);

  const INT32 blob_count = outword->blob_list ()->length ();
  convert_choice_lists (ratings, blob_choices);

  // Raw choice: pad with blanks up to one character per blob.
  text = raw.string;
  text_lengths = raw.lengths;
  while (text_lengths.length () < blob_count) {
    text += " ";
    text_lengths += 1;
  }
  raw_choice = new WERD_CHOICE (text.string (), text_lengths.string (),
                                raw.rating, raw.certainty, raw.permuter);

  // Best choice: discard it if it has more characters than blobs.
  text = best.string;
  text_lengths = best.lengths;
  if (text_lengths.length () > blob_count) {
    tprintf ("recog_word: Discarded long string \"%s\" (%d characters vs %d blobs)\n",
      text.string (), text_lengths.length (), blob_count);
    text = NULL;                 //should never happen
    text_lengths = NULL;
  }

  // The number of choice lists added must match the number of blobs;
  // otherwise reject the string and repair the list of lists.
  if (blob_choices->length () - lists_before != blob_count) {
    text = NULL;                 //force rejection
    text_lengths = NULL;
    tprintf ("recog_word: Choices list len:%d; blob lists len:%d\n",
      blob_choices->length (), blob_count);
    list_it.set_to_list (blob_choices);
    // Too few lists: append empty ones.
    while (blob_choices->length () - lists_before < blob_count) {
      BLOB_CHOICE_LIST *dummy_list = new BLOB_CHOICE_LIST;
      list_it.add_to_end (dummy_list);
      tprintf ("recog_word: Added dummy choice list\n");
    }
    // Too many lists: drop them from the end (should never happen).
    while (blob_choices->length () - lists_before > blob_count) {
      list_it.move_to_last ();
      delete list_it.extract ();
      tprintf ("recog_word: Deleted choice list\n");
    }
  }

  // Pad the (possibly emptied) best string with blanks as well.
  while (text_lengths.length () < blob_count) {
    text += " ";
    text_lengths += 1;
  }

  assert (raw_choice != NULL);

  // The tess strings have been copied into STRINGs; release the originals.
  if (best.string) {
    strfree (best.string);
    strfree (best.lengths);
  }
  if (raw.string) {
    strfree (raw.string);
    strfree (raw.lengths);
  }

  return new WERD_CHOICE (text.string (), text_lengths.string (),
                          best.rating, best.certainty, best.permuter);
}


/**********************************************************************
 * split_and_recog_word
 *
 * Split the word in two at the widest horizontal gap between adjacent
 * blobs, recognize each part with recog_word_recursive, and join the
 * two sets of results into a single answer.
 **********************************************************************/

WERD_CHOICE *split_and_recog_word(                           //recog one word
                                  WERD *word,                //word to do
                                  DENORM *denorm,            //de-normaliser
                                  POLY_MATCHER matcher,      //matcher function
                                  POLY_TESTER tester,        //tester function
                                  POLY_TESTER trainer,       //trainer function
                                  BOOL8 testing,             //true if answer driven
                                  WERD_CHOICE *&raw_choice,  //raw result
                                                             //list of blob lists
                                  BLOB_CHOICE_LIST_CLIST *blob_choices,
                                  WERD *&outword             //bln word output
                                 ) {
  PBLOB_LIST tail_blobs;         //blobs moved into the second word
  PBLOB_IT scan_it;              //walks the copied word's blobs
  PBLOB_IT split_it = &tail_blobs;  //first blob after the widest gap

  // Work on a poly copy so the caller's word is left untouched.
  WERD *first_word = word->poly_copy (denorm->row ()->x_height ());
  scan_it.set_to_list (first_word->blob_list ());

  // Find the widest gap between each blob and its successor.
  float widest_gap = -MAX_INT32;
  while (!scan_it.at_last ()) {
    PBLOB *current = scan_it.data ();
    PBLOB *following = scan_it.data_relative (1);
    float gap = following->bounding_box ().left () -
                current->bounding_box ().right ();
    scan_it.forward ();          //now on the blob after the gap
    if (gap > widest_gap) {
      widest_gap = gap;
      split_it = scan_it;        //save position
    }
  }

  // Move everything from the split position to the last blob into a
  // separate list and wrap it up as a word of its own.
  tail_blobs.assign_to_sublist (&split_it, &scan_it);
  WERD *second_word = new WERD (&tail_blobs, 1, NULL);
  ASSERT_HOST (word->blob_list ()->length () ==
    first_word->blob_list ()->length () +
    second_word->blob_list ()->length ());

  // Recognize the first part straight into the caller's outputs.
  WERD_CHOICE *combined = recog_word_recursive (first_word, denorm, matcher,
                                                tester, trainer, testing,
                                                raw_choice, blob_choices,
                                                outword);
  delete first_word;

  // Recognize the second part into temporaries.
  WERD_CHOICE *tail_raw;         //raw version of 2nd part
  WERD *tail_outword;            //output word of 2nd part
  WERD_CHOICE *tail_result = recog_word_recursive (second_word, denorm,
                                                   matcher, tester, trainer,
                                                   testing, tail_raw,
                                                   blob_choices,
                                                   tail_outword);
  delete second_word;

  // Fold the second part's results into the first part's.
  *combined += *tail_result;
  delete tail_result;
  *raw_choice += *tail_raw;
  delete tail_raw;
  outword->join_on (tail_outword);
  delete tail_outword;

  return combined;
}


/**********************************************************************
 * call_matcher
 *
 * Called from Tess with a blob in tess form.
 * Convert the blob to editor form.
 * Call the matcher setup by the segmenter in tess_matcher.
 * Convert the output choices back to tess form.
 **********************************************************************/

LIST call_matcher(                  //call a matcher
                  TBLOB *ptblob,    //previous
                  TBLOB *tessblob,  //blob to match
                  TBLOB *ntblob,    //next
                  void *,           //unused parameter
                  TEXTROW *         //always null anyway
                 ) {
  BLOB_CHOICE_LIST ratings;      //filled in by the matcher
  BLOB_CHOICE_IT rating_it;      //walks the matcher's results
  char choice_lengths[2] = {0, 0};  //length of the current unichar + terminator

  // The blob to match is mandatory; give up if it cannot be converted.
  PBLOB *ed_blob = make_ed_blob (tessblob);
  if (ed_blob == NULL) {
    tprintf("Failed to convert blob for recognition!\n");
    return NULL;
  }

  // The neighbours are optional context.
  PBLOB *prev_blob = NULL;
  if (ptblob != NULL)
    prev_blob = make_ed_blob (ptblob);
  PBLOB *next_blob = NULL;
  if (ntblob != NULL)
    next_blob = make_ed_blob (ntblob);

  // Run the matcher installed in tess_matcher.
  (*tess_matcher) (prev_blob, ed_blob, next_blob, tess_word, tess_denorm,
                   ratings);

  // The converted blobs are no longer needed; deleting NULL is a no-op.
  delete ed_blob;
  delete prev_blob;
  delete next_blob;

  // Rebuild the matcher output as a tess LIST, one entry per choice.
  LIST tess_choices = NULL;
  rating_it.set_to_list (&ratings);
  rating_it.mark_cycle_pt ();
  while (!rating_it.cycled_list ()) {
    BLOB_CHOICE *choice = rating_it.data ();
    choice_lengths[0] = strlen(choice->unichar ());
    tess_choices = append_choice (tess_choices, choice->unichar (),
                                  choice_lengths, choice->rating (),
                                  choice->certainty (), choice->config ());
    rating_it.forward ();
  }
  return tess_choices;
}


/**********************************************************************
 * call_tester
 *
 * Called from Tess with a blob in tess form.
 * Convert the blob to editor form.
 * Call the tester setup by the segmenter in tess_tester.
 **********************************************************************/

void call_tester(                     //call a tester
                 TBLOB *tessblob,     //blob to test
                 BOOL8 correct_blob,  //true if good
                 char *text,          //source text
                 INT32 count,         //chars in text
                 LIST result          //output of matcher
                ) {
  BLOB_CHOICE_LIST choices;      //matcher output in editor form

  // Convert the tess blob; if that fails there is nothing to test.
  PBLOB *ed_blob = make_ed_blob (tessblob);
  if (ed_blob != NULL) {
    // Convert the tess result list to the type the tester takes.
    convert_choice_list (result, choices);
    // A tester is only installed when tess_tester is non-NULL.
    if (tess_tester != NULL) {
      (*tess_tester) (ed_blob, tess_denorm, correct_blob, text, count,
                      &choices);
    }
    delete ed_blob;              //finished with the converted blob
  }
}


/**********************************************************************
 * call_train_tester
 *
 * Called from Tess with a blob in tess form.
 * Convert the blob to editor form.
 * Call the trainer setup by the segmenter in tess_trainer.
 **********************************************************************/

void call_train_tester(                     //call a tester
                       TBLOB *tessblob,     //blob to test
                       BOOL8 correct_blob,  //true if good
                       char *text,          //source text
                       INT32 count,         //chars in text
                       LIST result          //output of matcher
                      ) {
  BLOB_CHOICE_LIST choices;      //matcher output in editor form

  // Convert the tess blob; if that fails there is nothing to train on.
  PBLOB *ed_blob = make_ed_blob (tessblob);
  if (ed_blob != NULL) {
    // Convert the tess result list to the type the trainer takes.
    convert_choice_list (result, choices);
    // A trainer is only installed when tess_trainer is non-NULL.
    if (tess_trainer != NULL) {
      (*tess_trainer) (ed_blob, tess_denorm, correct_blob, text, count,
                       &choices);
    }
    delete ed_blob;              //finished with the converted blob
  }
}