werd.cpp 15.0 KB
Newer Older
T
tmbdev 已提交
1 2 3
/**********************************************************************
 * File:        werd.cpp  (Formerly word.c)
 * Description: Code for the WERD class.
4 5
 * Author:      Ray Smith
 * Created:     Tue Oct 08 14:32:12 BST 1991
T
tmbdev 已提交
6 7 8 9 10 11 12 13 14 15 16 17 18 19
 *
 * (C) Copyright 1991, Hewlett-Packard Ltd.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 *
 **********************************************************************/

J
joregan 已提交
20
#include "mfcpch.h"
21 22 23 24
#include "blckerr.h"
#include "helpers.h"
#include "linlsq.h"
#include "werd.h"
25

26 27 28 29 30
// Include automatically generated configuration file if running autoconf.
#ifdef HAVE_CONFIG_H
#include "config_auto.h"
#endif

31 32 33
// Colours used by the WERD plotting code in this file.
#define FIRST_COLOUR    ScrollView::RED         ///< first rainbow colour
#define LAST_COLOUR     ScrollView::AQUAMARINE  ///< end of rainbow; NextColor wraps to FIRST_COLOUR on reaching it
#define CHILD_COLOUR    ScrollView::BROWN       ///< colour of children
T
tmbdev 已提交
34 35

// Error raised on an attempt to scale a word held in edgestep format.
const ERRCODE CANT_SCALE_EDGESTEPS = "Attempted to scale an edgestep format word";
T
tmbdev 已提交
37

38
// Expands the ELIST2 list macro for WERD.
// NOTE(review): presumably generates the WERD_LIST support code declared via
// the matching macro in werd.h -- confirm against elst2.h.
ELIST2IZE(WERD)
T
tmbdev 已提交
39

40
/**
T
tmbdev 已提交
41 42 43
 * WERD::WERD
 *
 * Constructor to build a WERD from a list of C_BLOBs.
44 45 46 47
 *   blob_list     The C_BLOBs (in word order) are not copied;
 *                 we take its elements and put them in our lists.
 *   blank_count   blanks in front of the word
 *   text          correct text, outlives this WERD
48
 */
49 50 51 52 53 54 55
WERD::WERD(C_BLOB_LIST *blob_list, uinT8 blank_count, const char *text)
  : blanks(blank_count),
    flags(0),
    script_id_(0),
    correct(text) {
  C_BLOB_IT start_it = blob_list;
  C_BLOB_IT end_it = blob_list;
T
tmbdev 已提交
56
  C_BLOB_IT rej_cblob_it = &rej_cblobs;
57
  C_OUTLINE_IT c_outline_it;
58 59
  inT16 inverted_vote = 0;
  inT16 non_inverted_vote = 0;
T
tmbdev 已提交
60

61 62 63 64 65
  // Move blob_list's elements into cblobs.
  while (!end_it.at_last())
    end_it.forward();
  cblobs.assign_to_sublist(&start_it, &end_it);

T
tmbdev 已提交
66 67 68 69 70 71 72 73 74 75 76 77 78
  /*
    Set white on black flag for the WERD, moving any duff blobs onto the
    rej_cblobs list.
    First, walk the cblobs checking the inverse flag for each outline of each
    cblob. If a cblob has inconsistent flag settings for its different
    outlines, move the blob to the reject list. Otherwise, increment the
    appropriate w-on-b or b-on-w vote for the word.

    Now set the inversion flag for the WERD by maximum vote.

    Walk the blobs again, moving any blob whose inversion flag does not agree
    with the concencus onto the reject list.
  */
79 80
  start_it.set_to_list(&cblobs);
  if (start_it.empty())
T
tmbdev 已提交
81
    return;
82 83 84 85 86 87 88 89 90 91
  for (start_it.mark_cycle_pt(); !start_it.cycled_list(); start_it.forward()) {
    BOOL8 reject_blob = FALSE;
    BOOL8 blob_inverted;

    c_outline_it.set_to_list(start_it.data()->out_list());
    blob_inverted = c_outline_it.data()->flag(COUT_INVERSE);
    for (c_outline_it.mark_cycle_pt();
         !c_outline_it.cycled_list() && !reject_blob;
         c_outline_it.forward()) {
      reject_blob = c_outline_it.data()->flag(COUT_INVERSE) != blob_inverted;
T
tmbdev 已提交
92
    }
93 94 95
    if (reject_blob) {
      rej_cblob_it.add_after_then_move(start_it.extract());
    } else {
T
tmbdev 已提交
96 97 98 99 100 101 102
      if (blob_inverted)
        inverted_vote++;
      else
        non_inverted_vote++;
    }
  }

103
  flags.set_bit(W_INVERSE, (inverted_vote > non_inverted_vote));
T
tmbdev 已提交
104

105 106
  start_it.set_to_list(&cblobs);
  if (start_it.empty())
T
tmbdev 已提交
107
    return;
108 109 110 111
  for (start_it.mark_cycle_pt(); !start_it.cycled_list(); start_it.forward()) {
    c_outline_it.set_to_list(start_it.data()->out_list());
    if (c_outline_it.data()->flag(COUT_INVERSE) != flags.bit(W_INVERSE))
      rej_cblob_it.add_after_then_move(start_it.extract());
T
tmbdev 已提交
112 113 114 115
  }
}


116
/**
T
tmbdev 已提交
117 118 119 120
 * WERD::WERD
 *
 * Constructor to build a WERD from a list of C_BLOBs.
 * The C_BLOBs are not copied so the source list is emptied.
121
 */
T
tmbdev 已提交
122

123 124 125 126 127 128 129
WERD::WERD(C_BLOB_LIST * blob_list,         //< In word order
           WERD * clone)                    //< Source of flags
  : flags(clone->flags),
    script_id_(clone->script_id_),
    correct(clone->correct) {
  C_BLOB_IT start_it = blob_list;  // iterator
  C_BLOB_IT end_it = blob_list;    // another
T
tmbdev 已提交
130 131 132 133 134 135 136 137 138

  while (!end_it.at_last ())
    end_it.forward ();           //move to last
  ((C_BLOB_LIST *) (&cblobs))->assign_to_sublist (&start_it, &end_it);
  //move to our list
  blanks = clone->blanks;
  //      fprintf(stderr,"Wrong constructor!!!!\n");
}

139 140 141 142 143 144 145 146 147 148 149
// Builds a new heap-allocated WERD holding only the given blob, with the
// remaining attributes cloned from this word via the (blob_list, clone)
// constructor. W_BOL and W_EOL are then overwritten with bol and eol.
// The blob is placed in the new word's list, not copied.
WERD* WERD::ConstructFromSingleBlob(bool bol, bool eol, C_BLOB* blob) {
  C_BLOB_LIST single_blob_list;
  C_BLOB_IT single_it(&single_blob_list);
  single_it.add_after_then_move(blob);

  WERD* result = new WERD(&single_blob_list, this);
  result->set_flag(W_BOL, bol);
  result->set_flag(W_EOL, eol);
  return result;
}
T
tmbdev 已提交
150

151
/**
T
tmbdev 已提交
152 153 154 155 156 157 158 159 160 161
 * WERD::bounding_box
 *
 * Return the bounding box of the WERD.
 * This is quite a mess to compute!
 * ORIGINALLY, REJECT CBLOBS WERE EXCLUDED, however, this led to bugs when the
 * words on the row were re-sorted. The original words were built with reject
 * blobs included. The FUZZY SPACE flags were set accordingly. If ALL the
 * blobs in a word are rejected the BB for the word is NULL, causing the sort
 * to screw up, leading to the erroneous possibility of the first word in a
 * row being marked as FUZZY space.
162
 */
T
tmbdev 已提交
163

164 165 166
TBOX WERD::bounding_box() {
  TBOX box;                       // box being built
  C_BLOB_IT rej_cblob_it = &rej_cblobs;  // rejected blobs
T
tmbdev 已提交
167

168 169 170
  for (rej_cblob_it.mark_cycle_pt(); !rej_cblob_it.cycled_list();
       rej_cblob_it.forward()) {
    box += rej_cblob_it.data()->bounding_box();
T
tmbdev 已提交
171 172
  }

173 174 175
  C_BLOB_IT it = &cblobs;    // blobs of WERD
  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
    box += it.data()->bounding_box();
T
tmbdev 已提交
176 177 178 179 180
  }
  return box;
}


181
/**
T
tmbdev 已提交
182 183 184 185
 * WERD::move
 *
 * Reposition WERD by vector
 * NOTE!! REJECT CBLOBS ARE NOT MOVED
186
 */
T
tmbdev 已提交
187

188
void WERD::move(const ICOORD vec) {
T
tmbdev 已提交
189 190
  C_BLOB_IT cblob_it(&cblobs);  // cblob iterator

191 192
  for (cblob_it.mark_cycle_pt(); !cblob_it.cycled_list(); cblob_it.forward())
    cblob_it.data()->move(vec);
T
tmbdev 已提交
193 194
}

195
/**
T
tmbdev 已提交
196 197 198
 * WERD::join_on
 *
 * Join other word onto this one. Delete the old word.
199
 */
T
tmbdev 已提交
200

201
void WERD::join_on(WERD* other) {
202 203
  C_BLOB_IT blob_it(&cblobs);
  C_BLOB_IT src_it(&other->cblobs);
204
  C_BLOB_IT rej_cblob_it(&rej_cblobs);
205
  C_BLOB_IT src_rej_it(&other->rej_cblobs);
T
tmbdev 已提交
206

207 208 209
  while (!src_it.empty()) {
    blob_it.add_to_end(src_it.extract());
    src_it.forward();
T
tmbdev 已提交
210
  }
211 212 213
  while (!src_rej_it.empty()) {
    rej_cblob_it.add_to_end(src_rej_it.extract());
    src_rej_it.forward();
T
tmbdev 已提交
214 215 216 217
  }
}


218
/**
T
tmbdev 已提交
219 220 221
 * WERD::copy_on
 *
 * Copy blobs from other word onto this one.
222
 */
T
tmbdev 已提交
223

224 225
void WERD::copy_on(WERD* other) {
  bool reversed = other->bounding_box().left() < bounding_box().left();
226 227
  C_BLOB_IT c_blob_it(&cblobs);
  C_BLOB_LIST c_blobs;
T
tmbdev 已提交
228

229 230 231
  c_blobs.deep_copy(&other->cblobs, &C_BLOB::deep_copy);
  if (reversed) {
    c_blob_it.add_list_before(&c_blobs);
232
  } else {
233 234
    c_blob_it.move_to_last();
    c_blob_it.add_list_after(&c_blobs);
T
tmbdev 已提交
235
  }
236
  if (!other->rej_cblobs.empty()) {
237
    C_BLOB_IT rej_c_blob_it(&rej_cblobs);
T
tmbdev 已提交
238 239
    C_BLOB_LIST new_rej_c_blobs;

240
    new_rej_c_blobs.deep_copy(&other->rej_cblobs, &C_BLOB::deep_copy);
241 242 243 244 245 246
    if (reversed) {
      rej_c_blob_it.add_list_before(&new_rej_c_blobs);
    } else {
      rej_c_blob_it.move_to_last();
      rej_c_blob_it.add_list_after(&new_rej_c_blobs);
    }
T
tmbdev 已提交
247 248 249
  }
}

250
/**
T
tmbdev 已提交
251 252 253
 * WERD::print
 *
 * Display members
254
 */
T
tmbdev 已提交
255

256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276
void WERD::print() {
  tprintf("Blanks= %d\n", blanks);
  bounding_box().print();
  tprintf("Flags = %d = 0%o\n", flags.val, flags.val);
  tprintf("   W_SEGMENTED = %s\n", flags.bit(W_SEGMENTED) ? "TRUE" : "FALSE ");
  tprintf("   W_ITALIC = %s\n", flags.bit(W_ITALIC) ? "TRUE" : "FALSE ");
  tprintf("   W_BOL = %s\n", flags.bit(W_BOL) ? "TRUE" : "FALSE ");
  tprintf("   W_EOL = %s\n", flags.bit(W_EOL) ? "TRUE" : "FALSE ");
  tprintf("   W_NORMALIZED = %s\n",
          flags.bit(W_NORMALIZED) ? "TRUE" : "FALSE ");
  tprintf("   W_SCRIPT_HAS_XHEIGHT = %s\n",
          flags.bit(W_SCRIPT_HAS_XHEIGHT) ? "TRUE" : "FALSE ");
  tprintf("   W_SCRIPT_IS_LATIN = %s\n",
          flags.bit(W_SCRIPT_IS_LATIN) ? "TRUE" : "FALSE ");
  tprintf("   W_DONT_CHOP = %s\n", flags.bit(W_DONT_CHOP) ? "TRUE" : "FALSE ");
  tprintf("   W_REP_CHAR = %s\n", flags.bit(W_REP_CHAR) ? "TRUE" : "FALSE ");
  tprintf("   W_FUZZY_SP = %s\n", flags.bit(W_FUZZY_SP) ? "TRUE" : "FALSE ");
  tprintf("   W_FUZZY_NON = %s\n", flags.bit(W_FUZZY_NON) ? "TRUE" : "FALSE ");
  tprintf("Correct= %s\n", correct.string());
  tprintf("Rejected cblob count = %d\n", rej_cblobs.length());
  tprintf("Script = %d\n", script_id_);
T
tmbdev 已提交
277 278 279
}


280
/**
T
tmbdev 已提交
281 282 283
 * WERD::plot
 *
 * Draw the WERD in the given colour.
284
 */
T
tmbdev 已提交
285 286

#ifndef GRAPHICS_DISABLED
287
void WERD::plot(ScrollView *window, ScrollView::Color colour) {
  // Every blob of the word is drawn with the same colour passed for both
  // colour arguments of C_BLOB::plot.
  C_BLOB_IT blob_it(&cblobs);
  blob_it.mark_cycle_pt();
  while (!blob_it.cycled_list()) {
    blob_it.data()->plot(window, colour, colour);
    blob_it.forward();
  }
  // Rejected blobs are drawn afterwards by plot_rej_blobs.
  plot_rej_blobs(window);
}
294 295 296 297 298 299 300 301 302

// Returns the colour following the given one in the (looping) rainbow.
// The successor is kept only while it lies in [FIRST_COLOUR, LAST_COLOUR);
// anything else wraps back to FIRST_COLOUR.
ScrollView::Color WERD::NextColor(ScrollView::Color colour) {
  const ScrollView::Color candidate =
      static_cast<ScrollView::Color>(colour + 1);
  const bool in_rainbow =
      candidate < LAST_COLOUR && candidate >= FIRST_COLOUR;
  return in_rainbow ? candidate : FIRST_COLOUR;
}

303
/**
T
tmbdev 已提交
304 305
 * WERD::plot
 *
306
 * Draw the WERD in rainbow colours in window.
307
 */
T
tmbdev 已提交
308

309 310
void WERD::plot(ScrollView* window) {
  ScrollView::Color colour = FIRST_COLOUR;
311 312 313 314
  C_BLOB_IT it = &cblobs;
  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
    it.data()->plot(window, colour, CHILD_COLOUR);
    colour = NextColor(colour);
T
tmbdev 已提交
315
  }
316
  plot_rej_blobs(window);
T
tmbdev 已提交
317 318 319
}


320
/**
T
tmbdev 已提交
321 322
 * WERD::plot_rej_blobs
 *
323
 * Draw the WERD rejected blobs in window - ALWAYS GREY
324
 */
T
tmbdev 已提交
325

326

327
void WERD::plot_rej_blobs(ScrollView *window) {
328 329 330
  C_BLOB_IT it = &rej_cblobs;
  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
    it.data()->plot(window, ScrollView::GREY, ScrollView::GREY);
T
tmbdev 已提交
331 332
  }
}
333
#endif  // GRAPHICS_DISABLED
T
tmbdev 已提交
334 335


336
/**
T
tmbdev 已提交
337 338 339
 * WERD::shallow_copy()
 *
 * Make a shallow copy of a word
340
 */
T
tmbdev 已提交
341

342
WERD *WERD::shallow_copy() {
T
tmbdev 已提交
343 344 345 346 347 348 349 350 351 352
  WERD *new_word = new WERD;

  new_word->blanks = blanks;
  new_word->flags = flags;
  new_word->dummy = dummy;
  new_word->correct = correct;
  return new_word;
}


353
/**
T
tmbdev 已提交
354 355 356
 * WERD::operator=
 *
 * Assign a word, DEEP copying the blob list
357
 */
T
tmbdev 已提交
358

359 360
WERD & WERD::operator= (const WERD & source) {
  this->ELIST2_LINK::operator= (source);
T
tmbdev 已提交
361 362
  blanks = source.blanks;
  flags = source.flags;
363
  script_id_ = source.script_id_;
T
tmbdev 已提交
364 365
  dummy = source.dummy;
  correct = source.correct;
366 367 368
  if (!cblobs.empty())
    cblobs.clear();
  cblobs.deep_copy(&source.cblobs, &C_BLOB::deep_copy);
T
tmbdev 已提交
369

370 371 372
  if (!rej_cblobs.empty())
    rej_cblobs.clear();
  rej_cblobs.deep_copy(&source.rej_cblobs, &C_BLOB::deep_copy);
T
tmbdev 已提交
373 374 375 376
  return *this;
}


377
/**
T
tmbdev 已提交
378 379 380 381
 *  word_comparator()
 *
 *  word comparator used to sort a word list so that words are in increasing
 *  order of left edge.
382
 */
T
tmbdev 已提交
383

384 385 386 387 388
int word_comparator(const void *word1p, const void *word2p) {
  WERD *word1 = *(WERD **)word1p;
  WERD *word2 = *(WERD **)word2p;
  return word1->bounding_box().left() - word2->bounding_box().left();
}
T
tmbdev 已提交
389

390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449
/**
 *  WERD::ConstructWerdWithNewBlobs()
 *
 * Returns a new werd built from those blobs of the input all_blobs list that
 * correspond to the blobs of this werd. A blob of all_blobs corresponds to
 * one of ours when our blob's box contains it or has a major overlap with
 * it. The blobs used for the new word are consumed, i.e. extracted from
 * all_blobs.
 *
 * Returns NULL if no matching blob was found, so no word could be built.
 * Original blobs for which no match was found are appended to orphan_blobs
 * when that list is supplied.
 *
 * Note that this werd's own blob list is emptied in the process: matched
 * originals are deleted, unmatched ones go to orphan_blobs or, when no new
 * word is built, are offered back to this word's list.
 */

WERD* WERD::ConstructWerdWithNewBlobs(C_BLOB_LIST* all_blobs,
                                      C_BLOB_LIST* orphan_blobs) {
  // Take this word's c_blobs into a scratch list we can consume.
  C_BLOB_LIST old_blobs;
  C_BLOB_IT old_it(&old_blobs);
  old_it.add_list_after(cblob_list());

  // Blobs picked out of all_blobs; these will form the new word.
  C_BLOB_LIST replacement_blobs;
  C_BLOB_IT replacement_it(&replacement_blobs);

  // Blobs of the current word for which all_blobs held no counterpart.
  C_BLOB_LIST unmatched_blobs;
  C_BLOB_IT unmatched_it(&unmatched_blobs);
  unmatched_it.move_to_last();

  old_it.move_to_first();
  for (old_it.mark_cycle_pt(); !old_it.cycled_list(); old_it.forward()) {
    C_BLOB* old_blob = old_it.extract();
    TBOX old_box = old_blob->bounding_box();
    bool matched = false;

    // Look for counterparts of old_blob in all_blobs. For now this is the
    // inefficient method of pairwise comparison; ideally the blobs would be
    // pre-bucketed by row.
    C_BLOB_IT candidate_it(all_blobs);
    for (candidate_it.mark_cycle_pt(); !candidate_it.cycled_list();
         candidate_it.forward()) {
      C_BLOB* candidate = candidate_it.data();
      TBOX candidate_box = candidate->bounding_box();
      if (candidate_box.null_box()) {
        tprintf("Bounding box couldn't be ascertained\n");
      }
      // Old blobs come from minimal splits and are therefore expected to be
      // the bigger ones; a new, smaller blob must be contained in, or cover
      // a significant portion of, the old one to count as a match.
      if (!old_box.contains(candidate_box) &&
          !old_box.major_overlap(candidate_box)) {
        continue;
      }
      candidate_it.extract();
      replacement_it.add_after_then_move(candidate);
      matched = true;
    }

    if (matched) {
      delete old_blob;
    } else {
      unmatched_it.add_after_then_move(old_blob);
    }
  }

  // Go over the unmatched blobs. Some may be unmatched only because of
  // under-segmentation, which is OK: the corresponding blob is already in
  // the replacement list in that case, so the old blob can be dropped.
  unmatched_it.move_to_first();
  for (unmatched_it.mark_cycle_pt(); !unmatched_it.cycled_list();
       unmatched_it.forward()) {
    C_BLOB* unmatched = unmatched_it.data();
    TBOX unmatched_box = unmatched->bounding_box();
    C_BLOB_IT accepted_it(replacement_it);
    for (accepted_it.mark_cycle_pt(); !accepted_it.cycled_list();
         accepted_it.forward()) {
      C_BLOB* accepted = accepted_it.data();
      TBOX accepted_box = accepted->bounding_box();
      const bool overlaps_either_way =
          unmatched_box.major_overlap(accepted_box) ||
          accepted_box.major_overlap(unmatched_box);
      if (overlaps_either_way && unmatched_box.y_overlap(accepted_box)) {
        // Already taken care of.
        delete unmatched_it.extract();
        break;
      }
    }
  }

  // Hand the genuinely orphaned blobs to the caller, if it asked for them.
  if (orphan_blobs) {
    C_BLOB_IT orphan_it(orphan_blobs);
    orphan_it.move_to_last();
    orphan_it.add_list_after(&unmatched_blobs);
  }

  if (replacement_blobs.empty()) {
    // Nothing to build from: add the blobs back to this word so that it can
    // be reused.
    // NOTE(review): if orphan_blobs was supplied, the unmatched blobs were
    // offered to it just above, so presumably nothing is left to add here.
    C_BLOB_IT own_it(cblob_list());
    own_it.add_list_after(&unmatched_blobs);
    return NULL;
  }

  // New blobs are ready. Create a new werd object with these.
  return new WERD(&replacement_blobs, this);
}