unicharset.cpp 41.3 KB
Newer Older
T
theraysmith 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
///////////////////////////////////////////////////////////////////////
// File:        unicharset.cpp
// Description: Unicode character/ligature set class.
// Author:      Thomas Kielbus
// Created:     Wed Jun 28 17:05:01 PDT 2006
//
// (C) Copyright 2006, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////

20 21
#include "unicharset.h"

T
theraysmith 已提交
22 23 24 25
#include <assert.h>
#include <stdio.h>
#include <string.h>

26 27
#include "params.h"
#include "serialis.h"
28
#include "tesscallback.h"
T
theraysmith 已提交
29
#include "tprintf.h"
T
theraysmith 已提交
30 31
#include "unichar.h"

32 33 34 35
// TODO(rays) Move UNICHARSET to tesseract namespace.
using tesseract::char32;
using tesseract::UNICHAR;

36 37 38 39 40
// Character reserved for use in the textual form of character fragments.
static constexpr char kSeparator = '|';
// Character reserved for flagging 'natural' character fragments.
static constexpr char kNaturalFlag = 'n';

41 42 43 44
// One bit per boolean character property; get_properties() ORs these
// together into a single integer.
static constexpr int ISALPHA_MASK = 1 << 0;
static constexpr int ISLOWER_MASK = 1 << 1;
static constexpr int ISUPPER_MASK = 1 << 2;
static constexpr int ISDIGIT_MASK = 1 << 3;
static constexpr int ISPUNCTUATION_MASK = 1 << 4;
46

47 48 49 50 51
// Y coordinate threshold that separates cap-height from x-height.
// TODO(rays) Bring the global definition down to the ccutil library level,
// so this constant is relative to some other constants.
static constexpr int kMeanlineThreshold = 220;
// Let C be the number of alpha chars whose tops all exceed
// kMeanlineThreshold, and X the number of alpha chars whose tops are all
// below kMeanlineThreshold. The unicharset "has x-height" if
// X > C * kMinXHeightFraction and C > X * kMinCapHeightFraction, or if more
// than half the alpha characters have upper or lower case.
constexpr double kMinXHeightFraction = 0.25;
constexpr double kMinCapHeightFraction = 0.05;

/*static */
// Table of custom ligatures. Each row is {plain UTF-8 sequence, private-use
// code point that stands for it}. The table ends with a row of null
// pointers, which id_to_unichar_ext() uses as its loop terminator.
const char* UNICHARSET::kCustomLigatures[][2] = {
  {"ct", "\uE003"},  // c + t -> U+E003
  {"ſh", "\uE006"},  // long-s + h -> U+E006
  {"ſi", "\uE007"},  // long-s + i -> U+E007
  {"ſl", "\uE008"},  // long-s + l -> U+E008
  {"ſſ", "\uE009"},  // long-s + long-s -> U+E009
  {nullptr, nullptr}  // Sentinel row, same form as in kCleanupMaps.
};
69

70 71 72 73 74 75 76 77 78
// List of mappings to make when ingesting strings from the outside.
// The substitutions clean up text that should exist for rendering of
// synthetic data, but not in the recognition set.
// Each row is {UTF-8 sequence to look for, replacement text}; an empty
// replacement means the sequence is removed. The final row of null pointers
// marks the end of the table.
const char* UNICHARSET::kCleanupMaps[][2] = {
    {"\u0640", ""},    // TATWEEL is deleted.
    {"\ufb01", "fi"},  // fi ligature->fi pair.
    {"\ufb02", "fl"},  // fl ligature->fl pair.
    {nullptr, nullptr}};

79 80 81 82 83 84 85
// List of strings for the SpecialUnicharCodes. Keep in sync with the enum.
// The UNICHARSET constructor inserts these in array order, so the array
// index of each string is the id it receives in a freshly built set.
const char* UNICHARSET::kSpecialUnicharCodes[SPECIAL_UNICHAR_CODES_COUNT] = {
    " ",
    "Joined",
    "|Broken|0|1"
};

86 87 88
// Constructs a property record with every field at its Init() default.
UNICHARSET::UNICHAR_PROPERTIES::UNICHAR_PROPERTIES() {
  this->Init();
}
89 90

// Initialize all properties to sensible default values.
91 92 93 94 95 96 97 98
void UNICHARSET::UNICHAR_PROPERTIES::Init() {
  isalpha = false;
  islower = false;
  isupper = false;
  isdigit = false;
  ispunctuation = false;
  isngram = false;
  enabled = false;
99 100 101 102 103 104 105 106 107 108 109 110
  SetRangesOpen();
  script_id = 0;
  other_case = 0;
  mirror = 0;
  normed = "";
  direction = UNICHARSET::U_LEFT_TO_RIGHT;
  fragment = NULL;
}

// Sets all ranges wide open. Initialization default in case there are
// no useful values available.
void UNICHARSET::UNICHAR_PROPERTIES::SetRangesOpen() {
111 112 113 114
  min_bottom = 0;
  max_bottom = MAX_UINT8;
  min_top = 0;
  max_top = MAX_UINT8;
115 116 117 118 119 120
  width = 0.0f;
  width_sd = 0.0f;
  bearing = 0.0f;
  bearing_sd = 0.0f;
  advance = 0.0f;
  advance_sd = 0.0f;
121 122 123 124 125 126 127 128
}

// Sets all ranges to empty. Used before expanding with font-based data.
void UNICHARSET::UNICHAR_PROPERTIES::SetRangesEmpty() {
  min_bottom = MAX_UINT8;
  max_bottom = 0;
  min_top = MAX_UINT8;
  max_top = 0;
129 130 131 132 133 134
  width = 0.0f;
  width_sd = 0.0f;
  bearing = 0.0f;
  bearing_sd = 0.0f;
  advance = 0.0f;
  advance_sd = 0.0f;
135 136
}

137 138
// Returns true if any of the top/bottom/width/bearing/advance ranges/stats
// is emtpy.
139
bool UNICHARSET::UNICHAR_PROPERTIES::AnyRangeEmpty() const {
140
  return width == 0.0f || advance == 0.0f;
141 142 143 144 145 146 147 148 149
}

// Expands the ranges with the ranges from the src properties.
void UNICHARSET::UNICHAR_PROPERTIES::ExpandRangesFrom(
    const UNICHAR_PROPERTIES& src) {
  UpdateRange(src.min_bottom, &min_bottom, &max_bottom);
  UpdateRange(src.max_bottom, &min_bottom, &max_bottom);
  UpdateRange(src.min_top, &min_top, &max_top);
  UpdateRange(src.max_top, &min_top, &max_top);
150 151 152 153 154 155 156 157 158 159 160 161
  if (src.width_sd > width_sd) {
    width = src.width;
    width_sd = src.width_sd;
  }
  if (src.bearing_sd > bearing_sd) {
    bearing = src.bearing;
    bearing_sd = src.bearing_sd;
  }
  if (src.advance_sd > advance_sd) {
    advance = src.advance;
    advance_sd = src.advance_sd;
  }
162 163 164 165 166 167 168 169
}

// Copies the properties from src into this.
void UNICHARSET::UNICHAR_PROPERTIES::CopyFrom(const UNICHAR_PROPERTIES& src) {
  // Apart from the fragment, everything else can be done with a default copy.
  CHAR_FRAGMENT* saved_fragment = fragment;
  *this = src;  // Bitwise copy.
  fragment = saved_fragment;
170
}
171

T
theraysmith 已提交
172 173 174 175
// Builds an empty set and then inserts the special unichar codes in array
// order, flagging the UNICHAR_JOINED entry as an ngram.
UNICHARSET::UNICHARSET()
    : unichars(NULL),
      ids(),
      size_used(0),
      size_reserved(0),
      script_table(NULL),
      script_table_size_used(0),
      null_script("NULL") {
  clear();
  for (int code = 0; code < SPECIAL_UNICHAR_CODES_COUNT; ++code) {
    unichar_insert(kSpecialUnicharCodes[code]);
    if (code != UNICHAR_JOINED) continue;
    set_isngram(code, true);
  }
}
T
theraysmith 已提交
187 188

// All teardown is delegated to clear().
UNICHARSET::~UNICHARSET() {
  this->clear();
}

void UNICHARSET::reserve(int unichars_number) {
193
  if (unichars_number > size_reserved) {
T
theraysmith 已提交
194 195
    UNICHAR_SLOT* unichars_new = new UNICHAR_SLOT[unichars_number];
    for (int i = 0; i < size_used; ++i)
196
      unichars_new[i] = unichars[i];
T
theraysmith 已提交
197 198 199
    for (int j = size_used; j < unichars_number; ++j) {
      unichars_new[j].properties.script_id = add_script(null_script);
    }
T
theraysmith 已提交
200 201 202 203 204 205
    delete[] unichars;
    unichars = unichars_new;
    size_reserved = unichars_number;
  }
}

S
Stefan Weil 已提交
206
// Returns the id of the null-terminated unichar_repr, or INVALID_UNICHAR_ID
// if it is not in the set. The string is passed through CleanupString first
// unless old-style (uncleaned) unichars have been included in the set.
UNICHAR_ID
UNICHARSET::unichar_to_id(const char* const unichar_repr) const {
  string key;
  if (old_style_included_) {
    key = unichar_repr;
  } else {
    key = CleanupString(unichar_repr);
  }
  if (!ids.contains(key.data(), key.size())) {
    return INVALID_UNICHAR_ID;
  }
  return ids.unichar_to_id(key.data(), key.size());
}

S
Stefan Weil 已提交
215 216
// Returns the id of the first length bytes of unichar_repr, or
// INVALID_UNICHAR_ID if they are not in the set. length must be in
// (0, UNICHAR_LEN]. The bytes are passed through CleanupString first unless
// old-style (uncleaned) unichars have been included in the set.
UNICHAR_ID UNICHARSET::unichar_to_id(const char* const unichar_repr,
                                     int length) const {
  assert(length > 0 && length <= UNICHAR_LEN);
  string key(unichar_repr, length);
  if (!old_style_included_) {
    key = CleanupString(unichar_repr, length);
  }
  if (!ids.contains(key.data(), key.size())) {
    return INVALID_UNICHAR_ID;
  }
  return ids.unichar_to_id(key.data(), key.size());
}

225
// Return the minimum number of bytes that matches a legal UNICHAR_ID,
226 227 228 229
// while leaving the rest of the string encodable. Returns 0 if the
// beginning of the string is not encodable.
// WARNING: this function now encodes the whole string for precision.
// Use encode_string in preference to repeatedly calling step.
230
int UNICHARSET::step(const char* str) const {
231 232 233 234 235 236
  GenericVector<UNICHAR_ID> encoding;
  GenericVector<char> lengths;
  encode_string(str, true, &encoding, &lengths, NULL);
  if (encoding.empty() || encoding[0] == INVALID_UNICHAR_ID) return 0;
  return lengths[0];
}
237

238 239 240 241 242
// Return whether the given UTF-8 string is encodable with this UNICHARSET.
// If not encodable, write the first byte offset which cannot be converted
// into the second (return) argument.
bool UNICHARSET::encodable_string(const char *str,
                                  int *first_bad_position) const {
243 244 245 246 247 248 249 250 251 252
  GenericVector<UNICHAR_ID> encoding;
  return encode_string(str, true, &encoding, NULL, first_bad_position);
}

// Encodes the given UTF-8 string with this UNICHARSET.
// Returns true if the encoding succeeds completely, false if there is at
// least one INVALID_UNICHAR_ID in the returned encoding, but in this case
// the rest of the string is still encoded.
// If lengths is not NULL, then it is filled with the corresponding
// byte length of each encoded UNICHAR_ID.
253 254 255
// WARNING: Caller must guarantee that str has already been cleaned of codes
// that do not belong in the unicharset, or encoding may fail.
// Use CleanupString to perform the cleaning.
256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280
bool UNICHARSET::encode_string(const char* str, bool give_up_on_failure,
                               GenericVector<UNICHAR_ID>* encoding,
                               GenericVector<char>* lengths,
                               int* encoded_length) const {
  GenericVector<UNICHAR_ID> working_encoding;
  GenericVector<char> working_lengths;
  GenericVector<char> best_lengths;
  encoding->truncate(0);  // Just in case str is empty.
  int str_length = strlen(str);
  int str_pos = 0;
  bool perfect = true;
  while (str_pos < str_length) {
    encode_string(str, str_pos, str_length, &working_encoding, &working_lengths,
                  &str_pos, encoding, &best_lengths);
    if (str_pos < str_length) {
      // This is a non-match. Skip one utf-8 character.
      perfect = false;
      if (give_up_on_failure) break;
      int step = UNICHAR::utf8_step(str + str_pos);
      if (step == 0) step = 1;
      encoding->push_back(INVALID_UNICHAR_ID);
      best_lengths.push_back(step);
      str_pos += step;
      working_encoding = *encoding;
      working_lengths = best_lengths;
281 282
    }
  }
283 284 285
  if (lengths != NULL) *lengths = best_lengths;
  if (encoded_length != NULL) *encoded_length = str_pos;
  return perfect;
286 287
}

S
Stefan Weil 已提交
288
// Returns the stored representation of id, or INVALID_UNICHAR when id is
// INVALID_UNICHAR_ID. Any other id must be less than size().
const char* UNICHARSET::id_to_unichar(UNICHAR_ID id) const {
  if (id != INVALID_UNICHAR_ID) {
    ASSERT_HOST(id < this->size());
    return unichars[id].representation;
  }
  return INVALID_UNICHAR;
}

S
Stefan Weil 已提交
296
// Like id_to_unichar, except that a private-use encoding found in
// kCustomLigatures is returned as the plain sequence it stands for.
const char* UNICHARSET::id_to_unichar_ext(UNICHAR_ID id) const {
  if (id == INVALID_UNICHAR_ID) return INVALID_UNICHAR;
  ASSERT_HOST(id < this->size());
  if (get_isprivate(id)) {
    // Look for the stored form in the second column of the ligature table.
    const char* stored = id_to_unichar(id);
    int row = 0;
    while (kCustomLigatures[row][0] != NULL) {
      if (strcmp(stored, kCustomLigatures[row][1]) == 0) {
        return kCustomLigatures[row][0];
      }
      ++row;
    }
  }
  // Not private, or not in the table: return the stored representation.
  return unichars[id].representation;
}

T
theraysmith 已提交
314 315 316
// Returns a STRING made of str followed by " [", the hex value of each
// unicode in str (each followed by a space), and "]".
// When UNICHAR::utf8_step reports 0 for a position, the single byte at that
// position is printed on its own and scanning resumes at the next byte.
STRING UNICHARSET::debug_utf8_str(const char* str) {
  STRING result = str;
  result += " [";
  int step = 1;
  // Chop into unicodes and code each as hex.
  for (int i = 0; str[i] != '\0'; i += step) {
    // Enough room for all the hex digits of an int plus the terminator.
    char hex[sizeof(int) * 2 + 1];
    step = UNICHAR::utf8_step(str + i);
    if (step == 0) {
      step = 1;
      // Go through unsigned char so that a byte >= 0x80 prints as its two
      // hex digits. Passing a plain (possibly signed) char would be
      // sign-extended and print as ffffffXX.
      snprintf(hex, sizeof(hex), "%x",
               static_cast<unsigned int>(static_cast<unsigned char>(str[i])));
    } else {
      UNICHAR ch(str + i, step);
      // %x expects an unsigned argument.
      snprintf(hex, sizeof(hex), "%x",
               static_cast<unsigned int>(ch.first_uni()));
    }
    result += hex;
    result += " ";
  }
  result += "]";
  return result;
}

// Returns a STRING with debug information on the unichar: its
// representation, its hex unicodes and one-letter property tags.
// An invalid id gives just id_to_unichar(id); a fragment gives just the
// fragment's own to_string().
STRING UNICHARSET::debug_str(UNICHAR_ID id) const {
  if (id == INVALID_UNICHAR_ID) return STRING(id_to_unichar(id));
  const CHAR_FRAGMENT *frag = this->get_fragment(id);
  if (frag) return frag->to_string();

  STRING result = debug_utf8_str(id_to_unichar(id));
  // Alpha tag: a for lower, A for upper, x if alpha but neither.
  if (get_isalpha(id)) {
    const char* alpha_tag =
        get_islower(id) ? "a" : (get_isupper(id) ? "A" : "x");
    result += alpha_tag;
  }
  // Digit tag.
  if (get_isdigit(id)) result += "0";
  // Punctuation tag.
  if (get_ispunctuation(id)) result += "p";
  return result;
}

368 369 370 371
// Rebuilds the normed_ids vector of unichar_id from its normed string.
// normed_ids is not stored in the file, and needs to be set when the
// UNICHARSET is loaded.
void UNICHARSET::set_normed_ids(UNICHAR_ID unichar_id) {
  UNICHAR_PROPERTIES& props = unichars[unichar_id].properties;
  props.normed_ids.truncate(0);
  // The space id, when it really holds a space, maps to itself.
  if (unichar_id == UNICHAR_SPACE && id_to_unichar(unichar_id)[0] == ' ') {
    props.normed_ids.push_back(UNICHAR_SPACE);
    return;
  }
  if (encode_string(props.normed.string(), true, &props.normed_ids, NULL,
                    NULL)) {
    return;
  }
  // The normed string could not be encoded: fall back to the id itself.
  props.normed_ids.truncate(0);
  props.normed_ids.push_back(unichar_id);
}

382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401
// Returns true if the first unicode of the unichar lies in the private use
// area U+E000..U+F8FF. That range is used only internally, to represent
// uncommon ligatures (eg. 'ct') that do not have regular unicode values.
bool UNICHARSET::get_isprivate(UNICHAR_ID unichar_id) const {
  UNICHAR first_char(id_to_unichar(unichar_id), -1);
  const int code_point = first_char.first_uni();
  return 0xE000 <= code_point && code_point <= 0xF8FF;
}


// Empties the ranges of every unichar in use, so that they can afterwards be
// expanded to set the values.
void UNICHARSET::set_ranges_empty() {
  int id = 0;
  while (id < size_used) {
    unichars[id].properties.SetRangesEmpty();
    ++id;
  }
}

// Sets all the properties for this unicharset given a src unicharset with
// everything set. The unicharsets don't have to be the same, and graphemes
// are correctly accounted for.
402 403 404
void UNICHARSET::PartialSetPropertiesFromOther(int start_index,
                                               const UNICHARSET& src) {
  for (int ch = start_index; ch < size_used; ++ch) {
405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423
    const char* utf8 = id_to_unichar(ch);
    UNICHAR_PROPERTIES properties;
    if (src.GetStrProperties(utf8, &properties)) {
      // Setup the script_id, other_case, and mirror properly.
      const char* script = src.get_script_from_script_id(properties.script_id);
      properties.script_id = add_script(script);
      const char* other_case = src.id_to_unichar(properties.other_case);
      if (contains_unichar(other_case)) {
        properties.other_case = unichar_to_id(other_case);
      } else {
        properties.other_case = ch;
      }
      const char* mirror_str = src.id_to_unichar(properties.mirror);
      if (contains_unichar(mirror_str)) {
        properties.mirror = unichar_to_id(mirror_str);
      } else {
        properties.mirror = ch;
      }
      unichars[ch].properties.CopyFrom(properties);
424
      set_normed_ids(ch);
425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442
    }
  }
}

// Expands the tops and bottoms and widths for this unicharset given a
// src unicharset with ranges in it. The unicharsets don't have to be the
// same, and graphemes are correctly accounted for.
void UNICHARSET::ExpandRangesFromOther(const UNICHARSET& src) {
  for (int ch = 0; ch < size_used; ++ch) {
    const char* utf8 = id_to_unichar(ch);
    UNICHAR_PROPERTIES properties;
    if (src.GetStrProperties(utf8, &properties)) {
      // Expand just the ranges from properties.
      unichars[ch].properties.ExpandRangesFrom(properties);
    }
  }
}

443 444
// Makes this a copy of src. Clears this completely first, so the automatic
// ids will not be present in this if not in src. Does NOT reorder the set!
445 446
void UNICHARSET::CopyFrom(const UNICHARSET& src) {
  clear();
447 448 449
  for (int ch = 0; ch < src.size_used; ++ch) {
    const UNICHAR_PROPERTIES& src_props = src.unichars[ch].properties;
    const char* utf8 = src.id_to_unichar(ch);
450
    unichar_insert_backwards_compatible(utf8);
451 452 453 454 455
    unichars[ch].properties.ExpandRangesFrom(src_props);
  }
  // Set properties, including mirror and other_case, WITHOUT reordering
  // the unicharset.
  PartialSetPropertiesFromOther(0, src);
456 457
}

458 459 460 461
// For each id in src, if it does not occur in this, add it, as in
// SetPropertiesFromOther, otherwise expand the ranges, as in
// ExpandRangesFromOther.
void UNICHARSET::AppendOtherUnicharset(const UNICHARSET& src) {
462
  int initial_used = size_used;
463 464 465 466 467 468 469 470 471
  for (int ch = 0; ch < src.size_used; ++ch) {
    const UNICHAR_PROPERTIES& src_props = src.unichars[ch].properties;
    const char* utf8 = src.id_to_unichar(ch);
    int id = size_used;
    if (contains_unichar(utf8)) {
      id = unichar_to_id(utf8);
      // Just expand current ranges.
      unichars[id].properties.ExpandRangesFrom(src_props);
    } else {
472
      unichar_insert_backwards_compatible(utf8);
473
      unichars[id].properties.SetRangesEmpty();
474 475
    }
  }
476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534
  // Set properties, including mirror and other_case, WITHOUT reordering
  // the unicharset.
  PartialSetPropertiesFromOther(initial_used, src);
}

// Returns true if the acceptable ranges of the tops of the two characters
// do not overlap, making their x-height calculations distinct.
bool UNICHARSET::SizesDistinct(UNICHAR_ID id1, UNICHAR_ID id2) const {
  const UNICHAR_PROPERTIES& first = unichars[id1].properties;
  const UNICHAR_PROPERTIES& second = unichars[id2].properties;
  // The shared part of the two [min_top, max_top] ranges runs from the larger
  // minimum to the smaller maximum.
  const int shared_high = MIN(first.max_top, second.max_top);
  const int shared_low = MAX(first.min_top, second.min_top);
  const int overlap = shared_high - shared_low;
  return overlap <= 0;
}

// Internal recursive version of encode_string above.
// Seeks to encode the given string as a sequence of UNICHAR_IDs such that
// each UNICHAR_ID uses the least possible part of the utf8 str.
// It does this by depth-first recursion on increasing length matches to the
// UNICHARSET, saving the first encountered result that encodes the maximum
// total length of str. It stops on a failure to encode, to make the overall
// process of encoding a partially failed string more efficient.
// See unicharset.h for definition of the args.
void UNICHARSET::encode_string(const char* str, int str_index, int str_length,
                               GenericVector<UNICHAR_ID>* encoding,
                               GenericVector<char>* lengths,
                               int* best_total_length,
                               GenericVector<UNICHAR_ID>* best_encoding,
                               GenericVector<char>* best_lengths) const {
  // Record this partial encoding if it reaches further than any before it.
  if (str_index > *best_total_length) {
    *best_total_length = str_index;
    *best_encoding = *encoding;
    if (best_lengths != NULL) *best_lengths = *lengths;
  }
  if (str_index == str_length) return;

  const int rollback_size = encoding->size();
  const char* const candidate = str + str_index;
  // Start from the length of the first matching unicharset member.
  int match_len = ids.minmatch(candidate);
  if (match_len == 0 || str_index + match_len > str_length) return;

  for (;;) {
    if (ids.contains(candidate, match_len)) {
      // Successful encoding so far: commit it and recurse on the remainder.
      encoding->push_back(ids.unichar_to_id(candidate, match_len));
      lengths->push_back(match_len);
      encode_string(str, str_index + match_len, str_length, encoding, lengths,
                    best_total_length, best_encoding, best_lengths);
      if (*best_total_length == str_length) return;  // Whole string encoded.
      // Failed with that length: truncate back and try a longer match.
      encoding->truncate(rollback_size);
      lengths->truncate(rollback_size);
    }
    // Lengthen the candidate by one utf-8 character (at least one byte).
    int growth = UNICHAR::utf8_step(candidate + match_len);
    if (growth == 0) growth = 1;
    match_len += growth;
    if (match_len > UNICHAR_LEN || str_index + match_len > str_length) break;
  }
}

// Gets the properties for a grapheme string, combining properties for
// multiple characters in a meaningful way where possible.
// Returns false if no valid match was found in the unicharset.
// NOTE that script_id, mirror, and other_case refer to this unicharset on
// return and will need translation if the target unicharset is different.
bool UNICHARSET::GetStrProperties(const char* utf8_str,
                                  UNICHAR_PROPERTIES* props) const {
  props->Init();
  props->SetRangesEmpty();
  int total_unicodes = 0;
547 548 549 550 551
  GenericVector<UNICHAR_ID> encoding;
  if (!encode_string(utf8_str, true, &encoding, NULL, NULL))
    return false;  // Some part was invalid.
  for (int i = 0; i < encoding.size(); ++i) {
    int id = encoding[i];
552 553 554 555 556 557 558 559 560 561 562 563 564 565
    const UNICHAR_PROPERTIES& src_props = unichars[id].properties;
    // Logical OR all the bools.
    if (src_props.isalpha) props->isalpha = true;
    if (src_props.islower) props->islower = true;
    if (src_props.isupper) props->isupper = true;
    if (src_props.isdigit) props->isdigit = true;
    if (src_props.ispunctuation) props->ispunctuation = true;
    if (src_props.isngram) props->isngram = true;
    if (src_props.enabled) props->enabled = true;
    // Min/max the tops/bottoms.
    UpdateRange(src_props.min_bottom, &props->min_bottom, &props->max_bottom);
    UpdateRange(src_props.max_bottom, &props->min_bottom, &props->max_bottom);
    UpdateRange(src_props.min_top, &props->min_top, &props->max_top);
    UpdateRange(src_props.max_top, &props->min_top, &props->max_top);
566 567 568 569 570 571 572
    float bearing = props->advance + src_props.bearing;
    if (total_unicodes == 0 || bearing < props->bearing) {
      props->bearing = bearing;
      props->bearing_sd = props->advance_sd + src_props.bearing_sd;
    }
    props->advance += src_props.advance;
    props->advance_sd += src_props.advance_sd;
573
    // With a single width, just use the widths stored in the unicharset.
574 575
    props->width = src_props.width;
    props->width_sd = src_props.width_sd;
576 577 578 579 580 581 582 583 584 585 586 587 588 589 590
    // Use the first script id, other_case, mirror, direction.
    // Note that these will need translation, except direction.
    if (total_unicodes == 0) {
      props->script_id = src_props.script_id;
      props->other_case = src_props.other_case;
      props->mirror = src_props.mirror;
      props->direction = src_props.direction;
    }
    // The normed string for the compound character is the concatenation of
    // the normed versions of the individual characters.
    props->normed += src_props.normed;
    ++total_unicodes;
  }
  if (total_unicodes > 1) {
    // Estimate the total widths from the advance - bearing.
591 592
    props->width = props->advance - props->bearing;
    props->width_sd = props->advance_sd + props->bearing_sd;
593 594 595 596
  }
  return total_unicodes > 0;
}

597 598
// TODO(rays) clean-up the order of functions to match unicharset.h.

599 600 601 602 603 604 605 606 607 608 609 610 611 612
// Packs the alpha/lower/upper/digit/punctuation flags of id into one
// integer, one IS*_MASK bit per flag.
unsigned int UNICHARSET::get_properties(UNICHAR_ID id) const {
  unsigned int packed = 0;
  packed |= this->get_isalpha(id) ? ISALPHA_MASK : 0;
  packed |= this->get_islower(id) ? ISLOWER_MASK : 0;
  packed |= this->get_isupper(id) ? ISUPPER_MASK : 0;
  packed |= this->get_isdigit(id) ? ISDIGIT_MASK : 0;
  packed |= this->get_ispunctuation(id) ? ISPUNCTUATION_MASK : 0;
  return packed;
}
T
theraysmith 已提交
613

614 615 616 617 618 619 620 621
// Returns a one-character class code for id: 'A' upper, 'a' lower, 'x' other
// alpha, '0' digit, 'p' punctuation, or 0 if none applies. The tests are made
// in that order and the first that holds wins.
char UNICHARSET::get_chartype(UNICHAR_ID id) const {
  char type_code = 0;
  if (this->get_isupper(id)) {
    type_code = 'A';
  } else if (this->get_islower(id)) {
    type_code = 'a';
  } else if (this->get_isalpha(id)) {
    type_code = 'x';
  } else if (this->get_isdigit(id)) {
    type_code = '0';
  } else if (this->get_ispunctuation(id)) {
    type_code = 'p';
  }
  return type_code;
}
T
theraysmith 已提交
622

623 624 625 626 627 628 629 630 631 632
void UNICHARSET::unichar_insert(const char* const unichar_repr,
                                OldUncleanUnichars old_style) {
  if (old_style == OldUncleanUnichars::kTrue) old_style_included_ = true;
  string cleaned =
      old_style_included_ ? unichar_repr : CleanupString(unichar_repr);
  if (!cleaned.empty() && !ids.contains(cleaned.data(), cleaned.size())) {
    const char* str = cleaned.c_str();
    GenericVector<int> encoding;
    if (!old_style_included_ &&
        encode_string(str, true, &encoding, nullptr, nullptr))
T
theraysmith 已提交
633
      return;
634
    if (size_used == size_reserved) {
T
theraysmith 已提交
635 636 637 638 639
      if (size_used == 0)
        reserve(8);
      else
        reserve(2 * size_used);
    }
640 641 642 643 644 645 646 647 648 649
    int index = 0;
    do {
      if (index > UNICHAR_LEN) {
        fprintf(stderr, "Utf8 buffer too big, size>%d for %s\n", UNICHAR_LEN,
                unichar_repr);
        return;
      }
      unichars[size_used].representation[index++] = *str++;
    } while (*str != '\0');
    unichars[size_used].representation[index] = '\0';
T
theraysmith 已提交
650 651 652 653 654
    this->set_script(size_used, null_script);
    // If the given unichar_repr represents a fragmented character, set
    // fragment property to a pointer to CHAR_FRAGMENT class instance with
    // information parsed from the unichar representation. Use the script
    // of the base unichar for the fragmented character if possible.
655 656
    CHAR_FRAGMENT* frag =
        CHAR_FRAGMENT::parse_from_string(unichars[size_used].representation);
T
theraysmith 已提交
657 658 659 660 661
    this->unichars[size_used].properties.fragment = frag;
    if (frag != NULL && this->contains_unichar(frag->get_unichar())) {
      this->unichars[size_used].properties.script_id =
        this->get_script(frag->get_unichar());
    }
662
    this->unichars[size_used].properties.enabled = true;
663
    ids.insert(unichars[size_used].representation, size_used);
T
theraysmith 已提交
664 665 666 667
    ++size_used;
  }
}

T
theraysmith 已提交
668
bool UNICHARSET::contains_unichar(const char* const unichar_repr) const {
669 670 671
  string cleaned =
      old_style_included_ ? unichar_repr : CleanupString(unichar_repr);
  return ids.contains(cleaned.data(), cleaned.size());
T
theraysmith 已提交
672 673
}

T
theraysmith 已提交
674 675 676 677 678
bool UNICHARSET::contains_unichar(const char* const unichar_repr,
                                  int length) const {
  if (length == 0) {
    return false;
  }
679 680 681
  string cleaned(unichar_repr, length);
  if (!old_style_included_) cleaned = CleanupString(unichar_repr, length);
  return ids.contains(cleaned.data(), cleaned.size());
682 683
}

T
theraysmith 已提交
684 685
// Returns true if the stored representation of unichar_id is byte-for-byte
// equal to unichar_repr.
bool UNICHARSET::eq(UNICHAR_ID unichar_id,
                    const char* const unichar_repr) const {
  const char* stored = this->id_to_unichar(unichar_id);
  return !strcmp(stored, unichar_repr);
}

689 690 691 692 693
// Writes the textual form of the unicharset into *str: a first line with the
// number of entries, then one line per entry. The entry whose representation
// is a single space is written as "NULL" with only its properties, script
// and other_case; every other entry carries its full range/stat data and a
// trailing "# debug string" comment. Always returns true.
bool UNICHARSET::save_to_string(STRING *str) const {
  const int kFileBufSize = 1024;
  char line[kFileBufSize + 1];
  snprintf(line, kFileBufSize, "%d\n", this->size());
  *str = line;
  for (UNICHAR_ID id = 0; id < this->size(); ++id) {
    // Gather everything that may be written for this entry.
    int min_bottom, max_bottom, min_top, max_top;
    get_top_bottom(id, &min_bottom, &max_bottom, &min_top, &max_top);
    float width, width_sd;
    get_width_stats(id, &width, &width_sd);
    float bearing, bearing_sd;
    get_bearing_stats(id, &bearing, &bearing_sd);
    float advance, advance_sd;
    get_advance_stats(id, &advance, &advance_sd);
    const unsigned int properties = this->get_properties(id);

    const char* repr = this->id_to_unichar(id);
    const bool is_space = (strcmp(repr, " ") == 0);
    if (is_space) {
      // Short form for the space entry.
      snprintf(line, kFileBufSize, "%s %x %s %d\n", "NULL", properties,
               this->get_script_from_script_id(this->get_script(id)),
               this->get_other_case(id));
    } else {
      snprintf(line, kFileBufSize,
               "%s %x %d,%d,%d,%d,%g,%g,%g,%g,%g,%g %s %d %d %d %s\t# %s\n",
               repr, properties,
               min_bottom, max_bottom, min_top, max_top, width, width_sd,
               bearing, bearing_sd, advance, advance_sd,
               this->get_script_from_script_id(this->get_script(id)),
               this->get_other_case(id), this->get_direction(id),
               this->get_mirror(id), this->get_normed_unichar(id),
               this->debug_str(id).string());
    }
    *str += line;
  }
  return true;
}

724
// TODO(rays) Replace with TFile everywhere.
725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771
// Reads lines, fgets-style, out of a memory buffer of known size.
// The buffer is not copied, so it must outlive this object.
class InMemoryFilePointer {
 public:
  // memory points at mem_size bytes of data; reading starts at the beginning.
  InMemoryFilePointer(const char *memory, int mem_size)
      : memory_(memory), fgets_ptr_(memory), mem_size_(mem_size) { }

  // Copies bytes into orig_dst until a '\n' has been copied, size - 1 bytes
  // have been copied, or the buffer is exhausted, then null-terminates.
  // Returns orig_dst, or a null pointer if no byte was copied.
  // With size < 1 nothing is written: the result is orig_dst if unread data
  // remains and a null pointer otherwise.
  char *fgets(char *orig_dst, int size) {
    const char *src_end = memory_ + mem_size_;
    if (size < 1) {
      return fgets_ptr_ < src_end ? orig_dst : nullptr;
    }
    // Computed only after the guard above, so that it never points before
    // the start of orig_dst.
    char *const dst_end = orig_dst + size - 1;

    char *dst = orig_dst;
    while (fgets_ptr_ < src_end && dst < dst_end) {
      const char ch = *fgets_ptr_++;
      *dst++ = ch;
      if (ch == '\n') break;  // The newline is kept, as with ::fgets.
    }
    *dst = '\0';
    return (dst == orig_dst) ? nullptr : orig_dst;
  }

 private:
  const char *memory_;     // Start of the buffer.
  const char *fgets_ptr_;  // Next unread byte.
  const int mem_size_;     // Size of the buffer in bytes.
};

bool UNICHARSET::load_from_inmemory_file(
    const char *memory, int mem_size, bool skip_fragments) {
  InMemoryFilePointer mem_fp(memory, mem_size);
  TessResultCallback2<char *, char *, int> *fgets_cb =
      NewPermanentTessCallback(&mem_fp, &InMemoryFilePointer::fgets);
  bool success = load_via_fgets(fgets_cb, skip_fragments);
  delete fgets_cb;
  return success;
}

// Adapts a C FILE stream to the fgets(dst, size) member signature that is
// wrapped in a callback for load_via_fgets. The stream is only borrowed:
// this class has no destructor and never closes it.
class LocalFilePointer {
 public:
  // explicit, so that a FILE* is never silently converted into a reader.
  explicit LocalFilePointer(FILE *stream) : fp_(stream) {}
  // Forwards to ::fgets on the wrapped stream and returns its result.
  char *fgets(char *dst, int size) {
    return ::fgets(dst, size, fp_);
  }
 private:
  FILE *fp_;  // Not owned.
};

772
bool UNICHARSET::load_from_file(FILE *file, bool skip_fragments) {
773 774 775 776 777 778 779 780
  LocalFilePointer lfp(file);
  TessResultCallback2<char *, char *, int> *fgets_cb =
      NewPermanentTessCallback(&lfp, &LocalFilePointer::fgets);
  bool success = load_via_fgets(fgets_cb, skip_fragments);
  delete fgets_cb;
  return success;
}

781 782 783 784 785 786 787 788
// Loads the unicharset from a tesseract::TFile.
// TFile::FGets is wrapped in a callback and the actual parsing is done by
// load_via_fgets. skip_fragments is passed through unchanged.
// Returns what load_via_fgets returns.
bool UNICHARSET::load_from_file(tesseract::TFile *file, bool skip_fragments) {
  TessResultCallback2<char *, char *, int> *line_reader =
      NewPermanentTessCallback(file, &tesseract::TFile::FGets);
  const bool loaded = load_via_fgets(line_reader, skip_fragments);
  // The callback was created here, so it is released here as well.
  delete line_reader;
  return loaded;
}

789 790 791
// Loads the unicharset line by line through fgets_cb, which is invoked as
// fgets_cb->Run(buffer, buffer_size); a NULL result is treated as a failed
// read. Any existing content is cleared first.
//
// The first line holds the number of entries. Each following line describes
// one unichar, in one of these formats (tried from most to least complete):
//   unichar props b,B,t,T,w,wsd,br,brsd,a,asd script other_case dir mirror normed
//   unichar props b,B,t,T,w,wsd,br,brsd,a,asd script other_case dir mirror
//   unichar props b,B,t,T script other_case dir mirror
//   unichar props b,B,t,T script other_case
//   unichar props script other_case
//   unichar props script
//   unichar props
// where props is a hexadecimal bit set of the IS*_MASK flags. Fields that a
// line does not provide keep the defaults set at the top of the loop.
//
// If skip_fragments is true, entries that parse as character fragments with
// more than one piece are not inserted.
//
// Returns false as soon as a line is missing or matches none of the formats
// (entries loaded so far are kept); otherwise runs post_load_setup() and
// returns true.
bool UNICHARSET::load_via_fgets(
    TessResultCallback2<char *, char *, int> *fgets_cb,
    bool skip_fragments) {
  int unicharset_size;
  char buffer[256];

  this->clear();
  if (fgets_cb->Run(buffer, sizeof(buffer)) == NULL ||
      sscanf(buffer, "%d", &unicharset_size) != 1) {
    return false;
  }
  this->reserve(unicharset_size);
  for (UNICHAR_ID id = 0; id < unicharset_size; ++id) {
    char unichar[256];
    unsigned int properties = 0;
    char script[64];

    strcpy(script, null_script);
    int min_bottom = 0;
    int max_bottom = MAX_UINT8;
    int min_top = 0;
    int max_top = MAX_UINT8;
    float width = 0.0f;
    float width_sd = 0.0f;
    float bearing = 0.0f;
    float bearing_sd = 0.0f;
    float advance = 0.0f;
    float advance_sd = 0.0f;
    // TODO(eger): check that this default is ok
    // after enabling BiDi iterator for Arabic+Cube.
    int direction = UNICHARSET::U_LEFT_TO_RIGHT;
    UNICHAR_ID other_case = id;
    UNICHAR_ID mirror = id;
    char normed[64];
    normed[0] = '\0';
    // v ends up as the field count of the format that matched. The string
    // conversions carry explicit widths so that they can never write past
    // unichar[256], script[64] or normed[64].
    int v = -1;
    if (fgets_cb->Run(buffer, sizeof (buffer)) == NULL ||
        ((v = sscanf(buffer,
                     "%255s %x %d,%d,%d,%d,%g,%g,%g,%g,%g,%g %63s %d %d %d %63s",
                     unichar, &properties,
                     &min_bottom, &max_bottom, &min_top, &max_top,
                     &width, &width_sd, &bearing, &bearing_sd,
                     &advance, &advance_sd, script, &other_case,
                     &direction, &mirror, normed)) != 17 &&
         (v = sscanf(buffer,
                     "%255s %x %d,%d,%d,%d,%g,%g,%g,%g,%g,%g %63s %d %d %d",
                     unichar, &properties,
                     &min_bottom, &max_bottom, &min_top, &max_top,
                     &width, &width_sd, &bearing, &bearing_sd,
                     &advance, &advance_sd, script, &other_case,
                     &direction, &mirror)) != 16 &&
         (v = sscanf(buffer, "%255s %x %d,%d,%d,%d %63s %d %d %d",
                     unichar, &properties,
                     &min_bottom, &max_bottom, &min_top, &max_top,
                     script, &other_case, &direction, &mirror)) != 10 &&
         (v = sscanf(buffer, "%255s %x %d,%d,%d,%d %63s %d",
                     unichar, &properties,
                     &min_bottom, &max_bottom, &min_top, &max_top,
                     script, &other_case)) != 8 &&
         (v = sscanf(buffer, "%255s %x %63s %d", unichar, &properties,
                     script, &other_case)) != 4 &&
         (v = sscanf(buffer, "%255s %x %63s",
                     unichar, &properties, script)) != 3 &&
         (v = sscanf(buffer, "%255s %x", unichar, &properties)) != 2)) {
      return false;
    }

    // Skip fragments if needed.
    CHAR_FRAGMENT *frag = NULL;
    if (skip_fragments && (frag = CHAR_FRAGMENT::parse_from_string(unichar))) {
      int num_pieces = frag->get_total();
      delete frag;
      // Skip multi-element fragments, but keep singles like UNICHAR_BROKEN in.
      if (num_pieces > 1)
        continue;
    }
    // Insert unichar into unicharset and set its properties.
    // The literal text "NULL" in the file stands for the space character.
    if (strcmp(unichar, "NULL") == 0)
      this->unichar_insert(" ");
    else
      this->unichar_insert_backwards_compatible(unichar);

    this->set_isalpha(id, properties & ISALPHA_MASK);
    this->set_islower(id, properties & ISLOWER_MASK);
    this->set_isupper(id, properties & ISUPPER_MASK);
    this->set_isdigit(id, properties & ISDIGIT_MASK);
    this->set_ispunctuation(id, properties & ISPUNCTUATION_MASK);
    this->set_isngram(id, false);
    this->set_script(id, script);
    this->unichars[id].properties.enabled = true;
    this->set_top_bottom(id, min_bottom, max_bottom, min_top, max_top);
    this->set_width_stats(id, width, width_sd);
    this->set_bearing_stats(id, bearing, bearing_sd);
    this->set_advance_stats(id, advance, advance_sd);
    this->set_direction(id, static_cast<UNICHARSET::Direction>(direction));
    // other_case, mirror and normed are only present in the formats with at
    // least 4, 10 and 17 fields respectively; otherwise fall back to the
    // entry itself.
    ASSERT_HOST(other_case < unicharset_size);
    this->set_other_case(id, (v>3) ? other_case : id);
    ASSERT_HOST(mirror < unicharset_size);
    this->set_mirror(id, (v>8) ? mirror : id);
    this->set_normed(id, (v>16) ? normed : unichar);
  }
  post_load_setup();
  return true;
}

// Sets up internal data after loading the file, based on the char
// properties. Called from load_from_file, but also needs to be run
// during set_unicharset_properties.
void UNICHARSET::post_load_setup() {
  // Number of alpha chars with the case property minus those without,
  // in order to determine that half the alpha chars have case.
  int net_case_alphas = 0;
  int x_height_alphas = 0;
  int cap_height_alphas = 0;
  top_bottom_set_ = false;
  for (UNICHAR_ID id = 0; id < size_used; ++id) {
    int min_bottom = 0;
    int max_bottom = MAX_UINT8;
    int min_top = 0;
    int max_top = MAX_UINT8;
    get_top_bottom(id, &min_bottom, &max_bottom, &min_top, &max_top);
    if (min_top > 0)
      top_bottom_set_ = true;
    if (get_isalpha(id)) {
      if (get_islower(id) || get_isupper(id))
        ++net_case_alphas;
      else
        --net_case_alphas;
      if (min_top < kMeanlineThreshold && max_top < kMeanlineThreshold)
        ++x_height_alphas;
      else if (min_top > kMeanlineThreshold && max_top > kMeanlineThreshold)
        ++cap_height_alphas;
    }
920
    set_normed_ids(id);
T
theraysmith 已提交
921
  }
922

923 924
  script_has_upper_lower_ = net_case_alphas > 0;
  script_has_xheight_ = script_has_upper_lower_ ||
925 926
      (x_height_alphas > cap_height_alphas * kMinXHeightFraction &&
       cap_height_alphas > x_height_alphas * kMinCapHeightFraction);
T
theraysmith 已提交
927 928 929 930 931 932 933 934

  null_sid_ = get_script_id_from_name(null_script);
  ASSERT_HOST(null_sid_ == 0);
  common_sid_ = get_script_id_from_name("Common");
  latin_sid_ = get_script_id_from_name("Latin");
  cyrillic_sid_ = get_script_id_from_name("Cyrillic");
  greek_sid_ = get_script_id_from_name("Greek");
  han_sid_ = get_script_id_from_name("Han");
935 936
  hiragana_sid_ = get_script_id_from_name("Hiragana");
  katakana_sid_ = get_script_id_from_name("Katakana");
937 938
  thai_sid_ = get_script_id_from_name("Thai");
  hangul_sid_ = get_script_id_from_name("Hangul");
939

940 941
  // Compute default script. Use the highest-counting alpha script, that is
  // not the common script, as that still contains some "alphas".
942 943
  int* script_counts = new int[script_table_size_used];
  memset(script_counts, 0, sizeof(*script_counts) * script_table_size_used);
944 945 946 947 948
  for (int id = 0; id < size_used; ++id) {
    if (get_isalpha(id)) {
      ++script_counts[get_script(id)];
    }
  }
949 950 951 952 953 954 955 956
  default_sid_ = 0;
  for (int s = 1; s < script_table_size_used; ++s) {
    if (script_counts[s] > script_counts[default_sid_] && s != common_sid_)
      default_sid_ = s;
  }
  delete [] script_counts;
}

957 958 959 960 961 962 963 964 965 966 967 968 969
// Returns true if right_to_left scripts are significant in the unicharset,
// but without being so sensitive that "universal" unicharsets containing
// characters from many scripts, like orientation and script detection,
// look like they are right_to_left.
// Concretely: counts the unichars whose direction is U_LEFT_TO_RIGHT and
// those whose direction is U_RIGHT_TO_LEFT, U_RIGHT_TO_LEFT_ARABIC or
// U_ARABIC_NUMBER, and returns true only if the second count is strictly
// larger. Unichars with any other direction are ignored.
bool UNICHARSET::major_right_to_left() const {
  int ltr_count = 0;
  int rtl_count = 0;
  for (int id = 0; id < size_used; ++id) {
    const int dir = get_direction(id);
    if (dir == UNICHARSET::U_LEFT_TO_RIGHT) {
      ++ltr_count;
    } else if (dir == UNICHARSET::U_RIGHT_TO_LEFT ||
               dir == UNICHARSET::U_RIGHT_TO_LEFT_ARABIC ||
               dir == UNICHARSET::U_ARABIC_NUMBER) {
      ++rtl_count;
    }
  }
  return rtl_count > ltr_count;
}
973 974 975 976

// Set a whitelist and/or blacklist of characters to recognize.
// An empty or NULL whitelist enables everything (minus any blacklist).
// An empty or NULL blacklist disables nothing.
R
Ray Smith 已提交
977
// An empty or NULL blacklist has no effect.
978
void UNICHARSET::set_black_and_whitelist(const char* blacklist,
R
Ray Smith 已提交
979 980
                                         const char* whitelist,
                                         const char* unblacklist) {
981 982 983 984 985 986
  bool def_enabled = whitelist == NULL || whitelist[0] == '\0';
  // Set everything to default
  for (int ch = 0; ch < size_used; ++ch)
    unichars[ch].properties.enabled = def_enabled;
  if (!def_enabled) {
    // Enable the whitelist.
987 988 989 990 991
    GenericVector<UNICHAR_ID> encoding;
    encode_string(whitelist, false, &encoding, NULL, NULL);
    for (int i = 0; i < encoding.size(); ++i) {
      if (encoding[i] != INVALID_UNICHAR_ID)
        unichars[encoding[i]].properties.enabled = true;
992 993 994 995
    }
  }
  if (blacklist != NULL && blacklist[0] != '\0') {
    // Disable the blacklist.
996 997 998 999 1000
    GenericVector<UNICHAR_ID> encoding;
    encode_string(blacklist, false, &encoding, NULL, NULL);
    for (int i = 0; i < encoding.size(); ++i) {
      if (encoding[i] != INVALID_UNICHAR_ID)
        unichars[encoding[i]].properties.enabled = false;
1001 1002
    }
  }
R
Ray Smith 已提交
1003 1004 1005 1006 1007 1008 1009 1010 1011
  if (unblacklist != NULL && unblacklist[0] != '\0') {
    // Re-enable the unblacklist.
    GenericVector<UNICHAR_ID> encoding;
    encode_string(unblacklist, false, &encoding, NULL, NULL);
    for (int i = 0; i < encoding.size(); ++i) {
      if (encoding[i] != INVALID_UNICHAR_ID)
        unichars[encoding[i]].properties.enabled = true;
    }
  }
1012 1013
}

1014 1015 1016 1017 1018 1019 1020
// Returns true if there are any repeated unicodes in the normalized
// text of any unichar-id in the unicharset.
bool UNICHARSET::AnyRepeatedUnicodes() const {
  int start_id = 0;
  if (has_special_codes()) start_id = SPECIAL_UNICHAR_CODES_COUNT;
  for (int id = start_id; id < size_used; ++id) {
    // Convert to unicodes.
1021 1022 1023
    std::vector<char32> unicodes = UNICHAR::UTF8ToUTF32(get_normed_unichar(id));
    for (int u = 1; u < unicodes.size(); ++u) {
      if (unicodes[u - 1] == unicodes[u]) return true;
1024 1025 1026 1027 1028
    }
  }
  return false;
}

T
theraysmith 已提交
1029
int UNICHARSET::add_script(const char* script) {
1030 1031
  for (int i = 0; i < script_table_size_used; ++i) {
    if (strcmp(script, script_table[i]) == 0)
T
theraysmith 已提交
1032
      return i;
1033 1034 1035 1036
  }
  if (script_table_size_reserved == 0) {
    script_table_size_reserved = 8;
    script_table = new char*[script_table_size_reserved];
1037 1038 1039 1040
  } else if (script_table_size_used >= script_table_size_reserved) {
    assert(script_table_size_used == script_table_size_reserved);
    script_table_size_reserved += script_table_size_reserved;
    char** new_script_table = new char*[script_table_size_reserved];
1041 1042
    memcpy(new_script_table, script_table,
           script_table_size_used * sizeof(char*));
1043 1044 1045 1046 1047
    delete[] script_table;
    script_table = new_script_table;
  }
  script_table[script_table_size_used] = new char[strlen(script) + 1];
  strcpy(script_table[script_table_size_used], script);
T
theraysmith 已提交
1048 1049 1050
  return script_table_size_used++;
}

1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065
// Builds the textual form of a character fragment from its unichar, its
// position pos and the total number of pieces.
// A fragment with total == 1 is represented by the unichar alone. Otherwise
// the result is: separator, unichar, separator, pos, then either the natural
// flag (if natural is true) or another separator, and finally total.
STRING CHAR_FRAGMENT::to_string(const char *unichar, int pos, int total,
                                bool natural) {
  if (total == 1) {
    return STRING(unichar);
  }
  // Format the numeric tail first; the character placed before total tells
  // natural fragments apart from the others.
  const char total_marker = natural ? kNaturalFlag : kSeparator;
  char tail[kMaxLen];
  snprintf(tail, kMaxLen, "%c%d%c%d", kSeparator, pos, total_marker, total);

  STRING fragment_str = "";
  fragment_str += kSeparator;
  fragment_str += unichar;
  fragment_str += tail;
  return fragment_str;
}

T
theraysmith 已提交
1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085
// Parses the textual form of a character fragment: separator, unichar,
// separator, position, then a separator or the natural flag, and finally the
// total number of pieces, with nothing after it.
// Returns a newly allocated CHAR_FRAGMENT that the caller must delete, or
// NULL if string does not have that form.
CHAR_FRAGMENT *CHAR_FRAGMENT::parse_from_string(const char *string) {
  const char *ptr = string;
  int len = strlen(string);
  if (len < kMinLen || *ptr != kSeparator) {
    return NULL;  // this string can not represent a fragment
  }
  ptr++;  // move to the next character
  // Measure the unichar: everything up to the next separator, stepping one
  // UTF-8 character at a time.
  int step = 0;
  while ((ptr + step) < (string + len) && *(ptr + step) != kSeparator) {
    const int char_step = UNICHAR::utf8_step(ptr + step);
    if (char_step <= 0) {
      // A step that does not advance would keep this loop spinning forever.
      // NOTE(review): presumably utf8_step reports invalid UTF-8 this way;
      // confirm against its definition.
      return NULL;
    }
    step += char_step;
  }
  if (step == 0 || step > UNICHAR_LEN) {
    return NULL;  // no character for unichar or the character is too long
  }
  char unichar[UNICHAR_LEN + 1];
  strncpy(unichar, ptr, step);
  unichar[step] = '\0';  // null terminate unichar
  ptr += step;  // move to the next fragment separator
  int pos = 0;
  int total = 0;
  bool natural = false;
  char *end_ptr = NULL;
  // Two numeric fields follow: pos (i == 0) and total (i == 1). Each must be
  // introduced by a separator, except that total may instead be introduced
  // by the natural flag.
  for (int i = 0; i < 2; i++) {
    if (ptr > string + len || *ptr != kSeparator) {
      if (i == 1 && *ptr == kNaturalFlag)
        natural = true;
      else
        return NULL;  // Failed to parse fragment representation.
    }
    ptr++;  // move to the next character
    const int value = static_cast<int>(strtol(ptr, &end_ptr, 10));
    if (i == 0) {
      pos = value;
    } else {
      total = value;
    }
    ptr = end_ptr;
  }
  if (ptr != string + len) {
    return NULL;  // malformed fragment representation
  }
  CHAR_FRAGMENT *fragment = new CHAR_FRAGMENT();
  fragment->set_all(unichar, pos, total, natural);
  return fragment;
}

// Returns the index in script_table of the entry whose name equals
// script_name, or 0 if there is no such entry.
int UNICHARSET::get_script_id_from_name(const char* script_name) const {
  for (int i = 0; i < script_table_size_used; ++i) {
    if (strcmp(script_name, script_table[i]) == 0)
      return i;
  }
  return 0;  // 0 is always the null_script
}
1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143

// Removes/replaces content that belongs in rendered text, but not in the
// unicharset.
// Processes at most length bytes of utf8_str, stopping early at a NUL byte.
// Wherever the remaining input starts with a key from kCleanupMaps (first
// match in table order wins), the key is replaced by its mapped text; every
// other byte is copied unchanged. Returns the cleaned-up copy.
/* static */
string UNICHARSET::CleanupString(const char* utf8_str, int length) {
  string result;
  // Guarded: a negative length would convert to a huge reserve request.
  if (length > 0) result.reserve(length);
  int remaining = length;
  while (remaining > 0 && *utf8_str != '\0') {
    int key_index = 0;
    int matched = 0;
    const char* key;
    while ((key = kCleanupMaps[key_index][0]) != nullptr) {
      // Compare the key against the input, never looking past the bytes
      // that are still within length.
      int match = 0;
      while (key[match] != '\0' && match < remaining &&
             key[match] == utf8_str[match]) {
        ++match;
      }
      if (key[match] == '\0') {
        matched = match;
        break;
      }
      ++key_index;
    }
    if (key == nullptr) {
      // No key starts here: copy the byte through.
      result.push_back(*utf8_str);
      ++utf8_str;
      --remaining;
    } else {
      result.append(kCleanupMaps[key_index][1]);
      utf8_str += matched;
      // Charge every consumed byte against the budget so that no more than
      // length bytes are processed; always charge at least one so the loop
      // terminates even for an empty key.
      remaining -= (matched > 0) ? matched : 1;
    }
  }
  return result;
}