///////////////////////////////////////////////////////////////////////
// File:        unicharcompress.cpp
// Description: Unicode re-encoding using a sequence of smaller numbers in
//              place of a single large code for CJK, similarly for Indic,
//              and dissection of ligatures for other scripts.
// Author:      Ray Smith
// Created:     Wed Mar 04 14:45:01 PST 2015
//
// (C) Copyright 2015, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////

#include "unicharcompress.h"
#include "tprintf.h"

namespace tesseract {

// String used to represent the null_id in direct_set.
const char* kNullChar = "<nul>";

// Local struct used only for processing the radical-stroke table.
struct RadicalStroke {
  RadicalStroke() : num_strokes(0) {}
  RadicalStroke(const STRING& r, int s) : radical(r), num_strokes(s) {}

  bool operator==(const RadicalStroke& other) const {
    return radical == other.radical && num_strokes == other.num_strokes;
  }

  // The radical is encoded as a string because its format is an int with an
  // optional ' mark to indicate a simplified shape. To treat these as
  // distinct, we use a string and a UNICHARSET to do the integer mapping.
  STRING radical;
  // We treat the number of strokes as dense and just take its face value from
  // the table.
  int num_strokes;
};

// Hash functor for RadicalStroke.
struct RadicalStrokedHash {
  size_t operator()(const RadicalStroke& rs) const {
    size_t result = rs.num_strokes;
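    // Mix the radical string's characters into the bits above the stroke
    // count; collisions are harmless since this is only used as a hash.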
    for (int i = 0; i < rs.radical.length(); ++i) {
      result ^= rs.radical[i] << (6 * i + 8);
    }
    return result;
  }
};

// A hash map to convert unicodes to a radical,stroke pair.
typedef std::unordered_map<int, RadicalStroke> RSMap;
// A hash map to count occurrences of each radical,stroke pair.
typedef std::unordered_map<RadicalStroke, int, RadicalStrokedHash> RSCounts;

// Helper function builds the RSMap from the radical-stroke file, which has
// already been read into a STRING. Returns false on error.
// The radical_stroke_table is non-const because it gets split and the caller
// is unlikely to want to use it again.
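// Each line is expected to match one of the sscanf formats below, i.e.
// "<hex codepoint>\t<radical>['].<strokes>"; for example (value chosen purely
// for illustration) a line "4E2D\t2.3" maps U+4E2D to radical "2", 3 strokes.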
static bool DecodeRadicalStrokeTable(STRING* radical_stroke_table,
                                     RSMap* radical_map) {
  GenericVector<STRING> lines;
  radical_stroke_table->split('\n', &lines);
  for (int i = 0; i < lines.size(); ++i) {
    if (lines[i].length() == 0 || lines[i][0] == '#') continue;
    int unicode, radical, strokes;
    STRING str_radical;
    if (sscanf(lines[i].string(), "%x\t%d.%d", &unicode, &radical, &strokes) ==
        3) {
      str_radical.add_str_int("", radical);
    } else if (sscanf(lines[i].string(), "%x\t%d'.%d", &unicode, &radical,
                      &strokes) == 3) {
      str_radical.add_str_int("'", radical);
    } else {
      tprintf("Invalid format in radical stroke table at line %d: %s\n", i,
              lines[i].string());
      return false;
    }
    (*radical_map)[unicode] = RadicalStroke(str_radical, strokes);
  }
  return true;
}

UnicharCompress::UnicharCompress() : code_range_(0) {}
UnicharCompress::UnicharCompress(const UnicharCompress& src) { *this = src; }
UnicharCompress::~UnicharCompress() { Cleanup(); }
UnicharCompress& UnicharCompress::operator=(const UnicharCompress& src) {
  Cleanup();
  encoder_ = src.encoder_;
  code_range_ = src.code_range_;
  SetupDecoder();
  return *this;
}

// Computes the encoding for the given unicharset. It is a requirement that
// the file training/langdata/radical-stroke.txt have been read into the
// input string radical_stroke_table.
// Returns false if the encoding cannot be constructed.
bool UnicharCompress::ComputeEncoding(const UNICHARSET& unicharset, int null_id,
                                      STRING* radical_stroke_table) {
  RSMap radical_map;
  if (!DecodeRadicalStrokeTable(radical_stroke_table, &radical_map))
    return false;
  encoder_.clear();
  UNICHARSET direct_set;
  UNICHARSET radicals;
  // To avoid unused codes, clear the special codes from the unicharsets.
  direct_set.clear();
  radicals.clear();
  // Always keep space as 0.
  direct_set.unichar_insert(" ");
  // Null char is next if we have one.
  if (null_id >= 0) {
    direct_set.unichar_insert(kNullChar);
  }
  RSCounts radical_counts;
  // In the initial map, codes [0, unicharset.size()) are
  // reserved for non-han/hangul sequences of 1 or more unicodes.
  int hangul_offset = unicharset.size();
  // Hangul takes the next range [hangul_offset, hangul_offset + kTotalJamos).
  const int kTotalJamos = kLCount + kVCount + kTCount;
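  // (Per the Unicode Hangul Jamo model, kLCount, kVCount and kTCount are
  // 19, 21 and 28, so the three syllable parts get disjoint sub-ranges.)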
  // Han takes the codes beyond hangul_offset + kTotalJamos. Since it is hard
  // to know the number of radicals and strokes in advance, we initially use
  // the same code range for all 3 Han code positions, and fix them up later.
  int han_offset = hangul_offset + kTotalJamos;
  int max_num_strokes = -1;
  for (int u = 0; u <= unicharset.size(); ++u) {
    bool self_normalized = false;
    // As a special case, we allow null_id to be equal to unicharset.size() in
    // case there is no space in the unicharset for it.
    if (u == unicharset.size()) {
      if (u == null_id) {
        self_normalized = true;
      } else {
        break;  // Finished.
      }
    } else {
      self_normalized = strcmp(unicharset.id_to_unichar(u),
                               unicharset.get_normed_unichar(u)) == 0;
    }
    RecodedCharID code;
    // Convert to unicodes.
    std::vector<char32> unicodes;
    if (u < unicharset.size() &&
        (unicodes = UNICHAR::UTF8ToUTF32(unicharset.get_normed_unichar(u)))
                .size() == 1) {
      // Check single unicodes for Hangul/Han and encode if so.
      int unicode = unicodes[0];
      int leading, vowel, trailing;
      auto it = radical_map.find(unicode);
      if (it != radical_map.end()) {
        // This is Han. Convert to radical, stroke, index.
        if (!radicals.contains_unichar(it->second.radical.string())) {
          radicals.unichar_insert(it->second.radical.string());
        }
        int radical = radicals.unichar_to_id(it->second.radical.string());
        int num_strokes = it->second.num_strokes;
        int num_samples = radical_counts[it->second]++;
        if (num_strokes > max_num_strokes) max_num_strokes = num_strokes;
        code.Set3(radical + han_offset, num_strokes + han_offset,
                  num_samples + han_offset);
      } else if (DecomposeHangul(unicode, &leading, &vowel, &trailing)) {
        // This is Hangul. Since we know the exact size of each part at compile
        // time, it gets the bottom set of codes.
        code.Set3(leading + hangul_offset, vowel + kLCount + hangul_offset,
                  trailing + kLCount + kVCount + hangul_offset);
      }
    }
    // If the code is still empty, it wasn't Han or Hangul.
    if (code.length() == 0) {
      // Special cases.
      if (u == UNICHAR_SPACE) {
        code.Set(0, 0);  // Space.
      } else if (u == null_id || (unicharset.has_special_codes() &&
                                  u < SPECIAL_UNICHAR_CODES_COUNT)) {
        code.Set(0, direct_set.unichar_to_id(kNullChar));
      } else {
        // Add the direct_set unichar-ids of the unicodes in sequence to the
        // code.
        for (int i = 0; i < unicodes.size(); ++i) {
          int position = code.length();
          if (position >= RecodedCharID::kMaxCodeLen) {
            tprintf("Unichar %d=%s->%s is too long to encode!!\n", u,
                    unicharset.id_to_unichar(u),
                    unicharset.get_normed_unichar(u));
            return false;
          }
          int uni = unicodes[i];
          UNICHAR unichar(uni);
          char* utf8 = unichar.utf8_str();
          if (!direct_set.contains_unichar(utf8))
            direct_set.unichar_insert(utf8);
          code.Set(position, direct_set.unichar_to_id(utf8));
          delete[] utf8;
          if (direct_set.size() > unicharset.size()) {
            // Code space got bigger!
            tprintf("Code space expanded from original unicharset!!\n");
            return false;
          }
        }
      }
    }
    code.set_self_normalized(self_normalized);
    encoder_.push_back(code);
  }
  // Now renumber Han to make all codes unique. We already added han_offset to
  // all Han. Now separate out the radical, stroke, and count codes for Han.
  // In the uniqued Han encoding, the 1st code uses the next radical_map.size()
  // values, the 2nd code uses the next max_num_strokes+1 values, and the 3rd
  // code uses the rest for the max number of duplicated radical/stroke combos.
  int num_radicals = radicals.size();
  for (int u = 0; u < unicharset.size(); ++u) {
    RecodedCharID* code = &encoder_[u];
    if ((*code)(0) >= han_offset) {
      code->Set(1, (*code)(1) + num_radicals);
      code->Set(2, (*code)(2) + num_radicals + max_num_strokes + 1);
    }
  }
  DefragmentCodeValues(null_id >= 0 ? 1 : -1);
  SetupDecoder();
  return true;
}
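
// Illustrative usage sketch (the local variable names here are assumptions,
// not part of the API):
//   UnicharCompress compress;
//   STRING rs_table;  // filled with the contents of radical-stroke.txt.
//   if (compress.ComputeEncoding(unicharset, null_id, &rs_table)) {
//     RecodedCharID code;
//     if (compress.EncodeUnichar(unichar_id, &code) > 0) {
//       int decoded = compress.DecodeUnichar(code);  // Usually == unichar_id.
//     }
//   }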

// Sets up an encoder that doesn't change the unichars at all, so it just
// passes them through unchanged.
void UnicharCompress::SetupPassThrough(const UNICHARSET& unicharset) {
  GenericVector<RecodedCharID> codes;
  for (int u = 0; u < unicharset.size(); ++u) {
    RecodedCharID code;
    code.Set(0, u);
    codes.push_back(code);
  }
  SetupDirect(codes);
}

// Sets up an encoder directly using the given encoding vector, which maps
// unichar_ids to the given codes.
void UnicharCompress::SetupDirect(const GenericVector<RecodedCharID>& codes) {
  encoder_ = codes;
  ComputeCodeRange();
  SetupDecoder();
}

// Renumbers codes to eliminate unused values.
void UnicharCompress::DefragmentCodeValues(int encoded_null) {
  // There may not be any Hangul, but even if there is, it is possible that not
  // all codes are used. Likewise with the Han encoding, it is possible that not
  // all numbers of strokes are used.
  ComputeCodeRange();
  GenericVector<int> offsets;
  offsets.init_to_size(code_range_, 0);
  // Find which codes are used
  for (int c = 0; c < encoder_.size(); ++c) {
    const RecodedCharID& code = encoder_[c];
    for (int i = 0; i < code.length(); ++i) {
      offsets[code(i)] = 1;
    }
  }
  // Compute offsets based on code use.
  int offset = 0;
  for (int i = 0; i < offsets.size(); ++i) {
    // If not used, decrement everything above here.
    // We are moving encoded_null to the end, so it is not "used".
    if (offsets[i] == 0 || i == encoded_null) {
      --offset;
    } else {
      offsets[i] = offset;
    }
  }
  if (encoded_null >= 0) {
    // The encoded_null is moved to the end, for the benefit of TensorFlow;
    // its new value will be offsets.size() + offsets.back().
    offsets[encoded_null] = offsets.size() + offsets.back() - encoded_null;
  }
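  // Illustrative example (made-up values): with code_range_ == 4, all codes
  // used and encoded_null == 1, offsets end up as {0, 2, -1, -1}, so the loop
  // below maps 0->0, 2->1, 3->2 and the null code 1->3 (the end of the range).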
  // Now apply the offsets.
  for (int c = 0; c < encoder_.size(); ++c) {
    RecodedCharID* code = &encoder_[c];
    for (int i = 0; i < code->length(); ++i) {
      int value = (*code)(i);
      code->Set(i, value + offsets[value]);
    }
  }
  ComputeCodeRange();
}

// Encodes a single unichar_id. Returns the length of the code, or zero if
// the input is invalid, and returns the encoding itself in *code.
int UnicharCompress::EncodeUnichar(int unichar_id, RecodedCharID* code) const {
  if (unichar_id < 0 || unichar_id >= encoder_.size()) return 0;
  *code = encoder_[unichar_id];
  return code->length();
}

// Decodes code, returning the original unichar-id, or
// INVALID_UNICHAR_ID if the input is invalid.
int UnicharCompress::DecodeUnichar(const RecodedCharID& code) const {
  int len = code.length();
  if (len <= 0 || len > RecodedCharID::kMaxCodeLen) return INVALID_UNICHAR_ID;
  auto it = decoder_.find(code);
  if (it == decoder_.end()) return INVALID_UNICHAR_ID;
  return it->second;
}

// Writes to the given file. Returns false in case of error.
bool UnicharCompress::Serialize(TFile* fp) const {
  return encoder_.SerializeClasses(fp);
}

// Reads from the given file. Returns false in case of error.
bool UnicharCompress::DeSerialize(TFile* fp) {
  if (!encoder_.DeSerializeClasses(fp)) return false;
  ComputeCodeRange();
  SetupDecoder();
  return true;
}

// Returns a STRING containing a text file that describes the encoding thus:
// <index>[,<index>]*<tab><UTF8-str><newline>
// In words, a comma-separated list of one or more indices, followed by a tab
// and the UTF-8 string that the code represents per line. Most simple scripts
// will encode a single index to a UTF8-string, but Chinese, Japanese, Korean
// and the Indic scripts will contain a many-to-many mapping.
// See the class comment above for details.
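// For example (indices made up for illustration), a Latin letter might yield
// a line like "61\tg", while a Han character might yield "503,517,549\t好".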
STRING UnicharCompress::GetEncodingAsString(
    const UNICHARSET& unicharset) const {
  STRING encoding;
  for (int c = 0; c < encoder_.size(); ++c) {
    const RecodedCharID& code = encoder_[c];
    if (0 < c && c < SPECIAL_UNICHAR_CODES_COUNT && code == encoder_[c - 1]) {
      // Don't show the duplicate entry.
      continue;
    }
    encoding.add_str_int("", code(0));
    for (int i = 1; i < code.length(); ++i) {
      encoding.add_str_int(",", code(i));
    }
    encoding += "\t";
    if (c >= unicharset.size() || (0 < c && c < SPECIAL_UNICHAR_CODES_COUNT &&
                                   unicharset.has_special_codes())) {
      encoding += kNullChar;
    } else {
      encoding += unicharset.id_to_unichar(c);
    }
    encoding += "\n";
  }
  return encoding;
}

// Helper decomposes a Hangul unicode to 3 parts, leading, vowel, trailing.
// Note that the returned values are 0-based indices, NOT unicode Jamo.
// Returns false if the input is not in the Hangul unicode range.
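// (A precomposed syllable is kFirstHangul + (leading * kVCount + vowel) *
// kTCount + trailing, so the arithmetic below simply inverts that formula.)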
/* static */
bool UnicharCompress::DecomposeHangul(int unicode, int* leading, int* vowel,
                                      int* trailing) {
  if (unicode < kFirstHangul) return false;
  int offset = unicode - kFirstHangul;
  if (offset >= kNumHangul) return false;
  const int kNCount = kVCount * kTCount;
  *leading = offset / kNCount;
  *vowel = (offset % kNCount) / kTCount;
  *trailing = offset % kTCount;
  return true;
}

// Computes the value of code_range_ from the encoder_.
void UnicharCompress::ComputeCodeRange() {
  code_range_ = -1;
  for (int c = 0; c < encoder_.size(); ++c) {
    const RecodedCharID& code = encoder_[c];
    for (int i = 0; i < code.length(); ++i) {
      if (code(i) > code_range_) code_range_ = code(i);
    }
  }
  ++code_range_;
}

// Initializes the decoding hash_map from the encoding array.
void UnicharCompress::SetupDecoder() {
  Cleanup();
  is_valid_start_.init_to_size(code_range_, false);
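  // For each code we record: the full code -> unichar_id mapping in decoder_,
  // whether its first element can start a code (is_valid_start_), the valid
  // last elements for its longest proper prefix (final_codes_), and the valid
  // continuations for every shorter prefix (next_codes_).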
  for (int c = 0; c < encoder_.size(); ++c) {
    const RecodedCharID& code = encoder_[c];
    if (code.self_normalized() || decoder_.find(code) == decoder_.end())
      decoder_[code] = c;
    is_valid_start_[code(0)] = true;
    RecodedCharID prefix = code;
    int len = code.length() - 1;
    prefix.Truncate(len);
    auto final_it = final_codes_.find(prefix);
    if (final_it == final_codes_.end()) {
      GenericVectorEqEq<int>* code_list = new GenericVectorEqEq<int>;
      code_list->push_back(code(len));
      final_codes_[prefix] = code_list;
      while (--len >= 0) {
        prefix.Truncate(len);
        auto next_it = next_codes_.find(prefix);
        if (next_it == next_codes_.end()) {
          GenericVectorEqEq<int>* code_list = new GenericVectorEqEq<int>;
          code_list->push_back(code(len));
          next_codes_[prefix] = code_list;
        } else {
          // We still have to search the list as we may get here via multiple
          // lengths of code.
          if (!next_it->second->contains(code(len)))
            next_it->second->push_back(code(len));
          break;  // This prefix has been processed.
        }
      }
    } else {
      if (!final_it->second->contains(code(len)))
        final_it->second->push_back(code(len));
    }
  }
}

// Frees allocated memory.
void UnicharCompress::Cleanup() {
  decoder_.clear();
  is_valid_start_.clear();
  for (auto it = next_codes_.begin(); it != next_codes_.end(); ++it) {
    delete it->second;
  }
  for (auto it = final_codes_.begin(); it != final_codes_.end(); ++it) {
    delete it->second;
  }
  next_codes_.clear();
  final_codes_.clear();
}

}  // namespace tesseract.