///////////////////////////////////////////////////////////////////////
// File:        unicharcompress.cpp
// Description: Unicode re-encoding using a sequence of smaller numbers in
//              place of a single large code for CJK, similarly for Indic,
//              and dissection of ligatures for other scripts.
// Author:      Ray Smith
// Created:     Wed Mar 04 14:45:01 PST 2015
//
// (C) Copyright 2015, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////

#include "unicharcompress.h"
#include <memory>
#include <algorithm>
#include <cstdlib>  // for strtol
#include "tprintf.h"

namespace tesseract {

// String used to represent the null_id in direct_set.
const char* kNullChar = "<nul>";
// Radix to make unique values from the stored radical codes.
const int kRadicalRadix = 29;

// "Hash" function for a const std::vector<int>: accumulates the elements as
// digits in base kRadicalRadix, building a unique number for each code
// sequence that we can use as the index in a hash map of ints instead of
// trying to hash the vectors.
static int RadicalPreHash(const std::vector<int>& rs) {
  size_t result = 0;
  for (int radical : rs) {
    result *= kRadicalRadix;
    result += radical;
  }
  return static_cast<int>(result);
}
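
// Worked example (illustrative values): the radical code sequence {2, 7, 1}
// prehashes to (2 * 29 + 7) * 29 + 1 == 1886. Distinct same-length sequences
// map to distinct values as long as every radical code is below
// kRadicalRadix.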

// A hash map to convert unicodes to radical encoding.
typedef std::unordered_map<int, std::unique_ptr<std::vector<int>>> RSMap;
// A hash map to count occurrences of each radical encoding.
typedef std::unordered_map<int, int> RSCounts;

// Decodes one line of the radical table file into the given radical_map.
// Each line is "<unicode> <radical>[ <radical>]*"; empty lines and lines
// beginning with '#' are ignored. Returns false on a parse error.
static bool DecodeRadicalLine(STRING* radical_data_line, RSMap* radical_map) {
  if (radical_data_line->length() == 0 || (*radical_data_line)[0] == '#')
    return true;
  GenericVector<STRING> entries;
  radical_data_line->split(' ', &entries);
  if (entries.size() < 2) return false;
  char* end = nullptr;
  int unicode = strtol(&entries[0][0], &end, 10);
  if (*end != '\0') return false;
  std::unique_ptr<std::vector<int>> radicals(new std::vector<int>);
  for (int i = 1; i < entries.size(); ++i) {
    int radical = strtol(&entries[i][0], &end, 10);
    if (*end != '\0') return false;
    radicals->push_back(radical);
  }
  (*radical_map)[unicode] = std::move(radicals);
  return true;
}
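
// Illustrative input line (hypothetical values): "21320 5 17 2" would map
// unicode 21320 to the radical code sequence {5, 17, 2}, while an empty
// line or one starting with '#' is skipped.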

// Helper function builds the RSMap from the radical-stroke file, which has
// already been read into a STRING. Returns false on error.
// The radical_stroke_table is non-const because it gets split and the caller
// is unlikely to want to use it again.
static bool DecodeRadicalTable(STRING* radical_data, RSMap* radical_map) {
  GenericVector<STRING> lines;
  radical_data->split('\n', &lines);
  for (int i = 0; i < lines.size(); ++i) {
    if (!DecodeRadicalLine(&lines[i], radical_map)) {
      tprintf("Invalid format in radical table at line %d: %s\n", i,
              lines[i].string());
      return false;
    }
  }
  return true;
}

UnicharCompress::UnicharCompress() : code_range_(0) {}
UnicharCompress::UnicharCompress(const UnicharCompress& src) { *this = src; }
UnicharCompress::~UnicharCompress() { Cleanup(); }
UnicharCompress& UnicharCompress::operator=(const UnicharCompress& src) {
  Cleanup();
  encoder_ = src.encoder_;
  code_range_ = src.code_range_;
  SetupDecoder();
  return *this;
}

// Computes the encoding for the given unicharset. It is a requirement that
// the file training/langdata/radical-stroke.txt have been read into the
// input string radical_stroke_table.
// Returns false if the encoding cannot be constructed.
bool UnicharCompress::ComputeEncoding(const UNICHARSET& unicharset, int null_id,
                                      STRING* radical_stroke_table) {
  RSMap radical_map;
  if (radical_stroke_table != nullptr &&
      !DecodeRadicalTable(radical_stroke_table, &radical_map))
    return false;
  encoder_.clear();
  UNICHARSET direct_set;
  // To avoid unused codes, clear the special codes from the direct_set.
  direct_set.clear();
  // Always keep space as 0.
  direct_set.unichar_insert(" ", OldUncleanUnichars::kTrue);
  // Null char is next if we have one.
  if (null_id >= 0) {
    direct_set.unichar_insert(kNullChar);
  }
  RSCounts radical_counts;
  // In the initial map, codes [0, unicharset.size()) are
  // reserved for non-han/hangul sequences of 1 or more unicodes.
  int hangul_offset = unicharset.size();
  // Hangul takes the next range [hangul_offset, hangul_offset + kTotalJamos).
  const int kTotalJamos = kLCount + kVCount + kTCount;
  // Han takes the codes beyond hangul_offset + kTotalJamos. Since it is hard
  // to measure the number of radicals and strokes, initially we use the same
  // code range for all 3 Han code positions, and fix them after.
  int han_offset = hangul_offset + kTotalJamos;
  int max_num_strokes = -1;
  for (int u = 0; u <= unicharset.size(); ++u) {
    // As a special case, we allow null_id to be equal to unicharset.size()
    // in case there is no space in unicharset for it.
    if (u == unicharset.size() && u != null_id) break;  // Finished
    RecodedCharID code;
    // Convert to unicodes.
    std::vector<char32> unicodes;
    std::string cleaned;
    if (u < unicharset.size())
      cleaned = UNICHARSET::CleanupString(unicharset.id_to_unichar(u));
    if (u < unicharset.size() &&
        (unicodes = UNICHAR::UTF8ToUTF32(cleaned.c_str())).size() == 1) {
      // Check single unicodes for Hangul/Han and encode if so.
      int unicode = unicodes[0];
      int leading, vowel, trailing;
      auto it = radical_map.find(unicode);
      if (it != radical_map.end()) {
        // This is Han. Use the radical codes directly.
        int num_radicals = it->second->size();
        for (int c = 0; c < num_radicals; ++c) {
          code.Set(c, han_offset + (*it->second)[c]);
        }
        int pre_hash = RadicalPreHash(*it->second);
        int num_samples = radical_counts[pre_hash]++;
        if (num_samples > 0)
          code.Set(num_radicals, han_offset + num_samples + kRadicalRadix);
      } else if (DecomposeHangul(unicode, &leading, &vowel, &trailing)) {
        // This is Hangul. Since we know the exact size of each part at compile
        // time, it gets the bottom set of codes.
        code.Set3(leading + hangul_offset, vowel + kLCount + hangul_offset,
                  trailing + kLCount + kVCount + hangul_offset);
      }
    }
    // If the code is still empty, it wasn't Han or Hangul.
    if (code.length() == 0) {
      // Special cases.
      if (u == UNICHAR_SPACE) {
        code.Set(0, 0);  // Space.
      } else if (u == null_id || (unicharset.has_special_codes() &&
                                  u < SPECIAL_UNICHAR_CODES_COUNT)) {
        code.Set(0, direct_set.unichar_to_id(kNullChar));
      } else {
        // Add the direct_set unichar-ids of the unicodes in sequence to the
        // code.
        for (int i = 0; i < unicodes.size(); ++i) {
          int position = code.length();
          if (position >= RecodedCharID::kMaxCodeLen) {
            tprintf("Unichar %d=%s is too long to encode!!\n", u,
                    unicharset.id_to_unichar(u));
            return false;
          }
          int uni = unicodes[i];
          UNICHAR unichar(uni);
          char* utf8 = unichar.utf8_str();
          if (!direct_set.contains_unichar(utf8))
            direct_set.unichar_insert(utf8);
          code.Set(position, direct_set.unichar_to_id(utf8));
          delete[] utf8;
          if (direct_set.size() >
              unicharset.size() + !unicharset.has_special_codes()) {
            // Code space got bigger!
            tprintf("Code space expanded from original unicharset!!\n");
            return false;
          }
        }
      }
    }
    encoder_.push_back(code);
  }
  // Now renumber Han to make all codes unique. We already added han_offset to
  // all Han. Now separate out the radical, stroke, and count codes for Han.
  // In the uniqued Han encoding, the 1st code uses the next radical_map.size()
  // values, the 2nd code uses the next max_num_strokes+1 values, and the 3rd
  // code uses the rest for the max number of duplicated radical/stroke combos.
  int code_offset = 0;
  for (int i = 0; i < RecodedCharID::kMaxCodeLen; ++i) {
    int max_offset = 0;
    for (int u = 0; u < unicharset.size(); ++u) {
      RecodedCharID* code = &encoder_[u];
      if (code->length() <= i) continue;
      max_offset = std::max(max_offset, (*code)(i) - han_offset);
      code->Set(i, (*code)(i) + code_offset);
    }
    if (max_offset == 0) break;
    code_offset += max_offset + 1;
  }
  DefragmentCodeValues(null_id >= 0 ? 1 : -1);
  SetupDecoder();
  return true;
}
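
// Sketch of the code layout this builds, assuming a hypothetical unicharset
// of size 100 and the Jamo counts from unicharcompress.h (kLCount=19,
// kVCount=21, kTCount=28): direct codes occupy [0, 100), Hangul Jamo occupy
// [100, 168), and Han radical/stroke/count codes start at 168, before
// DefragmentCodeValues() compacts away any unused values.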

// Sets up an encoder that doesn't change the unichars at all, so it just
// passes them through unchanged.
void UnicharCompress::SetupPassThrough(const UNICHARSET& unicharset) {
  GenericVector<RecodedCharID> codes;
  for (int u = 0; u < unicharset.size(); ++u) {
    RecodedCharID code;
    code.Set(0, u);
    codes.push_back(code);
  }
  if (!unicharset.has_special_codes()) {
    RecodedCharID code;
    code.Set(0, unicharset.size());
    codes.push_back(code);
  }
  SetupDirect(codes);
}
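
// For example, a unicharset of size N with no special codes yields the
// identity mapping u -> (u) for u in [0, N), plus one extra code (N) that
// serves as the null label.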

// Sets up an encoder directly using the given encoding vector, which maps
// unichar_ids to the given codes.
void UnicharCompress::SetupDirect(const GenericVector<RecodedCharID>& codes) {
  encoder_ = codes;
  ComputeCodeRange();
  SetupDecoder();
}

// Renumbers codes to eliminate unused values.
void UnicharCompress::DefragmentCodeValues(int encoded_null) {
  // There may not be any Hangul, but even if there is, it is possible that not
  // all codes are used. Likewise with the Han encoding, it is possible that not
  // all numbers of strokes are used.
  ComputeCodeRange();
  GenericVector<int> offsets;
  offsets.init_to_size(code_range_, 0);
  // Find which codes are used
  for (int c = 0; c < encoder_.size(); ++c) {
    const RecodedCharID& code = encoder_[c];
    for (int i = 0; i < code.length(); ++i) {
      offsets[code(i)] = 1;
    }
  }
  // Compute offsets based on code use.
  int offset = 0;
  for (int i = 0; i < offsets.size(); ++i) {
    // If not used, decrement everything above here.
    // We are moving encoded_null to the end, so it is not "used".
    if (offsets[i] == 0 || i == encoded_null) {
      --offset;
    } else {
      offsets[i] = offset;
    }
  }
  if (encoded_null >= 0) {
    // The encoded_null is moving to the end, for the benefit of TensorFlow,
    // which is offsets.size() + offsets.back().
    offsets[encoded_null] = offsets.size() + offsets.back() - encoded_null;
  }
  // Now apply the offsets.
  for (int c = 0; c < encoder_.size(); ++c) {
    RecodedCharID* code = &encoder_[c];
    for (int i = 0; i < code->length(); ++i) {
      int value = (*code)(i);
      code->Set(i, value + offsets[value]);
    }
  }
  ComputeCodeRange();
}
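
// Worked example (hypothetical values): with code_range_ == 6, used values
// {0, 2, 5} and encoded_null == 2, the offsets computed above map 0 -> 0,
// 5 -> 1 and the null code 2 -> 2, so the null lands at the end and the
// code range shrinks to 3.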

// Encodes a single unichar_id. Returns the length of the code, or zero if
// the input is invalid, and stores the encoding itself in *code.
int UnicharCompress::EncodeUnichar(int unichar_id, RecodedCharID* code) const {
  if (unichar_id < 0 || unichar_id >= encoder_.size()) return 0;
  *code = encoder_[unichar_id];
  return code->length();
}

// Decodes code, returning the original unichar-id, or
// INVALID_UNICHAR_ID if the input is invalid.
int UnicharCompress::DecodeUnichar(const RecodedCharID& code) const {
  int len = code.length();
  if (len <= 0 || len > RecodedCharID::kMaxCodeLen) return INVALID_UNICHAR_ID;
  auto it = decoder_.find(code);
  if (it == decoder_.end()) return INVALID_UNICHAR_ID;
  return it->second;
}
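
// Minimal round-trip sketch (assuming `compress` is an initialized
// UnicharCompress and `u` a uniquely-encoded unichar_id):
//   RecodedCharID code;
//   if (compress.EncodeUnichar(u, &code) > 0) {
//     int decoded = compress.DecodeUnichar(code);  // decoded == u
//   }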

// Writes to the given file. Returns false in case of error.
bool UnicharCompress::Serialize(TFile* fp) const {
  return encoder_.SerializeClasses(fp);
}

// Reads from the given file. Returns false in case of error.
bool UnicharCompress::DeSerialize(TFile* fp) {
  if (!encoder_.DeSerializeClasses(fp)) return false;
  ComputeCodeRange();
  SetupDecoder();
  return true;
}

// Returns a STRING containing a text file that describes the encoding thus:
// <index>[,<index>]*<tab><UTF8-str><newline>
// In words, a comma-separated list of one or more indices, followed by a tab
// and the UTF-8 string that the code represents per line. Most simple scripts
// will encode a single index to a UTF8-string, but Chinese, Japanese, Korean
// and the Indic scripts will contain a many-to-many mapping.
// See the class comment above for details.
STRING UnicharCompress::GetEncodingAsString(
    const UNICHARSET& unicharset) const {
  STRING encoding;
  for (int c = 0; c < encoder_.size(); ++c) {
    const RecodedCharID& code = encoder_[c];
    if (0 < c && c < SPECIAL_UNICHAR_CODES_COUNT && code == encoder_[c - 1]) {
      // Don't show the duplicate entry.
      continue;
    }
    encoding.add_str_int("", code(0));
    for (int i = 1; i < code.length(); ++i) {
      encoding.add_str_int(",", code(i));
    }
    encoding += "\t";
    if (c >= unicharset.size() || (0 < c && c < SPECIAL_UNICHAR_CODES_COUNT &&
                                   unicharset.has_special_codes())) {
      encoding += kNullChar;
    } else {
      encoding += unicharset.id_to_unichar(c);
    }
    encoding += "\n";
  }
  return encoding;
}
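
// Illustrative output fragment (hypothetical indices): a Latin letter might
// produce the line "15\ta", while a Han character recoded to three codes
// produces something like "143,52,3\t<UTF-8 Han char>".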

// Helper decomposes a Hangul unicode to 3 parts, leading, vowel, trailing.
// Note that the returned values are 0-based indices, NOT unicode Jamo.
// Returns false if the input is not in the Hangul unicode range.
/* static */
bool UnicharCompress::DecomposeHangul(int unicode, int* leading, int* vowel,
                                      int* trailing) {
  if (unicode < kFirstHangul) return false;
  int offset = unicode - kFirstHangul;
  if (offset >= kNumHangul) return false;
  const int kNCount = kVCount * kTCount;
  *leading = offset / kNCount;
  *vowel = (offset % kNCount) / kTCount;
  *trailing = offset % kTCount;
  return true;
}
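
// Worked example: U+AC01 (HANGUL SYLLABLE GAG) gives offset 1, and with
// kNCount == 21 * 28 == 588 this yields leading = 1 / 588 = 0,
// vowel = (1 % 588) / 28 = 0, trailing = 1 % 28 = 1.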

// Computes the value of code_range_ from the encoder_.
void UnicharCompress::ComputeCodeRange() {
  code_range_ = -1;
  for (int c = 0; c < encoder_.size(); ++c) {
    const RecodedCharID& code = encoder_[c];
    for (int i = 0; i < code.length(); ++i) {
      if (code(i) > code_range_) code_range_ = code(i);
    }
  }
  ++code_range_;
}
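
// For example, if the largest value appearing in any code is 682, then
// code_range_ becomes 683.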

// Initializes the decoding hash_map from the encoding array.
void UnicharCompress::SetupDecoder() {
  Cleanup();
  is_valid_start_.init_to_size(code_range_, false);
  for (int c = 0; c < encoder_.size(); ++c) {
    const RecodedCharID& code = encoder_[c];
    decoder_[code] = c;
    is_valid_start_[code(0)] = true;
    RecodedCharID prefix = code;
    int len = code.length() - 1;
    prefix.Truncate(len);
    auto final_it = final_codes_.find(prefix);
    if (final_it == final_codes_.end()) {
      GenericVectorEqEq<int>* code_list = new GenericVectorEqEq<int>;
      code_list->push_back(code(len));
      final_codes_[prefix] = code_list;
      while (--len >= 0) {
        prefix.Truncate(len);
        auto next_it = next_codes_.find(prefix);
        if (next_it == next_codes_.end()) {
          GenericVectorEqEq<int>* code_list = new GenericVectorEqEq<int>;
          code_list->push_back(code(len));
          next_codes_[prefix] = code_list;
        } else {
          // We still have to search the list as we may get here via multiple
          // lengths of code.
          if (!next_it->second->contains(code(len)))
            next_it->second->push_back(code(len));
          break;  // This prefix has been processed.
        }
      }
    } else {
      if (!final_it->second->contains(code(len)))
        final_it->second->push_back(code(len));
    }
  }
}
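
// Illustrative table contents (hypothetical codes): given the code set
// {(1,2,3), (1,2,4), (5)}, the loop above produces
//   is_valid_start_: true at 1 and 5
//   next_codes_:     () -> {1}, (1) -> {2}
//   final_codes_:    (1,2) -> {3, 4}, () -> {5}
// The empty prefix can appear in both maps; that is how the decoder
// distinguishes a code that continues from one that completes a character.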

// Frees allocated memory.
void UnicharCompress::Cleanup() {
  decoder_.clear();
  is_valid_start_.clear();
  for (auto it = next_codes_.begin(); it != next_codes_.end(); ++it) {
    delete it->second;
  }
  for (auto it = final_codes_.begin(); it != final_codes_.end(); ++it) {
    delete it->second;
  }
  next_codes_.clear();
  final_codes_.clear();
}

}  // namespace tesseract.