decoder_utils.cpp 2.6 KB
Newer Older
Y
Yibing Liu 已提交
1
#include "decoder_utils.h"
Y
Yibing Liu 已提交
2 3
#include <algorithm>
#include <cmath>
Y
Yibing Liu 已提交
4
#include <limits>
Y
Yibing Liu 已提交
5

6
/// Returns the number of UTF-8 encoded characters in `str`.
///
/// The count is the number of bytes that are NOT continuation bytes
/// (bit pattern 10xxxxxx). The input is not validated: a stray continuation
/// byte is simply not counted, and a truncated sequence still counts as one
/// character for its lead byte.
size_t get_utf8_str_len(const std::string& str) {
  size_t str_len = 0;
  for (const char c : str) {
    // Masking with 0xc0 keeps the top two bits; 0x80 (10......) marks a
    // continuation byte, anything else starts a new character.
    if ((c & 0xc0) != 0x80) {
      ++str_len;
    }
  }
  return str_len;
}
13

Y
Yibing Liu 已提交
14
/// Splits `str` into its individual UTF-8 encoded characters.
///
/// A new piece starts at every byte that is not a continuation byte
/// (bit pattern 10xxxxxx); continuation bytes are attached to the piece that
/// precedes them. The input is not validated.
///
/// Note: the trailing piece is always emitted, so an empty `str` yields a
/// vector holding one empty string rather than an empty vector.
std::vector<std::string> split_utf8_str(const std::string& str) {
  std::vector<std::string> result;

  std::size_t piece_start = 0;
  // Index 0 never closes a piece (nothing has been collected yet), so the
  // scan for lead bytes starts at 1.
  for (std::size_t i = 1; i < str.size(); ++i) {
    const bool is_lead_byte = (str[i] & 0xc0) != 0x80;
    if (is_lead_byte) {
      result.push_back(str.substr(piece_start, i - piece_start));
      piece_start = i;
    }
  }

  // Emit whatever remains (the last character, or "" for empty input).
  result.push_back(str.substr(piece_start));
  return result;
}

Y
Yibing Liu 已提交
33 34 35 36 37 38 39 40 41 42 43 44 45 46
/// Splits `s` at every occurrence of the delimiter string `delim`.
///
/// Empty tokens are dropped, so leading, trailing and consecutive delimiters
/// produce no entries. An empty `delim` cannot split anything: `s` is then
/// returned as a single token (or no token at all when `s` is empty).
std::vector<std::string> split_str(const std::string& s,
                                   const std::string& delim) {
  std::vector<std::string> result;

  // Guard: find("") always matches at `start` and the cursor would never
  // advance, so an empty delimiter used to spin forever.
  if (delim.empty()) {
    if (!s.empty()) {
      result.push_back(s);
    }
    return result;
  }

  const std::size_t delim_len = delim.size();
  std::size_t start = 0;
  while (true) {
    const std::size_t end = s.find(delim, start);
    if (end == std::string::npos) {
      // No further delimiter: keep the tail if there is one.
      if (start < s.size()) {
        result.push_back(s.substr(start));
      }
      break;
    }
    if (end > start) {
      result.push_back(s.substr(start, end - start));
    }
    start = end + delim_len;
  }
  return result;
}

Y
Yibing Liu 已提交
53 54 55 56
/// Ordering predicate for two prefixes.
///
/// Returns true when `x` should be placed before `y`: the higher `score`
/// comes first, and equal scores are ordered by ascending `character`.
/// Two prefixes with equal score and equal character compare as equivalent
/// (false in both directions). Neither pointer is checked for null.
bool prefix_compare(const PathTrie* x, const PathTrie* y) {
  if (x->score != y->score) {
    return x->score > y->score;
  }
  // Tie on score: fall back to the character; equal characters yield false.
  return x->character < y->character;
}
64 65 66

void add_word_to_fst(const std::vector<int>& word,
                     fst::StdVectorFst* dictionary) {
Y
Yibing Liu 已提交
67 68 69 70 71 72 73 74 75 76 77 78 79
  if (dictionary->NumStates() == 0) {
    fst::StdVectorFst::StateId start = dictionary->AddState();
    assert(start == 0);
    dictionary->SetStart(start);
  }
  fst::StdVectorFst::StateId src = dictionary->Start();
  fst::StdVectorFst::StateId dst;
  for (auto c : word) {
    dst = dictionary->AddState();
    dictionary->AddArc(src, fst::StdArc(c, c, 0, dst));
    src = dst;
  }
  dictionary->SetFinal(dst, fst::StdArc::Weight::One());
80
}
81

Y
Yibing Liu 已提交
82 83 84 85 86 87 88
bool add_word_to_dictionary(
    const std::string& word,
    const std::unordered_map<std::string, int>& char_map,
    bool add_space,
    int SPACE_ID,
    fst::StdVectorFst* dictionary) {
  auto characters = split_utf8_str(word);
89

Y
Yibing Liu 已提交
90
  std::vector<int> int_word;
91

Y
Yibing Liu 已提交
92 93 94 95 96 97 98 99 100 101
  for (auto& c : characters) {
    if (c == " ") {
      int_word.push_back(SPACE_ID);
    } else {
      auto int_c = char_map.find(c);
      if (int_c != char_map.end()) {
        int_word.push_back(int_c->second);
      } else {
        return false;  // return without adding
      }
102
    }
Y
Yibing Liu 已提交
103
  }
104

Y
Yibing Liu 已提交
105 106 107
  if (add_space) {
    int_word.push_back(SPACE_ID);
  }
108

Y
Yibing Liu 已提交
109 110
  add_word_to_fst(int_word, dictionary);
  return true;
111
}