#include #include #include #include "decoder_utils.h" size_t get_utf8_str_len(const std::string& str) { size_t str_len = 0; for (char c : str) { str_len += ((c & 0xc0) != 0x80); } return str_len; } //------------------------------------------------------ //Splits string into vector of strings representing //UTF-8 characters (not same as chars) //------------------------------------------------------ std::vector split_utf8_str(const std::string& str) { std::vector result; std::string out_str; for (char c : str) { if ((c & 0xc0) != 0x80) //new UTF-8 character { if (!out_str.empty()) { result.push_back(out_str); out_str.clear(); } } out_str.append(1, c); } result.push_back(out_str); return result; } // Split a string into a list of strings on a given string // delimiter. NB: delimiters on beginning / end of string are // trimmed. Eg, "FooBarFoo" split on "Foo" returns ["Bar"]. std::vector split_str(const std::string &s, const std::string &delim) { std::vector result; std::size_t start = 0, delim_len = delim.size(); while (true) { std::size_t end = s.find(delim, start); if (end == std::string::npos) { if (start < s.size()) { result.push_back(s.substr(start)); } break; } if (end > start) { result.push_back(s.substr(start, end - start)); } start = end + delim_len; } return result; } //------------------------------------------------------- // Overriding less than operator for sorting //------------------------------------------------------- bool prefix_compare(const PathTrie* x, const PathTrie* y) { if (x->_score == y->_score) { if (x->_character == y->_character) { return false; } else { return (x->_character < y->_character); } } else { return x->_score > y->_score; } } //---------- End path_compare --------------------------- // -------------------------------------------------------------- // Adds word to fst without copying entire dictionary // -------------------------------------------------------------- void add_word_to_fst(const std::vector& word, fst::StdVectorFst* dictionary) { if (dictionary->NumStates() == 0) { fst::StdVectorFst::StateId start = dictionary->AddState(); assert(start == 0); dictionary->SetStart(start); } fst::StdVectorFst::StateId src = dictionary->Start(); fst::StdVectorFst::StateId dst; for (auto c : word) { dst = dictionary->AddState(); dictionary->AddArc(src, fst::StdArc(c, c, 0, dst)); src = dst; } dictionary->SetFinal(dst, fst::StdArc::Weight::One()); } // ------------ End of add_word_to_fst ----------------------- // --------------------------------------------------------- // Adds a word to the dictionary FST based on char_map // --------------------------------------------------------- bool add_word_to_dictionary(const std::string& word, const std::unordered_map& char_map, bool add_space, int SPACE, fst::StdVectorFst* dictionary) { auto characters = split_utf8_str(word); std::vector int_word; for (auto& c : characters) { if (c == " ") { int_word.push_back(SPACE); } else { auto int_c = char_map.find(c); if (int_c != char_map.end()) { int_word.push_back(int_c->second); } else { return false; // return without adding } } } if (add_space) { int_word.push_back(SPACE); } add_word_to_fst(int_word, dictionary); return true; } // -------------- End of addWordToDictionary ------------