提交 30563981 编写于 作者: H Hui Zhang

Make space replacement optional in char tokenization; skip it when building the vocab

上级 86f34784
......@@ -28,6 +28,7 @@
#include "path_trie.h"
using FSTMATCH = fst::SortedMatcher<fst::StdVectorFst>;
// Vocabulary entry that stands for the space character; the decoder searches
// the vocabulary for this token to determine the space id.
constexpr char kSPACE[] = "<space>";
std::vector<std::pair<double, std::string>> ctc_beam_search_decoder(
const std::vector<std::vector<double>> &probs_seq,
......@@ -46,13 +47,8 @@ std::vector<std::pair<double, std::string>> ctc_beam_search_decoder(
"The shape of probs_seq does not match with "
"the shape of the vocabulary");
}
// assign blank id
// size_t blank_id = vocabulary.size();
// size_t blank_id = 0;
// assign space id
auto it = std::find(vocabulary.begin(), vocabulary.end(), " ");
auto it = std::find(vocabulary.begin(), vocabulary.end(), kSPACE);
int space_id = it - vocabulary.begin();
// if no space in vocabulary
if ((size_t)space_id >= vocabulary.size()) {
......
......@@ -54,9 +54,9 @@ class TextFeaturizer():
self.sp = spm.SentencePieceProcessor()
self.sp.Load(spm_model)
def tokenize(self, text):
def tokenize(self, text, replace_space=True):
if self.unit_type == 'char':
tokens = self.char_tokenize(text)
tokens = self.char_tokenize(text, replace_space)
elif self.unit_type == 'word':
tokens = self.word_tokenize(text)
else: # spm
......@@ -106,17 +106,19 @@ class TextFeaturizer():
text = self.detokenize(tokens)
return text
def char_tokenize(self, text, replace_space=True):
    """Character tokenizer.

    Args:
        text (str): text string.
        replace_space (bool): when True, every space character is emitted
            as the SPACE marker; False is only used by build_vocab.py, which
            needs the raw characters.

    Returns:
        List[str]: tokens, one per character of the stripped text.
    """
    text = text.strip()
    if not replace_space:
        return list(text)
    # Substitute per character rather than on the whole string: calling
    # list() after str.replace() would break a multi-character SPACE marker
    # into its individual letters instead of keeping it as one token.
    return [SPACE if ch == " " else ch for ch in text]
def char_detokenize(self, tokens):
......
......@@ -61,7 +61,7 @@ args = parser.parse_args()
def count_manifest(counter, text_feature, manifest_path):
    """Add the tokens of every manifest entry's text to ``counter``.

    Args:
        counter: object whose ``update`` method receives each tokenized
            line (presumably a ``collections.Counter`` -- confirm at caller).
        text_feature: featurizer exposing ``tokenize(text, replace_space=...)``.
        manifest_path: path passed to ``read_manifest``.
    """
    for line_json in read_manifest(manifest_path):
        # Tokenize once, with space replacement turned off, so the counts
        # are taken over the raw units of the transcript.
        tokens = text_feature.tokenize(line_json['text'], replace_space=False)
        counter.update(tokens)
def dump_text_manifest(fileobj, manifest_path, key='text'):
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册