提交 eb72d386 编写于 作者: K kinghuin 提交者: wuzewu

debug tiny reader

上级 2ff7132d
......@@ -49,7 +49,7 @@ class BaseReader(object):
in_tokens=False):
self.max_seq_len = max_seq_len
if sp_model_path and word_dict_path:
self.tokenizer = tokenization.WSSPTokenizer(
self.tzokenizer = tokenization.WSSPTokenizer(
vocab_path, sp_model_path, word_dict_path, ws=True, lower=True)
else:
self.tokenizer = tokenization.FullTokenizer(
......
......@@ -186,11 +186,11 @@ class WSSPTokenizer(object):
return words
def tokenize(self, text):
sen = text.decode('utf8')
text = convert_to_unicode(text)
if self.ws:
sen = [s for s in self.cut(sen) if s != ' ']
sen = [s for s in self.cut(text) if s != ' ']
else:
sen = sen.split(' ')
sen = text.split(' ')
if self.lower:
sen = [s.lower() for s in sen]
sen = ' '.join(sen)
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册