提交 eb72d386 编写于 作者: K kinghuin 提交者: wuzewu

debug tiny reader

上级 2ff7132d
...@@ -49,7 +49,7 @@ class BaseReader(object): ...@@ -49,7 +49,7 @@ class BaseReader(object):
in_tokens=False): in_tokens=False):
self.max_seq_len = max_seq_len self.max_seq_len = max_seq_len
if sp_model_path and word_dict_path: if sp_model_path and word_dict_path:
self.tokenizer = tokenization.WSSPTokenizer( self.tokenizer = tokenization.WSSPTokenizer(
vocab_path, sp_model_path, word_dict_path, ws=True, lower=True) vocab_path, sp_model_path, word_dict_path, ws=True, lower=True)
else: else:
self.tokenizer = tokenization.FullTokenizer( self.tokenizer = tokenization.FullTokenizer(
......
...@@ -186,11 +186,11 @@ class WSSPTokenizer(object): ...@@ -186,11 +186,11 @@ class WSSPTokenizer(object):
return words return words
def tokenize(self, text): def tokenize(self, text):
sen = text.decode('utf8') text = convert_to_unicode(text)
if self.ws: if self.ws:
sen = [s for s in self.cut(sen) if s != ' '] sen = [s for s in self.cut(text) if s != ' ']
else: else:
sen = sen.split(' ') sen = text.split(' ')
if self.lower: if self.lower:
sen = [s.lower() for s in sen] sen = [s.lower() for s in sen]
sen = ' '.join(sen) sen = ' '.join(sen)
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册