提交 bfbe27f8 编写于 作者: K kinghuin 提交者: wuzewu

debug ernie tiny reader

上级 f6a710ef
...@@ -169,7 +169,7 @@ class WSSPTokenizer(object): ...@@ -169,7 +169,7 @@ class WSSPTokenizer(object):
self.window_size = 5 self.window_size = 5
self.sp_model.Load(sp_model_dir) self.sp_model.Load(sp_model_dir)
def cut(self, chars, unk_token="[UNK]"): def cut(self, chars):
words = [] words = []
idx = 0 idx = 0
while idx < len(chars): while idx < len(chars):
...@@ -182,21 +182,26 @@ class WSSPTokenizer(object): ...@@ -182,21 +182,26 @@ class WSSPTokenizer(object):
break break
if not matched: if not matched:
i = 1 i = 1
words.append(unk_token) words.append(chars[idx])
print(chars[idx])
idx += i idx += i
return words return words
def tokenize(self, text, unk_token="[UNK]"):
    """Tokenize *text* into sentencepiece sub-word pieces.

    The text is first optionally pre-segmented with ``self.cut``
    (dictionary word matching) when ``self.ws`` is set, otherwise split
    on spaces; optionally lower-cased when ``self.lower`` is set. Every
    whole word not present in ``self.vocab`` is replaced by *unk_token*
    BEFORE sub-word encoding, then the joined sentence is encoded by the
    sentencepiece model.

    Args:
        text: raw input string; converted to unicode first.
        unk_token: replacement for out-of-vocabulary words
            (default ``"[UNK]"``).

    Returns:
        list of sentencepiece piece strings from
        ``self.sp_model.EncodeAsPieces``.
    """
    text = convert_to_unicode(text)
    if self.ws:
        # Dictionary-based word segmentation; drop bare-space tokens.
        words = [w for w in self.cut(text) if w != ' ']
    else:
        words = text.split(' ')
    if self.lower:
        words = [w.lower() for w in words]
    # Map out-of-vocabulary whole words to unk_token so the sentencepiece
    # model never sub-word-splits a word the vocabulary cannot represent.
    words = [w if w in self.vocab else unk_token for w in words]
    return self.sp_model.EncodeAsPieces(' '.join(words))
def convert_tokens_to_ids(self, tokens): def convert_tokens_to_ids(self, tokens):
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册