提交 a2a9719b 作者: kinghuin 提交者: wuzewu

debug ernie tiny reader

上级 79b33748
...@@ -81,6 +81,7 @@ def load_vocab(vocab_file): ...@@ -81,6 +81,7 @@ def load_vocab(vocab_file):
index = items[1] if len(items) == 2 else num index = items[1] if len(items) == 2 else num
token = token.strip() token = token.strip()
vocab[token] = int(index) vocab[token] = int(index)
fin.close()
return vocab return vocab
...@@ -168,7 +169,7 @@ class WSSPTokenizer(object): ...@@ -168,7 +169,7 @@ class WSSPTokenizer(object):
self.window_size = 5 self.window_size = 5
self.sp_model.Load(sp_model_dir) self.sp_model.Load(sp_model_dir)
def cut(self, chars): def cut(self, chars, unk_token="[UNK]"):
words = [] words = []
idx = 0 idx = 0
while idx < len(chars): while idx < len(chars):
...@@ -181,7 +182,7 @@ class WSSPTokenizer(object): ...@@ -181,7 +182,7 @@ class WSSPTokenizer(object):
break break
if not matched: if not matched:
i = 1 i = 1
words.append(chars[idx]) words.append(unk_token)
idx += i idx += i
return words return words
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册