未验证 提交 a2d33595 编写于 作者: A Andy 提交者: GitHub

Remove some duplicated code (#832)

- Remove some duplicated code
- Fix the failure to process out-of-vocabulary words (tokens missing from the vocab now map to id 0)
上级 2afdd196
......@@ -78,7 +78,11 @@ class CustomTokenizer(object):
def _convert_token_to_id(self, token):
""" Converts a token (str) in an id using the vocab. """
return self.vocab.get(token, None)
v = self.vocab.get(token, None)
if v:
return v
else:
return 0
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
......@@ -123,8 +127,8 @@ class CustomTokenizer(object):
ids = []
for token in tokens:
wid = self._convert_token_to_id(token)
if wid:
ids.append(self._convert_token_to_id(token))
if wid is not None:
ids.append(wid)
return ids
def tokenize(self, text):
......@@ -204,7 +208,7 @@ class CustomTokenizer(object):
if isinstance(text, str):
tokens = self.tokenize(text)
ids = self.convert_tokens_to_ids(tokens)
return self.convert_tokens_to_ids(tokens)
return ids
elif isinstance(text,
(list, tuple)) and len(text) > 0 and isinstance(
text[0], str):
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册