未验证 提交 a2d33595 编写于 作者: A Andy 提交者: GitHub

Remove duplicated code (#832)

- Remove duplicated code (redundant repeated calls to the token-to-id conversion)
- Fix handling of out-of-vocabulary tokens: unknown tokens now map to id 0 instead of None
上级 2afdd196
@@ -78,7 +78,11 @@ class CustomTokenizer(object):
     def _convert_token_to_id(self, token):
         """ Converts a token (str) in an id using the vocab. """
-        return self.vocab.get(token, None)
+        v = self.vocab.get(token, None)
+        if v:
+            return v
+        else:
+            return 0
     def _convert_id_to_token(self, index):
         """Converts an index (integer) in a token (str) using the vocab."""
@@ -123,8 +127,8 @@ class CustomTokenizer(object):
         ids = []
         for token in tokens:
             wid = self._convert_token_to_id(token)
-            if wid:
-                ids.append(self._convert_token_to_id(token))
+            if wid is not None:
+                ids.append(wid)
         return ids
     def tokenize(self, text):
@@ -204,7 +208,7 @@ class CustomTokenizer(object):
         if isinstance(text, str):
             tokens = self.tokenize(text)
             ids = self.convert_tokens_to_ids(tokens)
-            return self.convert_tokens_to_ids(tokens)
+            return ids
         elif isinstance(text,
                         (list, tuple)) and len(text) > 0 and isinstance(
                             text[0], str):
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册