未验证 提交 a2d33595 编写于 作者: A Andy 提交者: GitHub

Delete some duplicate code (#832)

- Delete some duplicate code
- Fix the failure to process out-of-vocabulary ("unlogged") words, i.e. tokens missing from the vocab
上级 2afdd196
...@@ -78,7 +78,11 @@ class CustomTokenizer(object): ...@@ -78,7 +78,11 @@ class CustomTokenizer(object):
def _convert_token_to_id(self, token): def _convert_token_to_id(self, token):
""" Converts a token (str) in an id using the vocab. """ """ Converts a token (str) in an id using the vocab. """
return self.vocab.get(token, None) v = self.vocab.get(token, None)
if v:
return v
else:
return 0
def _convert_id_to_token(self, index): def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab.""" """Converts an index (integer) in a token (str) using the vocab."""
...@@ -123,8 +127,8 @@ class CustomTokenizer(object): ...@@ -123,8 +127,8 @@ class CustomTokenizer(object):
ids = [] ids = []
for token in tokens: for token in tokens:
wid = self._convert_token_to_id(token) wid = self._convert_token_to_id(token)
if wid: if wid is not None:
ids.append(self._convert_token_to_id(token)) ids.append(wid)
return ids return ids
def tokenize(self, text): def tokenize(self, text):
...@@ -204,14 +208,14 @@ class CustomTokenizer(object): ...@@ -204,14 +208,14 @@ class CustomTokenizer(object):
if isinstance(text, str): if isinstance(text, str):
tokens = self.tokenize(text) tokens = self.tokenize(text)
ids = self.convert_tokens_to_ids(tokens) ids = self.convert_tokens_to_ids(tokens)
return self.convert_tokens_to_ids(tokens) return ids
elif isinstance(text, elif isinstance(text,
(list, tuple)) and len(text) > 0 and isinstance( (list, tuple)) and len(text) > 0 and isinstance(
text[0], str): text[0], str):
return self.convert_tokens_to_ids(text) return self.convert_tokens_to_ids(text)
elif isinstance(text, elif isinstance(text,
(list, tuple)) and len(text) > 0 and isinstance( (list, tuple)) and len(text) > 0 and isinstance(
text[0], int): text[0], int):
return text return text
else: else:
raise ValueError( raise ValueError(
...@@ -350,7 +354,7 @@ class CustomTokenizer(object): ...@@ -350,7 +354,7 @@ class CustomTokenizer(object):
""" """
out_string = (out_string.replace(" .", ".").replace(" ?", "?").replace( out_string = (out_string.replace(" .", ".").replace(" ?", "?").replace(
" !", "!").replace(" ,", ",").replace(" ' ", "'").replace( " !", "!").replace(" ,", ",").replace(" ' ", "'").replace(
" n't", " n't",
"n't").replace(" 'm", "'m").replace(" 's", "'s").replace( "n't").replace(" 'm", "'m").replace(" 's", "'s").replace(
" 've", "'ve").replace(" 're", "'re")) " 've", "'ve").replace(" 're", "'re"))
return out_string return out_string
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册