未验证 提交 a2d33595 编写于 作者: A Andy 提交者: GitHub

Delete some duplicate code (#832)

- Delete some duplicate code
- Fix the inability to process unlogged (out-of-vocabulary) words
上级 2afdd196
......@@ -78,7 +78,11 @@ class CustomTokenizer(object):
def _convert_token_to_id(self, token):
""" Converts a token (str) in an id using the vocab. """
return self.vocab.get(token, None)
v = self.vocab.get(token, None)
if v:
return v
else:
return 0
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
......@@ -123,8 +127,8 @@ class CustomTokenizer(object):
ids = []
for token in tokens:
wid = self._convert_token_to_id(token)
if wid:
ids.append(self._convert_token_to_id(token))
if wid is not None:
ids.append(wid)
return ids
def tokenize(self, text):
......@@ -204,14 +208,14 @@ class CustomTokenizer(object):
if isinstance(text, str):
tokens = self.tokenize(text)
ids = self.convert_tokens_to_ids(tokens)
return self.convert_tokens_to_ids(tokens)
return ids
elif isinstance(text,
(list, tuple)) and len(text) > 0 and isinstance(
text[0], str):
text[0], str):
return self.convert_tokens_to_ids(text)
elif isinstance(text,
(list, tuple)) and len(text) > 0 and isinstance(
text[0], int):
text[0], int):
return text
else:
raise ValueError(
......@@ -350,7 +354,7 @@ class CustomTokenizer(object):
"""
out_string = (out_string.replace(" .", ".").replace(" ?", "?").replace(
" !", "!").replace(" ,", ",").replace(" ' ", "'").replace(
" n't",
"n't").replace(" 'm", "'m").replace(" 's", "'s").replace(
" 've", "'ve").replace(" 're", "'re"))
" n't",
"n't").replace(" 'm", "'m").replace(" 's", "'s").replace(
" 've", "'ve").replace(" 're", "'re"))
return out_string
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册