Commit df92bdfc, authored by xixiaoyao

fix bug

Parent: c21afb28
@@ -162,10 +162,12 @@ class BasicTokenizer(object):

     def __init__(self, do_lower_case=True):
         """Constructs a BasicTokenizer.

         Args:
             do_lower_case: Whether to lower case the input.
         """
         self.do_lower_case = do_lower_case
+        self._never_lowercase = ['[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]']

     def tokenize(self, text):
         """Tokenizes a piece of text."""
@@ -183,9 +185,12 @@ class BasicTokenizer(object):
         orig_tokens = whitespace_tokenize(text)
         split_tokens = []
         for token in orig_tokens:
-            if self.do_lower_case:
+            if self.do_lower_case and token not in self._never_lowercase:
                 token = token.lower()
                 token = self._run_strip_accents(token)
-            split_tokens.extend(self._run_split_on_punc(token))
+            if token in self._never_lowercase:
+                split_tokens.extend([token])
+            else:
+                split_tokens.extend(self._run_split_on_punc(token))

         output_tokens = whitespace_tokenize(" ".join(split_tokens))
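Taken together, these two hunks make the basic tokenizer treat the five special tokens as atomic: before the fix, a literal [SEP] in the input was lowercased to [sep] and then split on its brackets into [, sep, ]. The following standalone sketch approximates the fixed control flow; it substitutes a simple regex for _run_split_on_punc and omits accent stripping, so it illustrates the logic rather than reproducing the module's actual code.

import re

# Illustrative module-level stand-ins; the real code keeps this state on
# BasicTokenizer and uses whitespace_tokenize()/_run_split_on_punc().
NEVER_LOWERCASE = ['[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]']

def basic_tokenize(text, do_lower_case=True):
    split_tokens = []
    for token in text.strip().split():
        if do_lower_case and token not in NEVER_LOWERCASE:
            token = token.lower()          # accent stripping omitted here
        if token in NEVER_LOWERCASE:
            split_tokens.append(token)     # special tokens pass through intact
        else:
            # crude stand-in for _run_split_on_punc: isolate punctuation
            split_tokens.extend(re.findall(r"\w+|[^\w\s]", token))
    return split_tokens

print(basic_tokenize("Hello, world [SEP] next"))
# with the fix:   ['hello', ',', 'world', '[SEP]', 'next']
# before the fix: ['hello', ',', 'world', '[', 'sep', ']', 'next']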
@@ -281,14 +286,18 @@ class WordpieceTokenizer(object):

     def tokenize(self, text):
         """Tokenizes a piece of text into its word pieces.

         This uses a greedy longest-match-first algorithm to perform tokenization
         using the given vocabulary.

         For example:
             input = "unaffable"
             output = ["un", "##aff", "##able"]

         Args:
             text: A single token or whitespace separated tokens. This should have
                 already been passed through `BasicTokenizer`.

         Returns:
             A list of wordpiece tokens.
         """
...
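This last hunk shows only the unchanged docstring; the modified body of WordpieceTokenizer.tokenize is collapsed in this view (the hunk header indicates four lines were added). For reference, here is a minimal sketch of the greedy longest-match-first algorithm the docstring describes; vocab, unk_token, and max_chars_per_word are illustrative parameters, not necessarily the class's actual attributes.

def wordpiece_tokenize(word, vocab, unk_token="[UNK]", max_chars_per_word=100):
    if len(word) > max_chars_per_word:
        return [unk_token]
    tokens = []
    start = 0
    while start < len(word):
        # Try the longest remaining substring first, shrinking from the right.
        end = len(word)
        cur_substr = None
        while start < end:
            substr = word[start:end]
            if start > 0:
                substr = "##" + substr   # continuation pieces carry the ## prefix
            if substr in vocab:
                cur_substr = substr
                break
            end -= 1
        if cur_substr is None:
            return [unk_token]           # no piece matched: whole word is unknown
        tokens.append(cur_substr)
        start = end
    return tokens

vocab = {"un", "##aff", "##able"}
print(wordpiece_tokenize("unaffable", vocab))  # ['un', '##aff', '##able']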