diff --git a/paddlehub/tokenizer/tokenizer.py b/paddlehub/tokenizer/tokenizer.py
index 4ad6db48cf0f9da4da9b8c7285cad96d0d2bf974..f2f9111a1fbd974cb6ac1801a604d766228f29b0 100644
--- a/paddlehub/tokenizer/tokenizer.py
+++ b/paddlehub/tokenizer/tokenizer.py
@@ -78,7 +78,11 @@ class CustomTokenizer(object):
 
     def _convert_token_to_id(self, token):
         """ Converts a token (str) in an id using the vocab. """
-        return self.vocab.get(token, None)
+        v = self.vocab.get(token, None)
+        if v is not None:
+            return v
+        else:
+            return 0
 
     def _convert_id_to_token(self, index):
         """Converts an index (integer) in a token (str) using the vocab."""
@@ -123,8 +127,8 @@ class CustomTokenizer(object):
         ids = []
         for token in tokens:
             wid = self._convert_token_to_id(token)
-            if wid:
-                ids.append(self._convert_token_to_id(token))
+            if wid is not None:
+                ids.append(wid)
         return ids
 
     def tokenize(self, text):
@@ -204,14 +208,14 @@ class CustomTokenizer(object):
         if isinstance(text, str):
             tokens = self.tokenize(text)
             ids = self.convert_tokens_to_ids(tokens)
-            return self.convert_tokens_to_ids(tokens)
+            return ids
         elif isinstance(text,
                         (list, tuple)) and len(text) > 0 and isinstance(
-                            text[0], str):
+                text[0], str):
             return self.convert_tokens_to_ids(text)
         elif isinstance(text,
                         (list, tuple)) and len(text) > 0 and isinstance(
-                            text[0], int):
+                text[0], int):
             return text
         else:
             raise ValueError(
@@ -350,7 +354,7 @@ class CustomTokenizer(object):
         """
         out_string = (out_string.replace(" .", ".").replace(" ?", "?").replace(
             " !", "!").replace(" ,", ",").replace(" ' ", "'").replace(
-                " n't",
-                "n't").replace(" 'm", "'m").replace(" 's", "'s").replace(
-                " 've", "'ve").replace(" 're", "'re"))
+            " n't",
+            "n't").replace(" 'm", "'m").replace(" 's", "'s").replace(
+            " 've", "'ve").replace(" 're", "'re"))
         return out_string