Remove some duplicate code (#832)

- Remove some duplicate code - Fix the handling of out-of-vocabulary words (tokens missing from the vocab), which previously could not be processed

Remove some duplicate code (#832)
- Remove some duplicate code - Fix the handling of out-of-vocabulary words (tokens missing from the vocab), which previously could not be processed
a2d33595 · Andy · GitHub · 2afdd196 · a2d33595
显示空白变更内容
内联并排

Showing 1 changed file with 13 additions and 9 deletions

paddlehub/tokenizer/tokenizer.py paddlehub/tokenizer/tokenizer.py +13 -9

未找到文件。
--- a/paddlehub/tokenizer/tokenizer.py
+++ b/paddlehub/tokenizer/tokenizer.py
@@ -78,7 +78,11 @@ class CustomTokenizer(object):

    def _convert_token_to_id(self, token):
        """ Converts a token (str) in an id using the vocab. """
-        return self.vocab.get(token, None)
+        v = self.vocab.get(token, None)
+        if v:
+            return v
+        else:
+            return 0

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
@@ -123,8 +127,8 @@ class CustomTokenizer(object):
        ids = []
        for token in tokens:
            wid = self._convert_token_to_id(token)
-            if wid:
-                ids.append(self._convert_token_to_id(token))
+            if wid is not None:
+                ids.append(wid)
        return ids

    def tokenize(self, text):
@@ -204,7 +208,7 @@ class CustomTokenizer(object):
            if isinstance(text, str):
                tokens = self.tokenize(text)
                ids = self.convert_tokens_to_ids(tokens)
-                return self.convert_tokens_to_ids(tokens)
+                return ids
            elif isinstance(text,
                            (list, tuple)) and len(text) > 0 and isinstance(
                text[0], str):