Delete some duplicate code (#832)

- Delete some duplicate code. - Fix the problem of not being able to process unlogged (out-of-vocabulary) words.

Delete some duplicate code (#832)
- Delete some duplicate code. - Fix the problem of not being able to process unlogged (out-of-vocabulary) words.
a2d33595 · Andy · GitHub · 2afdd196 · a2d33595
隐藏空白更改
内联并排

Showing 1 changed file with 13 additions and 9 deletions

paddlehub/tokenizer/tokenizer.py paddlehub/tokenizer/tokenizer.py +13 -9

未找到文件。
--- a/paddlehub/tokenizer/tokenizer.py
+++ b/paddlehub/tokenizer/tokenizer.py
@@ -78,7 +78,11 @@ class CustomTokenizer(object):
    def _convert_token_to_id(self, token):
        """ Converts a token (str) in an id using the vocab. """
-        return self.vocab.get(token, None)
+        v = self.vocab.get(token, None)
+        if v:
+            return v
+        else:
+            return 0
    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
@@ -123,8 +127,8 @@ class CustomTokenizer(object):
        ids = []
        for token in tokens:
            wid = self._convert_token_to_id(token)
-            if wid:
+            if wid is not None:
-                ids.append(self._convert_token_to_id(token))
+                ids.append(wid)
        return ids
    def tokenize(self, text):
@@ -204,14 +208,14 @@ class CustomTokenizer(object):
            if isinstance(text, str):
                tokens = self.tokenize(text)
                ids = self.convert_tokens_to_ids(tokens)
-                return self.convert_tokens_to_ids(tokens)
+                return ids
            elif isinstance(text,
                            (list, tuple)) and len(text) > 0 and isinstance(
-                                text[0], str):
+                text[0], str):
                return self.convert_tokens_to_ids(text)
            elif isinstance(text,
                            (list, tuple)) and len(text) > 0 and isinstance(
-                                text[0], int):
+                text[0], int):
                return text
            else:
                raise ValueError(
@@ -350,7 +354,7 @@ class CustomTokenizer(object):
        """
        out_string = (out_string.replace(" .", ".").replace(" ?", "?").replace(
            " !", "!").replace(" ,", ",").replace(" ' ", "'").replace(
-                " n't",
+            " n't",
-                "n't").replace(" 'm", "'m").replace(" 's", "'s").replace(
+            "n't").replace(" 'm", "'m").replace(" 's", "'s").replace(
-                    " 've", "'ve").replace(" 're", "'re"))
+            " 've", "'ve").replace(" 're", "'re"))
        return out_string