diff --git a/PaddleNLP/neural_machine_translation/transformer/infer.py b/PaddleNLP/neural_machine_translation/transformer/infer.py
index cb40e6851db11c54faf0a393bd26f5263b16ae65..8a6804e6a496888e503449190ddc74846b1cff6e 100644
--- a/PaddleNLP/neural_machine_translation/transformer/infer.py
+++ b/PaddleNLP/neural_machine_translation/transformer/infer.py
@@ -48,14 +48,14 @@ def parse_args():
         help="The buffer size to pool data.")
     parser.add_argument(
         "--special_token",
-        type=str,
-        default=["<s>", "<e>", "<unk>"],
+        type=lambda x: x.encode(),
+        default=[b"<s>", b"<e>", b"<unk>"],
         nargs=3,
         help="The <bos>, <eos> and <unk> tokens in the dictionary.")
     parser.add_argument(
         "--token_delimiter",
-        type=lambda x: str(x.encode().decode("unicode-escape")),
-        default=" ",
+        type=lambda x: x.encode(),
+        default=b" ",
         help="The delimiter used to split tokens in source or target sentences. "
         "For EN-DE BPE data we provided, use spaces as token delimiter. ")
     parser.add_argument(
diff --git a/PaddleNLP/neural_machine_translation/transformer/reader.py b/PaddleNLP/neural_machine_translation/transformer/reader.py
index 5cceec3377ef1ad833d87672f94bb8dd119a571b..df8a45082dad1caf3561090249dd41de0a6d2b17 100644
--- a/PaddleNLP/neural_machine_translation/transformer/reader.py
+++ b/PaddleNLP/neural_machine_translation/transformer/reader.py
@@ -183,11 +183,11 @@ class DataReader(object):
                  shuffle_seed=None,
                  shuffle_batch=False,
                  use_token_batch=False,
-                 field_delimiter="\t",
-                 token_delimiter=" ",
-                 start_mark="<s>",
-                 end_mark="<e>",
-                 unk_mark="<unk>",
+                 field_delimiter=b"\t",
+                 token_delimiter=b" ",
+                 start_mark=b"<s>",
+                 end_mark=b"<e>",
+                 unk_mark=b"<unk>",
                  seed=0):
         self._src_vocab = self.load_dict(src_vocab_fpath)
         self._only_src = True
@@ -254,9 +254,9 @@ class DataReader(object):
             if tar_fname is None:
                 raise Exception("If tar file provided, please set tar_fname.")
 
-            f = tarfile.open(fpaths[0], "r")
+            f = tarfile.open(fpaths[0], "rb")
             for line in f.extractfile(tar_fname):
-                fields = line.strip("\n").split(self._field_delimiter)
+                fields = line.strip(b"\n").split(self._field_delimiter)
                 if (not self._only_src and len(fields) == 2) or (
                         self._only_src and len(fields) == 1):
                     yield fields
@@ -267,9 +267,7 @@
 
                 with open(fpath, "rb") as f:
                     for line in f:
-                        if six.PY3:
-                            line = line.decode("utf8", errors="ignore")
-                        fields = line.strip("\n").split(self._field_delimiter)
+                        fields = line.strip(b"\n").split(self._field_delimiter)
                         if (not self._only_src and len(fields) == 2) or (
                                 self._only_src and len(fields) == 1):
                             yield fields
@@ -279,12 +277,10 @@
         word_dict = {}
         with open(dict_path, "rb") as fdict:
             for idx, line in enumerate(fdict):
-                if six.PY3:
-                    line = line.decode("utf8", errors="ignore")
                 if reverse:
-                    word_dict[idx] = line.strip("\n")
+                    word_dict[idx] = line.strip(b"\n")
                 else:
-                    word_dict[line.strip("\n")] = idx
+                    word_dict[line.strip(b"\n")] = idx
         return word_dict
 
     def batch_generator(self):
diff --git a/PaddleNLP/neural_machine_translation/transformer/train.py b/PaddleNLP/neural_machine_translation/transformer/train.py
index fd27820bd431799b57e3c082fd14407e1d1d48e9..09c50e453fd5b5bb7b25a69e65deb83c5e91b62f 100644
--- a/PaddleNLP/neural_machine_translation/transformer/train.py
+++ b/PaddleNLP/neural_machine_translation/transformer/train.py
@@ -86,14 +86,14 @@ def parse_args():
         help="The flag indicating whether to shuffle the data batches.")
     parser.add_argument(
         "--special_token",
-        type=str,
-        default=["<s>", "<e>", "<unk>"],
+        type=lambda x: x.encode(),
+        default=[b"<s>", b"<e>", b"<unk>"],
         nargs=3,
         help="The <bos>, <eos> and <unk> tokens in the dictionary.")
     parser.add_argument(
         "--token_delimiter",
-        type=lambda x: str(x.encode().decode("unicode-escape")),
-        default=" ",
+        type=lambda x: x.encode(),
+        default=b" ",
         help="The delimiter used to split tokens in source or target sentences. "
         "For EN-DE BPE data we provided, use spaces as token delimiter. ")
     parser.add_argument(
diff --git a/PaddleNLP/unarchived/neural_machine_translation/transformer/infer.py b/PaddleNLP/unarchived/neural_machine_translation/transformer/infer.py
index cf89607d7b103cbe65f28868a6ce6f92691f618a..9f168199b7a89697489c047d0e3b724b187421fc 100644
--- a/PaddleNLP/unarchived/neural_machine_translation/transformer/infer.py
+++ b/PaddleNLP/unarchived/neural_machine_translation/transformer/infer.py
@@ -46,14 +46,14 @@ def parse_args():
         help="The buffer size to pool data.")
     parser.add_argument(
         "--special_token",
-        type=str,
-        default=["<s>", "<e>", "<unk>"],
+        type=lambda x: x.encode(),
+        default=[b"<s>", b"<e>", b"<unk>"],
         nargs=3,
         help="The <bos>, <eos> and <unk> tokens in the dictionary.")
     parser.add_argument(
         "--token_delimiter",
-        type=lambda x: str(x.encode().decode("unicode-escape")),
-        default=" ",
+        type=lambda x: x.encode(),
+        default=b" ",
         help="The delimiter used to split tokens in source or target sentences. "
         "For EN-DE BPE data we provided, use spaces as token delimiter. ")
     parser.add_argument(
diff --git a/PaddleNLP/unarchived/neural_machine_translation/transformer/reader.py b/PaddleNLP/unarchived/neural_machine_translation/transformer/reader.py
index 0a846825e0027d2fef9ee4515a0ac2b887806c1c..923e818717d0d90548afb35177a58f461c27ec9c 100644
--- a/PaddleNLP/unarchived/neural_machine_translation/transformer/reader.py
+++ b/PaddleNLP/unarchived/neural_machine_translation/transformer/reader.py
@@ -182,11 +182,11 @@ class DataReader(object):
                  shuffle=True,
                  shuffle_batch=False,
                  use_token_batch=False,
-                 field_delimiter="\t",
-                 token_delimiter=" ",
-                 start_mark="<s>",
-                 end_mark="<e>",
-                 unk_mark="<unk>",
+                 field_delimiter=b"\t",
+                 token_delimiter=b" ",
+                 start_mark=b"<s>",
+                 end_mark=b"<e>",
+                 unk_mark=b"<unk>",
                  seed=0):
         self._src_vocab = self.load_dict(src_vocab_fpath)
         self._only_src = True
@@ -252,9 +252,9 @@ class DataReader(object):
             if tar_fname is None:
                 raise Exception("If tar file provided, please set tar_fname.")
 
-            f = tarfile.open(fpaths[0], "r")
+            f = tarfile.open(fpaths[0], "rb")
             for line in f.extractfile(tar_fname):
-                fields = line.strip("\n").split(self._field_delimiter)
+                fields = line.strip(b"\n").split(self._field_delimiter)
                 if (not self._only_src and len(fields) == 2) or (
                         self._only_src and len(fields) == 1):
                     yield fields
@@ -265,9 +265,7 @@
 
                 with open(fpath, "rb") as f:
                     for line in f:
-                        if six.PY3:
-                            line = line.decode("utf8", errors="ignore")
-                        fields = line.strip("\n").split(self._field_delimiter)
+                        fields = line.strip(b"\n").split(self._field_delimiter)
                         if (not self._only_src and len(fields) == 2) or (
                                 self._only_src and len(fields) == 1):
                             yield fields
@@ -277,12 +275,10 @@
         word_dict = {}
         with open(dict_path, "rb") as fdict:
             for idx, line in enumerate(fdict):
-                if six.PY3:
-                    line = line.decode("utf8", errors="ignore")
                 if reverse:
-                    word_dict[idx] = line.strip("\n")
+                    word_dict[idx] = line.strip(b"\n")
                 else:
-                    word_dict[line.strip("\n")] = idx
+                    word_dict[line.strip(b"\n")] = idx
         return word_dict
 
     def batch_generator(self):
diff --git a/PaddleNLP/unarchived/neural_machine_translation/transformer/train.py b/PaddleNLP/unarchived/neural_machine_translation/transformer/train.py
index 4313f8b441ee194935c7c47abc52271589c7765d..ade645997c8698c783372cf1cfd7c87a2c67effa 100644
--- a/PaddleNLP/unarchived/neural_machine_translation/transformer/train.py
+++ b/PaddleNLP/unarchived/neural_machine_translation/transformer/train.py
@@ -74,14 +74,14 @@ def parse_args():
         help="The flag indicating whether to shuffle the data batches.")
     parser.add_argument(
         "--special_token",
-        type=str,
-        default=["<s>", "<e>", "<unk>"],
+        type=lambda x: x.encode(),
+        default=[b"<s>", b"<e>", b"<unk>"],
         nargs=3,
         help="The <bos>, <eos> and <unk> tokens in the dictionary.")
     parser.add_argument(
         "--token_delimiter",
-        type=lambda x: str(x.encode().decode("unicode-escape")),
-        default=" ",
+        type=lambda x: x.encode(),
+        default=b" ",
         help="The delimiter used to split tokens in source or target sentences. "
         "For EN-DE BPE data we provided, use spaces as token delimiter. ")
     parser.add_argument(
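
In all four argparse hunks the change is the same: drop the old `str(x.encode().decode("unicode-escape"))` round-trip and keep every CLI-supplied token or delimiter as bytes, with bytes literals as the defaults, so the reader sees one consistent type on Python 2 and Python 3. A minimal sketch of that pattern, runnable on its own (the parser below is illustrative, not the model's full CLI):

    import argparse

    # Illustrative parser covering only the two options touched by the diff.
    parser = argparse.ArgumentParser(description="bytes-based token options")
    parser.add_argument(
        "--special_token",
        type=lambda x: x.encode(),  # each of the three values arrives as bytes
        default=[b"<s>", b"<e>", b"<unk>"],
        nargs=3,
        help="The <bos>, <eos> and <unk> tokens in the dictionary.")
    parser.add_argument(
        "--token_delimiter",
        type=lambda x: x.encode(),  # CLI string -> bytes
        default=b" ",
        help="The delimiter used to split tokens.")

    args = parser.parse_args([])  # no flags: the bytes defaults are used as-is
    assert args.special_token == [b"<s>", b"<e>", b"<unk>"]
    assert args.token_delimiter == b" "

    args = parser.parse_args(["--token_delimiter", ";"])
    assert args.token_delimiter == b";"  # type= ran, so the value is bytes

Note that argparse applies `type` only to values parsed from the command line (and to defaults that are plain strings), so a list default such as `["<s>", "<e>", "<unk>"]` would never pass through the lambda; that is why the defaults themselves are rewritten as bytes literals rather than left for `type` to convert.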
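The reader-side hunks apply the same rule end-to-end: files are opened in binary mode, and every delimiter, `strip` argument, and vocab key is bytes, which is what makes the old `six.PY3` decode branch unnecessary (on Python 3, calling `line.strip("\n")` on a bytes line raises TypeError). The tar branch needs no special casing either, since file objects returned by `TarFile.extractfile()` always yield bytes lines. A self-contained sketch of the two touched helpers, assuming made-up file names and a simplified single-field corpus:

    def load_dict(dict_path, reverse=False):
        # Vocab file read as bytes; tokens become bytes keys (or values).
        word_dict = {}
        with open(dict_path, "rb") as fdict:
            for idx, line in enumerate(fdict):
                token = line.strip(b"\n")
                if reverse:
                    word_dict[idx] = token
                else:
                    word_dict[token] = idx
        return word_dict

    def iter_fields(fpath, field_delimiter=b"\t"):
        # Binary mode yields bytes lines on Python 2 and 3 alike, so the
        # strip/split arguments must be bytes patterns as well.
        with open(fpath, "rb") as f:
            for line in f:
                yield line.strip(b"\n").split(field_delimiter)

    # Tiny demo inputs (hypothetical names), written so the sketch runs as-is.
    with open("vocab.txt", "wb") as f:
        f.write(b"<s>\n<e>\n<unk>\nhello\nworld\n")
    with open("corpus.txt", "wb") as f:
        f.write(b"hello world\thallo welt\n")

    vocab = load_dict("vocab.txt")  # {b"<s>": 0, ..., b"hello": 3, b"world": 4}
    for src, trg in iter_fields("corpus.txt"):
        print([vocab.get(tok, vocab[b"<unk>"]) for tok in src.split(b" ")])  # [3, 4]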