From f3c92f118b6176995f092624a6b81be23c1b6bcb Mon Sep 17 00:00:00 2001 From: guoshengCS Date: Wed, 17 Jul 2019 12:44:36 +0800 Subject: [PATCH] Fix text decoding in Transformer under python3. --- .../transformer/infer.py | 8 +++---- .../transformer/reader.py | 24 ++++++++----------- .../transformer/train.py | 8 +++---- .../transformer/infer.py | 8 +++---- .../transformer/reader.py | 24 ++++++++----------- .../transformer/train.py | 8 +++---- 6 files changed, 36 insertions(+), 44 deletions(-) diff --git a/PaddleNLP/neural_machine_translation/transformer/infer.py b/PaddleNLP/neural_machine_translation/transformer/infer.py index cb40e685..8a6804e6 100644 --- a/PaddleNLP/neural_machine_translation/transformer/infer.py +++ b/PaddleNLP/neural_machine_translation/transformer/infer.py @@ -48,14 +48,14 @@ def parse_args(): help="The buffer size to pool data.") parser.add_argument( "--special_token", - type=str, - default=["", "", ""], + type=lambda x: x.encode(), + default=[b"", b"", b""], nargs=3, help="The , and tokens in the dictionary.") parser.add_argument( "--token_delimiter", - type=lambda x: str(x.encode().decode("unicode-escape")), - default=" ", + type=lambda x: x.encode(), + default=b" ", help="The delimiter used to split tokens in source or target sentences. " "For EN-DE BPE data we provided, use spaces as token delimiter. ") parser.add_argument( diff --git a/PaddleNLP/neural_machine_translation/transformer/reader.py b/PaddleNLP/neural_machine_translation/transformer/reader.py index 5cceec33..df8a4508 100644 --- a/PaddleNLP/neural_machine_translation/transformer/reader.py +++ b/PaddleNLP/neural_machine_translation/transformer/reader.py @@ -183,11 +183,11 @@ class DataReader(object): shuffle_seed=None, shuffle_batch=False, use_token_batch=False, - field_delimiter="\t", - token_delimiter=" ", - start_mark="", - end_mark="", - unk_mark="", + field_delimiter=b"\t", + token_delimiter=b" ", + start_mark=b"", + end_mark=b"", + unk_mark=b"", seed=0): self._src_vocab = self.load_dict(src_vocab_fpath) self._only_src = True @@ -254,9 +254,9 @@ class DataReader(object): if tar_fname is None: raise Exception("If tar file provided, please set tar_fname.") - f = tarfile.open(fpaths[0], "r") + f = tarfile.open(fpaths[0], "rb") for line in f.extractfile(tar_fname): - fields = line.strip("\n").split(self._field_delimiter) + fields = line.strip(b"\n").split(self._field_delimiter) if (not self._only_src and len(fields) == 2) or ( self._only_src and len(fields) == 1): yield fields @@ -267,9 +267,7 @@ class DataReader(object): with open(fpath, "rb") as f: for line in f: - if six.PY3: - line = line.decode("utf8", errors="ignore") - fields = line.strip("\n").split(self._field_delimiter) + fields = line.strip(b"\n").split(self._field_delimiter) if (not self._only_src and len(fields) == 2) or ( self._only_src and len(fields) == 1): yield fields @@ -279,12 +277,10 @@ class DataReader(object): word_dict = {} with open(dict_path, "rb") as fdict: for idx, line in enumerate(fdict): - if six.PY3: - line = line.decode("utf8", errors="ignore") if reverse: - word_dict[idx] = line.strip("\n") + word_dict[idx] = line.strip(b"\n") else: - word_dict[line.strip("\n")] = idx + word_dict[line.strip(b"\n")] = idx return word_dict def batch_generator(self): diff --git a/PaddleNLP/neural_machine_translation/transformer/train.py b/PaddleNLP/neural_machine_translation/transformer/train.py index fd27820b..09c50e45 100644 --- a/PaddleNLP/neural_machine_translation/transformer/train.py +++ b/PaddleNLP/neural_machine_translation/transformer/train.py @@ -86,14 +86,14 @@ def parse_args(): help="The flag indicating whether to shuffle the data batches.") parser.add_argument( "--special_token", - type=str, - default=["", "", ""], + type=lambda x: x.encode(), + default=[b"", b"", b""], nargs=3, help="The , and tokens in the dictionary.") parser.add_argument( "--token_delimiter", - type=lambda x: str(x.encode().decode("unicode-escape")), - default=" ", + type=lambda x: x.encode(), + default=b" ", help="The delimiter used to split tokens in source or target sentences. " "For EN-DE BPE data we provided, use spaces as token delimiter. ") parser.add_argument( diff --git a/PaddleNLP/unarchived/neural_machine_translation/transformer/infer.py b/PaddleNLP/unarchived/neural_machine_translation/transformer/infer.py index cf89607d..9f168199 100644 --- a/PaddleNLP/unarchived/neural_machine_translation/transformer/infer.py +++ b/PaddleNLP/unarchived/neural_machine_translation/transformer/infer.py @@ -46,14 +46,14 @@ def parse_args(): help="The buffer size to pool data.") parser.add_argument( "--special_token", - type=str, - default=["", "", ""], + type=lambda x: x.encode(), + default=[b"", b"", b""], nargs=3, help="The , and tokens in the dictionary.") parser.add_argument( "--token_delimiter", - type=lambda x: str(x.encode().decode("unicode-escape")), - default=" ", + type=lambda x: x.encode(), + default=b" ", help="The delimiter used to split tokens in source or target sentences. " "For EN-DE BPE data we provided, use spaces as token delimiter. ") parser.add_argument( diff --git a/PaddleNLP/unarchived/neural_machine_translation/transformer/reader.py b/PaddleNLP/unarchived/neural_machine_translation/transformer/reader.py index 0a846825..923e8187 100644 --- a/PaddleNLP/unarchived/neural_machine_translation/transformer/reader.py +++ b/PaddleNLP/unarchived/neural_machine_translation/transformer/reader.py @@ -182,11 +182,11 @@ class DataReader(object): shuffle=True, shuffle_batch=False, use_token_batch=False, - field_delimiter="\t", - token_delimiter=" ", - start_mark="", - end_mark="", - unk_mark="", + field_delimiter=b"\t", + token_delimiter=b" ", + start_mark=b"", + end_mark=b"", + unk_mark=b"", seed=0): self._src_vocab = self.load_dict(src_vocab_fpath) self._only_src = True @@ -252,9 +252,9 @@ class DataReader(object): if tar_fname is None: raise Exception("If tar file provided, please set tar_fname.") - f = tarfile.open(fpaths[0], "r") + f = tarfile.open(fpaths[0], "rb") for line in f.extractfile(tar_fname): - fields = line.strip("\n").split(self._field_delimiter) + fields = line.strip(b"\n").split(self._field_delimiter) if (not self._only_src and len(fields) == 2) or ( self._only_src and len(fields) == 1): yield fields @@ -265,9 +265,7 @@ class DataReader(object): with open(fpath, "rb") as f: for line in f: - if six.PY3: - line = line.decode("utf8", errors="ignore") - fields = line.strip("\n").split(self._field_delimiter) + fields = line.strip(b"\n").split(self._field_delimiter) if (not self._only_src and len(fields) == 2) or ( self._only_src and len(fields) == 1): yield fields @@ -277,12 +275,10 @@ class DataReader(object): word_dict = {} with open(dict_path, "rb") as fdict: for idx, line in enumerate(fdict): - if six.PY3: - line = line.decode("utf8", errors="ignore") if reverse: - word_dict[idx] = line.strip("\n") + word_dict[idx] = line.strip(b"\n") else: - word_dict[line.strip("\n")] = idx + word_dict[line.strip(b"\n")] = idx return word_dict def batch_generator(self): diff --git a/PaddleNLP/unarchived/neural_machine_translation/transformer/train.py b/PaddleNLP/unarchived/neural_machine_translation/transformer/train.py index 4313f8b4..ade64599 100644 --- a/PaddleNLP/unarchived/neural_machine_translation/transformer/train.py +++ b/PaddleNLP/unarchived/neural_machine_translation/transformer/train.py @@ -74,14 +74,14 @@ def parse_args(): help="The flag indicating whether to shuffle the data batches.") parser.add_argument( "--special_token", - type=str, - default=["", "", ""], + type=lambda x: x.encode(), + default=[b"", b"", b""], nargs=3, help="The , and tokens in the dictionary.") parser.add_argument( "--token_delimiter", - type=lambda x: str(x.encode().decode("unicode-escape")), - default=" ", + type=lambda x: x.encode(), + default=b" ", help="The delimiter used to split tokens in source or target sentences. " "For EN-DE BPE data we provided, use spaces as token delimiter. ") parser.add_argument( -- GitLab