diff --git a/python/paddle/dataset/wmt14.py b/python/paddle/dataset/wmt14.py index f8c1a33574e642b21feb6843d115b7f4205ef250..adc0c1aac80cbdb0b0c04535fc39b6a172d23eec 100644 --- a/python/paddle/dataset/wmt14.py +++ b/python/paddle/dataset/wmt14.py @@ -89,7 +89,8 @@ def reader_creator(tar_file, file_name, dict_size): ] for name in names: for line in f.extractfile(name): - line_split = line.strip().split(six.b('\t')) + line = cpt.to_text(line) + line_split = line.strip().split('\t') if len(line_split) != 2: continue src_seq = line_split[0] # one source sequence diff --git a/python/paddle/dataset/wmt16.py b/python/paddle/dataset/wmt16.py index f30dcd518ea6c0c685d027ede3ad6e0a1cb0c82c..9c02e0f41b04e113251e0fda72ca8abd976ab6f7 100644 --- a/python/paddle/dataset/wmt16.py +++ b/python/paddle/dataset/wmt16.py @@ -64,7 +64,8 @@ def __build_dict(tar_file, dict_size, save_path, lang): word_dict = defaultdict(int) with tarfile.open(tar_file, mode="r") as f: for line in f.extractfile("wmt16/train"): - line_split = line.strip().split(six.b("\t")) + line = cpt.to_text(line) + line_split = line.strip().split("\t") if len(line_split) != 2: continue sen = line_split[0] if lang == "en" else line_split[1] for w in sen.split(): @@ -123,7 +124,8 @@ def reader_creator(tar_file, file_name, src_dict_size, trg_dict_size, src_lang): with tarfile.open(tar_file, mode="r") as f: for line in f.extractfile(file_name): - line_split = line.strip().split(six.b("\t")) + line = cpt.to_text(line) + line_split = line.strip().split("\t") if len(line_split) != 2: continue src_words = line_split[src_col].split()