From 2f16f47e945b2352060392a49982b6ea67af4379 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Sun, 30 Sep 2018 12:29:26 +0800 Subject: [PATCH] Fix dataset wmt16 --- python/paddle/dataset/wmt16.py | 3 ++- python/paddle/v2/dataset/wmt16.py | 9 ++++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/python/paddle/dataset/wmt16.py b/python/paddle/dataset/wmt16.py index 9c02e0f41b0..aa66696fae7 100644 --- a/python/paddle/dataset/wmt16.py +++ b/python/paddle/dataset/wmt16.py @@ -78,7 +78,8 @@ def __build_dict(tar_file, dict_size, save_path, lang): six.iteritems(word_dict), key=lambda x: x[1], reverse=True)): if idx + 3 == dict_size: break - fout.write("%s\n" % (word[0])) + fout.write(word[0].encode('utf-8')) + fout.write('\n') def __load_dict(tar_file, dict_size, lang, reverse=False): diff --git a/python/paddle/v2/dataset/wmt16.py b/python/paddle/v2/dataset/wmt16.py index c8818f715be..5793002091b 100644 --- a/python/paddle/v2/dataset/wmt16.py +++ b/python/paddle/v2/dataset/wmt16.py @@ -72,7 +72,8 @@ def __build_dict(tar_file, dict_size, save_path, lang): sorted( word_dict.iteritems(), key=lambda x: x[1], reverse=True)): if idx + 3 == dict_size: break - fout.write("%s\n" % (word[0])) + fout.write(word[0].encode('utf-8')) + fout.write('\n') def __load_dict(tar_file, dict_size, lang, reverse=False): @@ -300,8 +301,10 @@ def get_dict(lang, dict_size, reverse=False): dict: The word dictionary for the specific language. """ - if lang == "en": dict_size = min(dict_size, TOTAL_EN_WORDS) - else: dict_size = min(dict_size, TOTAL_DE_WORDS) + if lang == "en": + dict_size = min(dict_size, TOTAL_EN_WORDS) + else: + dict_size = min(dict_size, TOTAL_DE_WORDS) dict_path = os.path.join(paddle.v2.dataset.common.DATA_HOME, "wmt16/%s_%d.dict" % (lang, dict_size)) -- GitLab