From cbabaa45444e3f2fe183ff69c78d753f3a5c2234 Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Fri, 4 Aug 2017 18:05:50 +0800 Subject: [PATCH] convert dataset into recordio format --- python/paddle/v2/dataset/cifar.py | 8 +++---- python/paddle/v2/dataset/common.py | 30 ++++++++++++++++++++----- python/paddle/v2/dataset/conll05.py | 4 ++-- python/paddle/v2/dataset/imdb.py | 4 ++-- python/paddle/v2/dataset/imikolov.py | 5 +++-- python/paddle/v2/dataset/mnist.py | 4 ++-- python/paddle/v2/dataset/movielens.py | 4 ++-- python/paddle/v2/dataset/sentiment.py | 4 ++-- python/paddle/v2/dataset/uci_housing.py | 4 ++-- python/paddle/v2/dataset/wmt14.py | 5 +++-- 10 files changed, 46 insertions(+), 26 deletions(-) diff --git a/python/paddle/v2/dataset/cifar.py b/python/paddle/v2/dataset/cifar.py index f885b2834e..0a2a1ced11 100644 --- a/python/paddle/v2/dataset/cifar.py +++ b/python/paddle/v2/dataset/cifar.py @@ -133,7 +133,7 @@ def convert(path): """ Converts dataset to recordio format """ - paddle.v2.dataset.common.convert(path, train100(), 10, "cifar_train100") - paddle.v2.dataset.common.convert(path, test100(), 10, "cifar_test100") - paddle.v2.dataset.common.convert(path, train10(), 10, "cifar_train10") - paddle.v2.dataset.common.convert(path, test10(), 10, "cifar_test10") + paddle.v2.dataset.common.convert(path, train100(), 1000, "cifar_train100") + paddle.v2.dataset.common.convert(path, test100(), 1000, "cifar_test100") + paddle.v2.dataset.common.convert(path, train10(), 1000, "cifar_train10") + paddle.v2.dataset.common.convert(path, test10(), 1000, "cifar_test10") diff --git a/python/paddle/v2/dataset/common.py b/python/paddle/v2/dataset/common.py index 111496618d..053ae151c5 100644 --- a/python/paddle/v2/dataset/common.py +++ b/python/paddle/v2/dataset/common.py @@ -32,17 +32,22 @@ __all__ = [ DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset') + # When running unit tests, there could be multiple processes that # trying to create DATA_HOME directory simultaneously, so we cannot # use a if condition to check for the existence of the directory; # instead, we use the filesystem as the synchronization mechanism by # catching returned errors. -try: - os.makedirs(DATA_HOME) -except OSError as exc: - if exc.errno != errno.EEXIST: - raise - pass +def must_mkdirs(path): + try: + os.makedirs(DATA_HOME) + except OSError as exc: + if exc.errno != errno.EEXIST: + raise + pass + + +must_mkdirs(DATA_HOME) def md5file(fname): @@ -93,6 +98,19 @@ def fetch_all(): "fetch")() +def fetch_all_recordio(path): + for module_name in filter(lambda x: not x.startswith("__"), + dir(paddle.v2.dataset)): + if "convert" in dir( + importlib.import_module("paddle.v2.dataset.%s" % module_name)) and \ + not module_name == "common": + ds_path = os.path.join(path, module_name) + must_mkdirs(ds_path) + getattr( + importlib.import_module("paddle.v2.dataset.%s" % module_name), + "convert")(ds_path) + + def split(reader, line_count, suffix="%05d.pickle", dumper=cPickle.dump): """ you can call the function as: diff --git a/python/paddle/v2/dataset/conll05.py b/python/paddle/v2/dataset/conll05.py index f8aae52e7c..23f5a24a1c 100644 --- a/python/paddle/v2/dataset/conll05.py +++ b/python/paddle/v2/dataset/conll05.py @@ -233,5 +233,5 @@ def convert(path): """ Converts dataset to recordio format """ - paddle.v2.dataset.common.convert(path, test(), 10, "conl105_train") - paddle.v2.dataset.common.convert(path, test(), 10, "conl105_test") + paddle.v2.dataset.common.convert(path, test(), 1000, "conl105_train") + paddle.v2.dataset.common.convert(path, test(), 1000, "conl105_test") diff --git a/python/paddle/v2/dataset/imdb.py b/python/paddle/v2/dataset/imdb.py index c0ec5992e0..93dd3e8f7d 100644 --- a/python/paddle/v2/dataset/imdb.py +++ b/python/paddle/v2/dataset/imdb.py @@ -173,5 +173,5 @@ def convert(path): Converts dataset to recordio format """ w = word_dict() - paddle.v2.dataset.common.convert(path, lambda: train(w), 10, "imdb_train") - paddle.v2.dataset.common.convert(path, lambda: test(w), 10, "imdb_test") + paddle.v2.dataset.common.convert(path, lambda: train(w), 1000, "imdb_train") + paddle.v2.dataset.common.convert(path, lambda: test(w), 1000, "imdb_test") diff --git a/python/paddle/v2/dataset/imikolov.py b/python/paddle/v2/dataset/imikolov.py index b18ee8e9ba..617c722c41 100644 --- a/python/paddle/v2/dataset/imikolov.py +++ b/python/paddle/v2/dataset/imikolov.py @@ -155,6 +155,7 @@ def convert(path): N = 5 word_dict = build_dict() paddle.v2.dataset.common.convert(path, - train(word_dict, N), 10, "imikolov_train") + train(word_dict, N), 1000, + "imikolov_train") paddle.v2.dataset.common.convert(path, - test(word_dict, N), 10, "imikolov_test") + test(word_dict, N), 1000, "imikolov_test") diff --git a/python/paddle/v2/dataset/mnist.py b/python/paddle/v2/dataset/mnist.py index ea5891f4f3..9f675bed89 100644 --- a/python/paddle/v2/dataset/mnist.py +++ b/python/paddle/v2/dataset/mnist.py @@ -119,5 +119,5 @@ def convert(path): """ Converts dataset to recordio format """ - paddle.v2.dataset.common.convert(path, train(), 10, "minist_train") - paddle.v2.dataset.common.convert(path, test(), 10, "minist_test") + paddle.v2.dataset.common.convert(path, train(), 1000, "minist_train") + paddle.v2.dataset.common.convert(path, test(), 1000, "minist_test") diff --git a/python/paddle/v2/dataset/movielens.py b/python/paddle/v2/dataset/movielens.py index d9372d422a..5b61a9420a 100644 --- a/python/paddle/v2/dataset/movielens.py +++ b/python/paddle/v2/dataset/movielens.py @@ -254,8 +254,8 @@ def convert(path): """ Converts dataset to recordio format """ - paddle.v2.dataset.common.convert(path, train(), 10, "movielens_train") - paddle.v2.dataset.common.convert(path, test(), 10, "movielens_test") + paddle.v2.dataset.common.convert(path, train(), 1000, "movielens_train") + paddle.v2.dataset.common.convert(path, test(), 1000, "movielens_test") if __name__ == '__main__': diff --git a/python/paddle/v2/dataset/sentiment.py b/python/paddle/v2/dataset/sentiment.py index e33f120c87..b0b9757c1a 100644 --- a/python/paddle/v2/dataset/sentiment.py +++ b/python/paddle/v2/dataset/sentiment.py @@ -137,5 +137,5 @@ def convert(path): """ Converts dataset to recordio format """ - paddle.v2.dataset.common.convert(path, train, 10, "sentiment_train") - paddle.v2.dataset.common.convert(path, test, 10, "sentiment_test") + paddle.v2.dataset.common.convert(path, train, 1000, "sentiment_train") + paddle.v2.dataset.common.convert(path, test, 1000, "sentiment_test") diff --git a/python/paddle/v2/dataset/uci_housing.py b/python/paddle/v2/dataset/uci_housing.py index ec10ce646e..ce60aa21c2 100644 --- a/python/paddle/v2/dataset/uci_housing.py +++ b/python/paddle/v2/dataset/uci_housing.py @@ -119,5 +119,5 @@ def convert(path): """ Converts dataset to recordio format """ - paddle.v2.dataset.common.convert(path, train(), 10, "uci_housing_train") - paddle.v2.dataset.common.convert(path, test(), 10, "uci_houseing_test") + paddle.v2.dataset.common.convert(path, train(), 1000, "uci_housing_train") + paddle.v2.dataset.common.convert(path, test(), 1000, "uci_houseing_test") diff --git a/python/paddle/v2/dataset/wmt14.py b/python/paddle/v2/dataset/wmt14.py index 2a631c365f..95a35d97ce 100644 --- a/python/paddle/v2/dataset/wmt14.py +++ b/python/paddle/v2/dataset/wmt14.py @@ -169,5 +169,6 @@ def convert(path): Converts dataset to recordio format """ dict_size = 30000 - paddle.v2.dataset.common.convert(path, train(dict_size), 10, "wmt14_train") - paddle.v2.dataset.common.convert(path, test(dict_size), 10, "wmt14_test") + paddle.v2.dataset.common.convert(path, + train(dict_size), 1000, "wmt14_train") + paddle.v2.dataset.common.convert(path, test(dict_size), 1000, "wmt14_test") -- GitLab