提交 cbabaa45 编写于 作者: Y Yancey1989

convert dataset into recordio format

上级 6512893b
...@@ -133,7 +133,7 @@ def convert(path): ...@@ -133,7 +133,7 @@ def convert(path):
""" """
Converts dataset to recordio format Converts dataset to recordio format
""" """
paddle.v2.dataset.common.convert(path, train100(), 10, "cifar_train100") paddle.v2.dataset.common.convert(path, train100(), 1000, "cifar_train100")
paddle.v2.dataset.common.convert(path, test100(), 10, "cifar_test100") paddle.v2.dataset.common.convert(path, test100(), 1000, "cifar_test100")
paddle.v2.dataset.common.convert(path, train10(), 10, "cifar_train10") paddle.v2.dataset.common.convert(path, train10(), 1000, "cifar_train10")
paddle.v2.dataset.common.convert(path, test10(), 10, "cifar_test10") paddle.v2.dataset.common.convert(path, test10(), 1000, "cifar_test10")
...@@ -32,19 +32,24 @@ __all__ = [ ...@@ -32,19 +32,24 @@ __all__ = [
DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset') DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset')
# When running unit tests, there could be multiple processes that # When running unit tests, there could be multiple processes that
# trying to create DATA_HOME directory simultaneously, so we cannot # trying to create DATA_HOME directory simultaneously, so we cannot
# use a if condition to check for the existence of the directory; # use a if condition to check for the existence of the directory;
# instead, we use the filesystem as the synchronization mechanism by # instead, we use the filesystem as the synchronization mechanism by
# catching returned errors. # catching returned errors.
def must_mkdirs(path):
    """Create directory *path* (including parents) if it does not exist.

    When running unit tests, multiple processes may try to create the
    same directory simultaneously, so an existence check is racy;
    instead we let the filesystem arbitrate: attempt the mkdir and
    ignore only the "already exists" error.

    :param path: directory to create.
    :raises OSError: any failure other than EEXIST (e.g. permissions).
    """
    try:
        # BUG FIX: the original called os.makedirs(DATA_HOME) here,
        # ignoring the ``path`` argument entirely — callers passing any
        # other directory (e.g. per-dataset output dirs) got nothing
        # created. Use the parameter.
        os.makedirs(path)
    except OSError as exc:
        if exc.errno != errno.EEXIST:
            raise
must_mkdirs(DATA_HOME)
def md5file(fname): def md5file(fname):
hash_md5 = hashlib.md5() hash_md5 = hashlib.md5()
f = open(fname, "rb") f = open(fname, "rb")
...@@ -93,6 +98,19 @@ def fetch_all(): ...@@ -93,6 +98,19 @@ def fetch_all():
"fetch")() "fetch")()
def fetch_all_recordio(path):
    """Convert every dataset under ``paddle.v2.dataset`` to recordio format.

    Iterates the public dataset modules (skipping dunder attributes and
    the ``common`` module itself); for each module that exposes a
    ``convert`` function, creates ``<path>/<module_name>`` and invokes
    ``convert`` with that directory so the recordio files land there.

    :param path: root output directory; one sub-directory per dataset.
    """
    for module_name in filter(lambda x: not x.startswith("__"),
                              dir(paddle.v2.dataset)):
        if module_name == "common":
            continue
        # Import once and reuse — the original imported each module twice
        # (once for the ``dir`` check, again inside ``getattr``).
        module = importlib.import_module("paddle.v2.dataset.%s" % module_name)
        if "convert" in dir(module):
            ds_path = os.path.join(path, module_name)
            must_mkdirs(ds_path)
            module.convert(ds_path)
def split(reader, line_count, suffix="%05d.pickle", dumper=cPickle.dump): def split(reader, line_count, suffix="%05d.pickle", dumper=cPickle.dump):
""" """
you can call the function as: you can call the function as:
......
...@@ -233,5 +233,5 @@ def convert(path): ...@@ -233,5 +233,5 @@ def convert(path):
""" """
Converts dataset to recordio format Converts dataset to recordio format
""" """
paddle.v2.dataset.common.convert(path, test(), 10, "conl105_train") paddle.v2.dataset.common.convert(path, test(), 1000, "conl105_train")
paddle.v2.dataset.common.convert(path, test(), 10, "conl105_test") paddle.v2.dataset.common.convert(path, test(), 1000, "conl105_test")
...@@ -173,5 +173,5 @@ def convert(path): ...@@ -173,5 +173,5 @@ def convert(path):
Converts dataset to recordio format Converts dataset to recordio format
""" """
w = word_dict() w = word_dict()
paddle.v2.dataset.common.convert(path, lambda: train(w), 10, "imdb_train") paddle.v2.dataset.common.convert(path, lambda: train(w), 1000, "imdb_train")
paddle.v2.dataset.common.convert(path, lambda: test(w), 10, "imdb_test") paddle.v2.dataset.common.convert(path, lambda: test(w), 1000, "imdb_test")
...@@ -155,6 +155,7 @@ def convert(path): ...@@ -155,6 +155,7 @@ def convert(path):
N = 5 N = 5
word_dict = build_dict() word_dict = build_dict()
paddle.v2.dataset.common.convert(path, paddle.v2.dataset.common.convert(path,
train(word_dict, N), 10, "imikolov_train") train(word_dict, N), 1000,
"imikolov_train")
paddle.v2.dataset.common.convert(path, paddle.v2.dataset.common.convert(path,
test(word_dict, N), 10, "imikolov_test") test(word_dict, N), 1000, "imikolov_test")
...@@ -119,5 +119,5 @@ def convert(path): ...@@ -119,5 +119,5 @@ def convert(path):
""" """
Converts dataset to recordio format Converts dataset to recordio format
""" """
paddle.v2.dataset.common.convert(path, train(), 10, "minist_train") paddle.v2.dataset.common.convert(path, train(), 1000, "minist_train")
paddle.v2.dataset.common.convert(path, test(), 10, "minist_test") paddle.v2.dataset.common.convert(path, test(), 1000, "minist_test")
...@@ -254,8 +254,8 @@ def convert(path): ...@@ -254,8 +254,8 @@ def convert(path):
""" """
Converts dataset to recordio format Converts dataset to recordio format
""" """
paddle.v2.dataset.common.convert(path, train(), 10, "movielens_train") paddle.v2.dataset.common.convert(path, train(), 1000, "movielens_train")
paddle.v2.dataset.common.convert(path, test(), 10, "movielens_test") paddle.v2.dataset.common.convert(path, test(), 1000, "movielens_test")
if __name__ == '__main__': if __name__ == '__main__':
......
...@@ -137,5 +137,5 @@ def convert(path): ...@@ -137,5 +137,5 @@ def convert(path):
""" """
Converts dataset to recordio format Converts dataset to recordio format
""" """
paddle.v2.dataset.common.convert(path, train, 10, "sentiment_train") paddle.v2.dataset.common.convert(path, train, 1000, "sentiment_train")
paddle.v2.dataset.common.convert(path, test, 10, "sentiment_test") paddle.v2.dataset.common.convert(path, test, 1000, "sentiment_test")
...@@ -119,5 +119,5 @@ def convert(path): ...@@ -119,5 +119,5 @@ def convert(path):
""" """
Converts dataset to recordio format Converts dataset to recordio format
""" """
paddle.v2.dataset.common.convert(path, train(), 10, "uci_housing_train") paddle.v2.dataset.common.convert(path, train(), 1000, "uci_housing_train")
paddle.v2.dataset.common.convert(path, test(), 10, "uci_houseing_test") paddle.v2.dataset.common.convert(path, test(), 1000, "uci_houseing_test")
...@@ -169,5 +169,6 @@ def convert(path): ...@@ -169,5 +169,6 @@ def convert(path):
Converts dataset to recordio format Converts dataset to recordio format
""" """
dict_size = 30000 dict_size = 30000
paddle.v2.dataset.common.convert(path, train(dict_size), 10, "wmt14_train") paddle.v2.dataset.common.convert(path,
paddle.v2.dataset.common.convert(path, test(dict_size), 10, "wmt14_test") train(dict_size), 1000, "wmt14_train")
paddle.v2.dataset.common.convert(path, test(dict_size), 1000, "wmt14_test")
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册