diff --git a/python/paddle/v2/dataset/cifar.py b/python/paddle/v2/dataset/cifar.py
index d9f7a830ee60a331b55a1e218923e690103e1c5b..3a8b98b8f045b0eb58be69649486cbd0a571f118 100644
--- a/python/paddle/v2/dataset/cifar.py
+++ b/python/paddle/v2/dataset/cifar.py
@@ -20,7 +20,7 @@ TODO(yuyang18): Complete the comments.
 import cPickle
 import itertools
 import numpy
-import paddle.v2.dataset.common
+from common import download
 import tarfile
 
 __all__ = ['train100', 'test100', 'train10', 'test10']
@@ -55,23 +55,23 @@ def reader_creator(filename, sub_name):
 
 def train100():
     return reader_creator(
-        paddle.v2.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5),
-        'train')
+        download(CIFAR100_URL, 'cifar', CIFAR100_MD5), 'train')
 
 
 def test100():
-    return reader_creator(
-        paddle.v2.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5),
-        'test')
+    return reader_creator(download(CIFAR100_URL, 'cifar', CIFAR100_MD5), 'test')
 
 
 def train10():
     return reader_creator(
-        paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5),
-        'data_batch')
+        download(CIFAR10_URL, 'cifar', CIFAR10_MD5), 'data_batch')
 
 
 def test10():
     return reader_creator(
-        paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5),
-        'test_batch')
+        download(CIFAR10_URL, 'cifar', CIFAR10_MD5), 'test_batch')
+
+
+def fetch():
+    download(CIFAR10_URL, 'cifar', CIFAR10_MD5)
+    download(CIFAR100_URL, 'cifar', CIFAR100_MD5)
diff --git a/python/paddle/v2/dataset/common.py b/python/paddle/v2/dataset/common.py
index 3021b68ddb02ecaa874e21681796c0912ad4cc06..7021a6da05dec6be216534112c2df2586e73390f 100644
--- a/python/paddle/v2/dataset/common.py
+++ b/python/paddle/v2/dataset/common.py
@@ -17,6 +17,8 @@ import hashlib
 import os
 import shutil
 import sys
+import importlib
+import paddle.v2.dataset
 
 __all__ = ['DATA_HOME', 'download', 'md5file']
 
@@ -69,3 +71,13 @@ def dict_add(a_dict, ele):
         a_dict[ele] += 1
     else:
         a_dict[ele] = 1
+
+
+def fetch_all():
+    for module_name in filter(lambda x: not x.startswith("__"),
+                              dir(paddle.v2.dataset)):
+        if "fetch" in dir(
+                importlib.import_module("paddle.v2.dataset.%s" % module_name)):
+            getattr(
+                importlib.import_module("paddle.v2.dataset.%s" % module_name),
+                "fetch")()
diff --git a/python/paddle/v2/dataset/conll05.py b/python/paddle/v2/dataset/conll05.py
index 9eab49ee39325c1c60fc511e0bd834e83aa987f0..f1b0ce16f21ad13d4564242c2359355236093032 100644
--- a/python/paddle/v2/dataset/conll05.py
+++ b/python/paddle/v2/dataset/conll05.py
@@ -196,3 +196,11 @@ def test():
         words_name='conll05st-release/test.wsj/words/test.wsj.words.gz',
         props_name='conll05st-release/test.wsj/props/test.wsj.props.gz')
     return reader_creator(reader, word_dict, verb_dict, label_dict)
+
+
+def fetch():
+    download(WORDDICT_URL, 'conll05st', WORDDICT_MD5)
+    download(VERBDICT_URL, 'conll05st', VERBDICT_MD5)
+    download(TRGDICT_URL, 'conll05st', TRGDICT_MD5)
+    download(EMB_URL, 'conll05st', EMB_MD5)
+    download(DATA_URL, 'conll05st', DATA_MD5)
diff --git a/python/paddle/v2/dataset/imdb.py b/python/paddle/v2/dataset/imdb.py
index 76019d9f54020ff6f02c17eb6047cbd014a8ccf2..5284017ce08de8beb559f58fb6006639f40f5580 100644
--- a/python/paddle/v2/dataset/imdb.py
+++ b/python/paddle/v2/dataset/imdb.py
@@ -123,3 +123,7 @@ def test(word_idx):
 def word_dict():
     return build_dict(
         re.compile("aclImdb/((train)|(test))/((pos)|(neg))/.*\.txt$"), 150)
+
+
+def fetch():
+    paddle.v2.dataset.common.download(URL, 'imdb', MD5)
diff --git a/python/paddle/v2/dataset/imikolov.py b/python/paddle/v2/dataset/imikolov.py
index 97c160f111d09d61eb860c7f02552e635f2400a7..2931d06e7eb65bde887c56a8bc20e7a9c5e4d4e4 100644
--- a/python/paddle/v2/dataset/imikolov.py
+++ b/python/paddle/v2/dataset/imikolov.py
@@ -89,3 +89,7 @@ def train(word_idx, n):
 
 def test(word_idx, n):
     return reader_creator('./simple-examples/data/ptb.valid.txt', word_idx, n)
+
+
+def fetch():
+    paddle.v2.dataset.common.download(URL, "imikolov", MD5)
diff --git a/python/paddle/v2/dataset/mnist.py b/python/paddle/v2/dataset/mnist.py
index 16f2fcb99de4cb1971a7375a97b5daa209ee95ef..48a39b5493a8004d6eb034498a797af9c662bd19 100644
--- a/python/paddle/v2/dataset/mnist.py
+++ b/python/paddle/v2/dataset/mnist.py
@@ -106,3 +106,10 @@ def test():
                                          TEST_IMAGE_MD5),
        paddle.v2.dataset.common.download(TEST_LABEL_URL, 'mnist',
                                          TEST_LABEL_MD5), 100)
+
+
+def fetch():
+    paddle.v2.dataset.common.download(TRAIN_IMAGE_URL, 'mnist', TRAIN_IMAGE_MD5)
+    paddle.v2.dataset.common.download(TRAIN_LABEL_URL, 'mnist', TRAIN_LABEL_MD5)
+    paddle.v2.dataset.common.download(TEST_IMAGE_URL, 'mnist', TEST_IMAGE_MD5)
+    paddle.v2.dataset.common.download(TEST_LABEL_URL, 'mnist', TEST_LABEL_MD5)
diff --git a/python/paddle/v2/dataset/movielens.py b/python/paddle/v2/dataset/movielens.py
index 25fd8227da2f219d75c6b830e65627ecf35be453..e148ddeca0370cd76128a31ce3a4d488e9737d98 100644
--- a/python/paddle/v2/dataset/movielens.py
+++ b/python/paddle/v2/dataset/movielens.py
@@ -30,6 +30,9 @@ __all__ = [
 
 age_table = [1, 18, 25, 35, 45, 50, 56]
 
+URL = 'http://files.grouplens.org/datasets/movielens/ml-1m.zip'
+MD5 = 'c4d9eecfca2ab87c1945afe126590906'
+
 
 class MovieInfo(object):
     def __init__(self, index, categories, title):
@@ -77,10 +80,7 @@ USER_INFO = None
 
 
 def __initialize_meta_info__():
-    fn = download(
-        url='http://files.grouplens.org/datasets/movielens/ml-1m.zip',
-        module_name='movielens',
-        md5sum='c4d9eecfca2ab87c1945afe126590906')
+    fn = download(URL, "movielens", MD5)
     global MOVIE_INFO
     if MOVIE_INFO is None:
         pattern = re.compile(r'^(.*)\((\d+)\)$')
@@ -205,5 +205,9 @@ def unittest():
     print train_count, test_count
 
 
+def fetch():
+    download(URL, "movielens", MD5)
+
+
 if __name__ == '__main__':
     unittest()
diff --git a/python/paddle/v2/dataset/sentiment.py b/python/paddle/v2/dataset/sentiment.py
index 71689fd61b6b14a7b5072caff4e2fd48a7f74072..0eeb6d5affd8c280fb74edc82cf24bf418ca8ef9 100644
--- a/python/paddle/v2/dataset/sentiment.py
+++ b/python/paddle/v2/dataset/sentiment.py
@@ -125,3 +125,7 @@ def test():
     """
     data_set = load_sentiment_data()
     return reader_creator(data_set[NUM_TRAINING_INSTANCES:])
+
+
+def fetch():
+    nltk.download('movie_reviews', download_dir=common.DATA_HOME)
diff --git a/python/paddle/v2/dataset/uci_housing.py b/python/paddle/v2/dataset/uci_housing.py
index 27f454b137e3a40febd19cf085e2f4034cc16b24..dab8620441c966b19d8218025f8d8fa5b40d1c2c 100644
--- a/python/paddle/v2/dataset/uci_housing.py
+++ b/python/paddle/v2/dataset/uci_housing.py
@@ -89,3 +89,7 @@ def test():
             yield d[:-1], d[-1:]
 
     return reader
+
+
+def fetch():
+    download(URL, 'uci_housing', MD5)
diff --git a/python/paddle/v2/dataset/wmt14.py b/python/paddle/v2/dataset/wmt14.py
index c686870a497668517d1c78c11c616ad8a71a2980..ee63a93f5ad918b5bbc949ae6ba29082b3f6abd5 100644
--- a/python/paddle/v2/dataset/wmt14.py
+++ b/python/paddle/v2/dataset/wmt14.py
@@ -16,7 +16,7 @@ wmt14 dataset
 """
 import tarfile
 
-import paddle.v2.dataset.common
+from paddle.v2.dataset.common import download
 
 __all__ = ['train', 'test', 'build_dict']
 
@@ -95,11 +95,13 @@ def reader_creator(tar_file, file_name, dict_size):
 
 def train(dict_size):
     return reader_creator(
-        paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN),
-        'train/train', dict_size)
+        download(URL_TRAIN, 'wmt14', MD5_TRAIN), 'train/train', dict_size)
 
 
 def test(dict_size):
     return reader_creator(
-        paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN),
-        'test/test', dict_size)
+        download(URL_TRAIN, 'wmt14', MD5_TRAIN), 'test/test', dict_size)
+
+
+def fetch():
+    download(URL_TRAIN, 'wmt14', MD5_TRAIN)
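
Usage note: with this patch, each dataset module exposes a fetch() hook and common.fetch_all() walks the paddle.v2.dataset package, calling every hook it finds. A minimal sketch of pre-populating the dataset cache this way; the standalone script below is illustrative and not part of the patch, it only relies on fetch_all() and DATA_HOME as defined above:

# Illustrative pre-fetch script (not part of this patch): warm the dataset
# cache on a machine with network access, e.g. before offline training.
import paddle.v2.dataset.common

if __name__ == '__main__':
    # fetch_all() imports each paddle.v2.dataset.<module> and, when the module
    # defines fetch(), calls it so the files are downloaded under DATA_HOME.
    paddle.v2.dataset.common.fetch_all()
    print 'datasets cached under %s' % paddle.v2.dataset.common.DATA_HOME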