From 807c7a4747308cd9910248ec8dde80592bbb0173 Mon Sep 17 00:00:00 2001 From: Zeng Jinle <32832641+sneaxiy@users.noreply.github.com> Date: Mon, 26 Aug 2019 13:29:41 +0800 Subject: [PATCH] remove recordio convert in dataset, test=develop (#19387) --- python/paddle/dataset/cifar.py | 12 +---------- python/paddle/dataset/common.py | 14 ------------- python/paddle/dataset/conll05.py | 10 +-------- python/paddle/dataset/imdb.py | 11 +--------- python/paddle/dataset/imikolov.py | 14 +------------ python/paddle/dataset/mnist.py | 10 +-------- python/paddle/dataset/movielens.py | 11 +--------- python/paddle/dataset/sentiment.py | 10 +-------- python/paddle/dataset/uci_housing.py | 10 +-------- python/paddle/dataset/wmt14.py | 10 --------- python/paddle/dataset/wmt16.py | 31 ---------------------------- 11 files changed, 8 insertions(+), 135 deletions(-) diff --git a/python/paddle/dataset/cifar.py b/python/paddle/dataset/cifar.py index b83fa78c4c6..a68824a6608 100644 --- a/python/paddle/dataset/cifar.py +++ b/python/paddle/dataset/cifar.py @@ -37,7 +37,7 @@ import tarfile import six from six.moves import cPickle as pickle -__all__ = ['train100', 'test100', 'train10', 'test10', 'convert'] +__all__ = ['train100', 'test100', 'train10', 'test10'] URL_PREFIX = 'https://www.cs.toronto.edu/~kriz/' CIFAR10_URL = URL_PREFIX + 'cifar-10-python.tar.gz' @@ -144,13 +144,3 @@ def test10(cycle=False): def fetch(): paddle.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5) paddle.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5) - - -def convert(path): - """ - Converts dataset to recordio format - """ - paddle.dataset.common.convert(path, train100(), 1000, "cifar_train100") - paddle.dataset.common.convert(path, test100(), 1000, "cifar_test100") - paddle.dataset.common.convert(path, train10(), 1000, "cifar_train10") - paddle.dataset.common.convert(path, test10(), 1000, "cifar_test10") diff --git a/python/paddle/dataset/common.py b/python/paddle/dataset/common.py index 3567ecfa009..ce16e3b8518 100644 --- a/python/paddle/dataset/common.py +++ b/python/paddle/dataset/common.py @@ -119,20 +119,6 @@ def fetch_all(): "fetch")() -def fetch_all_recordio(path): - for module_name in [ - x for x in dir(paddle.dataset) if not x.startswith("__") - ]: - if "convert" in dir( - importlib.import_module("paddle.dataset.%s" % module_name)) and \ - not module_name == "common": - ds_path = os.path.join(path, module_name) - must_mkdirs(ds_path) - getattr( - importlib.import_module("paddle.dataset.%s" % module_name), - "convert")(ds_path) - - def split(reader, line_count, suffix="%05d.pickle", dumper=pickle.dump): """ you can call the function as: diff --git a/python/paddle/dataset/conll05.py b/python/paddle/dataset/conll05.py index 55cfd92721e..81a8cfc2e6a 100644 --- a/python/paddle/dataset/conll05.py +++ b/python/paddle/dataset/conll05.py @@ -29,7 +29,7 @@ import paddle.dataset.common import paddle.compat as cpt from six.moves import zip, range -__all__ = ['test, get_dict', 'get_embedding', 'convert'] +__all__ = ['test, get_dict', 'get_embedding'] DATA_URL = 'http://paddlemodels.bj.bcebos.com/conll05st/conll05st-tests.tar.gz' DATA_MD5 = '387719152ae52d60422c016e92a742fc' @@ -248,11 +248,3 @@ def fetch(): paddle.dataset.common.download(TRGDICT_URL, 'conll05st', TRGDICT_MD5) paddle.dataset.common.download(EMB_URL, 'conll05st', EMB_MD5) paddle.dataset.common.download(DATA_URL, 'conll05st', DATA_MD5) - - -def convert(path): - """ - Converts dataset to recordio format - """ - paddle.dataset.common.convert(path, test(), 1000, "conl105_train") - paddle.dataset.common.convert(path, test(), 1000, "conl105_test") diff --git a/python/paddle/dataset/imdb.py b/python/paddle/dataset/imdb.py index fd92523a947..99f4adc35c1 100644 --- a/python/paddle/dataset/imdb.py +++ b/python/paddle/dataset/imdb.py @@ -29,7 +29,7 @@ import re import string import six -__all__ = ['build_dict', 'train', 'test', 'convert'] +__all__ = ['build_dict', 'train', 'test'] URL = 'http://ai.stanford.edu/%7Eamaas/data/sentiment/aclImdb_v1.tar.gz' MD5 = '7c2ac02c03563afcf9b574c7e56c153a' @@ -140,12 +140,3 @@ def word_dict(): def fetch(): paddle.dataset.common.download(URL, 'imdb', MD5) - - -def convert(path): - """ - Converts dataset to recordio format - """ - w = word_dict() - paddle.dataset.common.convert(path, lambda: train(w), 1000, "imdb_train") - paddle.dataset.common.convert(path, lambda: test(w), 1000, "imdb_test") diff --git a/python/paddle/dataset/imikolov.py b/python/paddle/dataset/imikolov.py index 8eecb75231d..83cde3526ea 100644 --- a/python/paddle/dataset/imikolov.py +++ b/python/paddle/dataset/imikolov.py @@ -26,7 +26,7 @@ import collections import tarfile import six -__all__ = ['train', 'test', 'build_dict', 'convert'] +__all__ = ['train', 'test', 'build_dict'] URL = 'http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz' MD5 = '30177ea32e27c525793142b6bf2c8e2d' @@ -152,15 +152,3 @@ def test(word_idx, n, data_type=DataType.NGRAM): def fetch(): paddle.dataset.common.download(URL, "imikolov", MD5) - - -def convert(path): - """ - Converts dataset to recordio format - """ - N = 5 - word_dict = build_dict() - paddle.dataset.common.convert(path, - train(word_dict, N), 1000, "imikolov_train") - paddle.dataset.common.convert(path, - test(word_dict, N), 1000, "imikolov_test") diff --git a/python/paddle/dataset/mnist.py b/python/paddle/dataset/mnist.py index 138b21fd734..f52ffa049bc 100644 --- a/python/paddle/dataset/mnist.py +++ b/python/paddle/dataset/mnist.py @@ -25,7 +25,7 @@ import gzip import numpy import struct from six.moves import range -__all__ = ['train', 'test', 'convert'] +__all__ = ['train', 'test'] URL_PREFIX = 'https://dataset.bj.bcebos.com/mnist/' TEST_IMAGE_URL = URL_PREFIX + 't10k-images-idx3-ubyte.gz' @@ -126,11 +126,3 @@ def fetch(): paddle.dataset.common.download(TRAIN_LABEL_URL, 'mnist', TRAIN_LABEL_MD5) paddle.dataset.common.download(TEST_IMAGE_URL, 'mnist', TEST_IMAGE_MD5) paddle.dataset.common.download(TEST_LABEL_URL, 'mnist', TEST_LABEL_MD5) - - -def convert(path): - """ - Converts dataset to recordio format - """ - paddle.dataset.common.convert(path, train(), 1000, "minist_train") - paddle.dataset.common.convert(path, test(), 1000, "minist_test") diff --git a/python/paddle/dataset/movielens.py b/python/paddle/dataset/movielens.py index 64bf7414819..eddd858ace8 100644 --- a/python/paddle/dataset/movielens.py +++ b/python/paddle/dataset/movielens.py @@ -35,8 +35,7 @@ import paddle.compat as cpt __all__ = [ 'train', 'test', 'get_movie_title_dict', 'max_movie_id', 'max_user_id', - 'age_table', 'movie_categories', 'max_job_id', 'user_info', 'movie_info', - 'convert' + 'age_table', 'movie_categories', 'max_job_id', 'user_info', 'movie_info' ] age_table = [1, 18, 25, 35, 45, 50, 56] @@ -259,13 +258,5 @@ def fetch(): paddle.dataset.common.download(URL, "movielens", MD5) -def convert(path): - """ - Converts dataset to recordio format - """ - paddle.dataset.common.convert(path, train(), 1000, "movielens_train") - paddle.dataset.common.convert(path, test(), 1000, "movielens_test") - - if __name__ == '__main__': unittest() diff --git a/python/paddle/dataset/sentiment.py b/python/paddle/dataset/sentiment.py index 8051acb8812..9a1eae3f82a 100644 --- a/python/paddle/dataset/sentiment.py +++ b/python/paddle/dataset/sentiment.py @@ -31,7 +31,7 @@ from nltk.corpus import movie_reviews import paddle.dataset.common -__all__ = ['train', 'test', 'get_word_dict', 'convert'] +__all__ = ['train', 'test', 'get_word_dict'] NUM_TRAINING_INSTANCES = 1600 NUM_TOTAL_INSTANCES = 2000 @@ -134,11 +134,3 @@ def test(): def fetch(): nltk.download('movie_reviews', download_dir=paddle.dataset.common.DATA_HOME) - - -def convert(path): - """ - Converts dataset to recordio format - """ - paddle.dataset.common.convert(path, train, 1000, "sentiment_train") - paddle.dataset.common.convert(path, test, 1000, "sentiment_test") diff --git a/python/paddle/dataset/uci_housing.py b/python/paddle/dataset/uci_housing.py index 32d44a6bf78..5bc9c1444d2 100644 --- a/python/paddle/dataset/uci_housing.py +++ b/python/paddle/dataset/uci_housing.py @@ -34,7 +34,7 @@ URL = 'http://paddlemodels.bj.bcebos.com/uci_housing/housing.data' MD5 = 'd4accdce7a25600298819f8e28e8d593' feature_names = [ 'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', - 'PTRATIO', 'B', 'LSTAT', 'convert' + 'PTRATIO', 'B', 'LSTAT' ] UCI_TRAIN_DATA = None @@ -147,11 +147,3 @@ def predict_reader(): def fetch(): paddle.dataset.common.download(URL, 'uci_housing', MD5) - - -def convert(path): - """ - Converts dataset to recordio format - """ - paddle.dataset.common.convert(path, train(), 1000, "uci_housing_train") - paddle.dataset.common.convert(path, test(), 1000, "uci_houseing_test") diff --git a/python/paddle/dataset/wmt14.py b/python/paddle/dataset/wmt14.py index 450f159f9d1..129e1129fb9 100644 --- a/python/paddle/dataset/wmt14.py +++ b/python/paddle/dataset/wmt14.py @@ -33,7 +33,6 @@ __all__ = [ 'train', 'test', 'get_dict', - 'convert', ] URL_DEV_TEST = ('http://www-lium.univ-lemans.fr/~schwenk/' @@ -167,12 +166,3 @@ def get_dict(dict_size, reverse=True): def fetch(): paddle.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN) paddle.dataset.common.download(URL_MODEL, 'wmt14', MD5_MODEL) - - -def convert(path): - """ - Converts dataset to recordio format - """ - dict_size = 30000 - paddle.dataset.common.convert(path, train(dict_size), 1000, "wmt14_train") - paddle.dataset.common.convert(path, test(dict_size), 1000, "wmt14_test") diff --git a/python/paddle/dataset/wmt16.py b/python/paddle/dataset/wmt16.py index 770efe03a80..3e9007c8aaf 100644 --- a/python/paddle/dataset/wmt16.py +++ b/python/paddle/dataset/wmt16.py @@ -43,7 +43,6 @@ __all__ = [ "train", "test", "validation", - "convert", "fetch", "get_dict", ] @@ -325,33 +324,3 @@ def fetch(): """ paddle.v4.dataset.common.download(DATA_URL, "wmt16", DATA_MD5, "wmt16.tar.gz") - - -def convert(path, src_dict_size, trg_dict_size, src_lang): - """Converts dataset to recordio format. - """ - - paddle.dataset.common.convert( - path, - train( - src_dict_size=src_dict_size, - trg_dict_size=trg_dict_size, - src_lang=src_lang), - 1000, - "wmt16_train") - paddle.dataset.common.convert( - path, - test( - src_dict_size=src_dict_size, - trg_dict_size=trg_dict_size, - src_lang=src_lang), - 1000, - "wmt16_test") - paddle.dataset.common.convert( - path, - validation( - src_dict_size=src_dict_size, - trg_dict_size=trg_dict_size, - src_lang=src_lang), - 1000, - "wmt16_validation") -- GitLab