Unverified commit 807c7a47, authored by Zeng Jinle, committed by GitHub

remove recordio convert in dataset, test=develop (#19387)

Parent 11070cbf
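Every wrapper deleted below has the same shape: a module-level convert(path) that passes the module's reader to paddle.dataset.common.convert together with the constant 1000 and an output name prefix. A minimal sketch of that pattern, mirroring the removed mnist block further down; it illustrates what callers of the old API invoked, not a surviving interface:

# Shape of the per-dataset wrapper removed by this commit
# (mirrors the deleted paddle.dataset.mnist.convert shown in the diff below).
import paddle.dataset.common
import paddle.dataset.mnist as mnist

def convert(path):
    """Converts the MNIST readers to recordio format (removed by this commit)."""
    paddle.dataset.common.convert(path, mnist.train(), 1000, "minist_train")
    paddle.dataset.common.convert(path, mnist.test(), 1000, "minist_test")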
@@ -37,7 +37,7 @@ import tarfile
 import six
 from six.moves import cPickle as pickle
-__all__ = ['train100', 'test100', 'train10', 'test10', 'convert']
+__all__ = ['train100', 'test100', 'train10', 'test10']
 URL_PREFIX = 'https://www.cs.toronto.edu/~kriz/'
 CIFAR10_URL = URL_PREFIX + 'cifar-10-python.tar.gz'
@@ -144,13 +144,3 @@ def test10(cycle=False):
 def fetch():
     paddle.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5)
     paddle.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5)
-def convert(path):
-    """
-    Converts dataset to recordio format
-    """
-    paddle.dataset.common.convert(path, train100(), 1000, "cifar_train100")
-    paddle.dataset.common.convert(path, test100(), 1000, "cifar_test100")
-    paddle.dataset.common.convert(path, train10(), 1000, "cifar_train10")
-    paddle.dataset.common.convert(path, test10(), 1000, "cifar_test10")
@@ -119,20 +119,6 @@ def fetch_all():
                 "fetch")()
-def fetch_all_recordio(path):
-    for module_name in [
-            x for x in dir(paddle.dataset) if not x.startswith("__")
-    ]:
-        if "convert" in dir(
-                importlib.import_module("paddle.dataset.%s" % module_name)) and \
-                not module_name == "common":
-            ds_path = os.path.join(path, module_name)
-            must_mkdirs(ds_path)
-            getattr(
-                importlib.import_module("paddle.dataset.%s" % module_name),
-                "convert")(ds_path)
 def split(reader, line_count, suffix="%05d.pickle", dumper=pickle.dump):
     """
     you can call the function as:
@@ -29,7 +29,7 @@ import paddle.dataset.common
 import paddle.compat as cpt
 from six.moves import zip, range
-__all__ = ['test, get_dict', 'get_embedding', 'convert']
+__all__ = ['test, get_dict', 'get_embedding']
 DATA_URL = 'http://paddlemodels.bj.bcebos.com/conll05st/conll05st-tests.tar.gz'
 DATA_MD5 = '387719152ae52d60422c016e92a742fc'
@@ -248,11 +248,3 @@ def fetch():
     paddle.dataset.common.download(TRGDICT_URL, 'conll05st', TRGDICT_MD5)
     paddle.dataset.common.download(EMB_URL, 'conll05st', EMB_MD5)
     paddle.dataset.common.download(DATA_URL, 'conll05st', DATA_MD5)
-def convert(path):
-    """
-    Converts dataset to recordio format
-    """
-    paddle.dataset.common.convert(path, test(), 1000, "conl105_train")
-    paddle.dataset.common.convert(path, test(), 1000, "conl105_test")
@@ -29,7 +29,7 @@ import re
 import string
 import six
-__all__ = ['build_dict', 'train', 'test', 'convert']
+__all__ = ['build_dict', 'train', 'test']
 URL = 'http://ai.stanford.edu/%7Eamaas/data/sentiment/aclImdb_v1.tar.gz'
 MD5 = '7c2ac02c03563afcf9b574c7e56c153a'
@@ -140,12 +140,3 @@ def word_dict():
 def fetch():
     paddle.dataset.common.download(URL, 'imdb', MD5)
-def convert(path):
-    """
-    Converts dataset to recordio format
-    """
-    w = word_dict()
-    paddle.dataset.common.convert(path, lambda: train(w), 1000, "imdb_train")
-    paddle.dataset.common.convert(path, lambda: test(w), 1000, "imdb_test")
@@ -26,7 +26,7 @@ import collections
 import tarfile
 import six
-__all__ = ['train', 'test', 'build_dict', 'convert']
+__all__ = ['train', 'test', 'build_dict']
 URL = 'http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz'
 MD5 = '30177ea32e27c525793142b6bf2c8e2d'
@@ -152,15 +152,3 @@ def test(word_idx, n, data_type=DataType.NGRAM):
 def fetch():
     paddle.dataset.common.download(URL, "imikolov", MD5)
-def convert(path):
-    """
-    Converts dataset to recordio format
-    """
-    N = 5
-    word_dict = build_dict()
-    paddle.dataset.common.convert(path,
-                                  train(word_dict, N), 1000, "imikolov_train")
-    paddle.dataset.common.convert(path,
-                                  test(word_dict, N), 1000, "imikolov_test")
@@ -25,7 +25,7 @@ import gzip
 import numpy
 import struct
 from six.moves import range
-__all__ = ['train', 'test', 'convert']
+__all__ = ['train', 'test']
 URL_PREFIX = 'https://dataset.bj.bcebos.com/mnist/'
 TEST_IMAGE_URL = URL_PREFIX + 't10k-images-idx3-ubyte.gz'
@@ -126,11 +126,3 @@ def fetch():
     paddle.dataset.common.download(TRAIN_LABEL_URL, 'mnist', TRAIN_LABEL_MD5)
     paddle.dataset.common.download(TEST_IMAGE_URL, 'mnist', TEST_IMAGE_MD5)
     paddle.dataset.common.download(TEST_LABEL_URL, 'mnist', TEST_LABEL_MD5)
-def convert(path):
-    """
-    Converts dataset to recordio format
-    """
-    paddle.dataset.common.convert(path, train(), 1000, "minist_train")
-    paddle.dataset.common.convert(path, test(), 1000, "minist_test")
@@ -35,8 +35,7 @@ import paddle.compat as cpt
 __all__ = [
     'train', 'test', 'get_movie_title_dict', 'max_movie_id', 'max_user_id',
-    'age_table', 'movie_categories', 'max_job_id', 'user_info', 'movie_info',
-    'convert'
+    'age_table', 'movie_categories', 'max_job_id', 'user_info', 'movie_info'
 ]
 age_table = [1, 18, 25, 35, 45, 50, 56]
@@ -259,13 +258,5 @@ def fetch():
     paddle.dataset.common.download(URL, "movielens", MD5)
-def convert(path):
-    """
-    Converts dataset to recordio format
-    """
-    paddle.dataset.common.convert(path, train(), 1000, "movielens_train")
-    paddle.dataset.common.convert(path, test(), 1000, "movielens_test")
 if __name__ == '__main__':
     unittest()
@@ -31,7 +31,7 @@ from nltk.corpus import movie_reviews
 import paddle.dataset.common
-__all__ = ['train', 'test', 'get_word_dict', 'convert']
+__all__ = ['train', 'test', 'get_word_dict']
 NUM_TRAINING_INSTANCES = 1600
 NUM_TOTAL_INSTANCES = 2000
@@ -134,11 +134,3 @@ def test():
 def fetch():
     nltk.download('movie_reviews', download_dir=paddle.dataset.common.DATA_HOME)
-def convert(path):
-    """
-    Converts dataset to recordio format
-    """
-    paddle.dataset.common.convert(path, train, 1000, "sentiment_train")
-    paddle.dataset.common.convert(path, test, 1000, "sentiment_test")
@@ -34,7 +34,7 @@ URL = 'http://paddlemodels.bj.bcebos.com/uci_housing/housing.data'
 MD5 = 'd4accdce7a25600298819f8e28e8d593'
 feature_names = [
     'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
-    'PTRATIO', 'B', 'LSTAT', 'convert'
+    'PTRATIO', 'B', 'LSTAT'
 ]
 UCI_TRAIN_DATA = None
@@ -147,11 +147,3 @@ def predict_reader():
 def fetch():
     paddle.dataset.common.download(URL, 'uci_housing', MD5)
-def convert(path):
-    """
-    Converts dataset to recordio format
-    """
-    paddle.dataset.common.convert(path, train(), 1000, "uci_housing_train")
-    paddle.dataset.common.convert(path, test(), 1000, "uci_houseing_test")
@@ -33,7 +33,6 @@ __all__ = [
     'train',
     'test',
     'get_dict',
-    'convert',
 ]
 URL_DEV_TEST = ('http://www-lium.univ-lemans.fr/~schwenk/'
@@ -167,12 +166,3 @@ def get_dict(dict_size, reverse=True):
 def fetch():
     paddle.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN)
     paddle.dataset.common.download(URL_MODEL, 'wmt14', MD5_MODEL)
-def convert(path):
-    """
-    Converts dataset to recordio format
-    """
-    dict_size = 30000
-    paddle.dataset.common.convert(path, train(dict_size), 1000, "wmt14_train")
-    paddle.dataset.common.convert(path, test(dict_size), 1000, "wmt14_test")
@@ -43,7 +43,6 @@ __all__ = [
     "train",
     "test",
     "validation",
-    "convert",
     "fetch",
     "get_dict",
 ]
@@ -325,33 +324,3 @@ def fetch():
     """
     paddle.v4.dataset.common.download(DATA_URL, "wmt16", DATA_MD5,
                                       "wmt16.tar.gz")
-def convert(path, src_dict_size, trg_dict_size, src_lang):
-    """Converts dataset to recordio format.
-    """
-    paddle.dataset.common.convert(
-        path,
-        train(
-            src_dict_size=src_dict_size,
-            trg_dict_size=trg_dict_size,
-            src_lang=src_lang),
-        1000,
-        "wmt16_train")
-    paddle.dataset.common.convert(
-        path,
-        test(
-            src_dict_size=src_dict_size,
-            trg_dict_size=trg_dict_size,
-            src_lang=src_lang),
-        1000,
-        "wmt16_test")
-    paddle.dataset.common.convert(
-        path,
-        validation(
-            src_dict_size=src_dict_size,
-            trg_dict_size=trg_dict_size,
-            src_lang=src_lang),
-        1000,
-        "wmt16_validation")