remove recordio convert in dataset, test=develop (#19387)

807c7a47 · Zeng Jinle · GitHub · 11070cbf · 807c7a47 · 807c7a47
11 changed files
--- a/python/paddle/dataset/cifar.py
+++ b/python/paddle/dataset/cifar.py
@@ -37,7 +37,7 @@ import tarfile
 import six
 from six.moves import cPickle as pickle
-__all__ = ['train100', 'test100', 'train10', 'test10', 'convert']
+__all__ = ['train100', 'test100', 'train10', 'test10']
 URL_PREFIX = 'https://www.cs.toronto.edu/~kriz/'
 CIFAR10_URL = URL_PREFIX + 'cifar-10-python.tar.gz'
@@ -144,13 +144,3 @@ def test10(cycle=False):
 def fetch():
    paddle.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5)
    paddle.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5)
-def convert(path):
-    """
-    Converts dataset to recordio format
-    """
-    paddle.dataset.common.convert(path, train100(), 1000, "cifar_train100")
-    paddle.dataset.common.convert(path, test100(), 1000, "cifar_test100")
-    paddle.dataset.common.convert(path, train10(), 1000, "cifar_train10")
-    paddle.dataset.common.convert(path, test10(), 1000, "cifar_test10")
--- a/python/paddle/dataset/common.py
+++ b/python/paddle/dataset/common.py
@@ -119,20 +119,6 @@ def fetch_all():
                "fetch")()
-def fetch_all_recordio(path):
-    for module_name in [
-            x for x in dir(paddle.dataset) if not x.startswith("__")
-    ]:
-        if "convert" in dir(
-                importlib.import_module("paddle.dataset.%s" % module_name)) and \
-                not module_name == "common":
-            ds_path = os.path.join(path, module_name)
-            must_mkdirs(ds_path)
-            getattr(
-                importlib.import_module("paddle.dataset.%s" % module_name),
-                "convert")(ds_path)
 def split(reader, line_count, suffix="%05d.pickle", dumper=pickle.dump):
    """
    you can call the function as:

--- a/python/paddle/dataset/conll05.py
+++ b/python/paddle/dataset/conll05.py
@@ -29,7 +29,7 @@ import paddle.dataset.common
 import paddle.compat as cpt
 from six.moves import zip, range
-__all__ = ['test, get_dict', 'get_embedding', 'convert']
+__all__ = ['test, get_dict', 'get_embedding']
 DATA_URL = 'http://paddlemodels.bj.bcebos.com/conll05st/conll05st-tests.tar.gz'
 DATA_MD5 = '387719152ae52d60422c016e92a742fc'
@@ -248,11 +248,3 @@ def fetch():
    paddle.dataset.common.download(TRGDICT_URL, 'conll05st', TRGDICT_MD5)
    paddle.dataset.common.download(EMB_URL, 'conll05st', EMB_MD5)
    paddle.dataset.common.download(DATA_URL, 'conll05st', DATA_MD5)
-def convert(path):
-    """
-    Converts dataset to recordio format
-    """
-    paddle.dataset.common.convert(path, test(), 1000, "conl105_train")
-    paddle.dataset.common.convert(path, test(), 1000, "conl105_test")
--- a/python/paddle/dataset/imdb.py
+++ b/python/paddle/dataset/imdb.py
@@ -29,7 +29,7 @@ import re
 import string
 import six
-__all__ = ['build_dict', 'train', 'test', 'convert']
+__all__ = ['build_dict', 'train', 'test']
 URL = 'http://ai.stanford.edu/%7Eamaas/data/sentiment/aclImdb_v1.tar.gz'
 MD5 = '7c2ac02c03563afcf9b574c7e56c153a'
@@ -140,12 +140,3 @@ def word_dict():
 def fetch():
    paddle.dataset.common.download(URL, 'imdb', MD5)
-def convert(path):
-    """
-    Converts dataset to recordio format
-    """
-    w = word_dict()
-    paddle.dataset.common.convert(path, lambda: train(w), 1000, "imdb_train")
-    paddle.dataset.common.convert(path, lambda: test(w), 1000, "imdb_test")
--- a/python/paddle/dataset/imikolov.py
+++ b/python/paddle/dataset/imikolov.py
@@ -26,7 +26,7 @@ import collections
 import tarfile
 import six
-__all__ = ['train', 'test', 'build_dict', 'convert']
+__all__ = ['train', 'test', 'build_dict']
 URL = 'http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz'
 MD5 = '30177ea32e27c525793142b6bf2c8e2d'
@@ -152,15 +152,3 @@ def test(word_idx, n, data_type=DataType.NGRAM):
 def fetch():
    paddle.dataset.common.download(URL, "imikolov", MD5)
-def convert(path):
-    """
-    Converts dataset to recordio format
-    """
-    N = 5
-    word_dict = build_dict()
-    paddle.dataset.common.convert(path,
-                                  train(word_dict, N), 1000, "imikolov_train")
-    paddle.dataset.common.convert(path,
-                                  test(word_dict, N), 1000, "imikolov_test")
--- a/python/paddle/dataset/mnist.py
+++ b/python/paddle/dataset/mnist.py
@@ -25,7 +25,7 @@ import gzip
 import numpy
 import struct
 from six.moves import range
-__all__ = ['train', 'test', 'convert']
+__all__ = ['train', 'test']
 URL_PREFIX = 'https://dataset.bj.bcebos.com/mnist/'
 TEST_IMAGE_URL = URL_PREFIX + 't10k-images-idx3-ubyte.gz'
@@ -126,11 +126,3 @@ def fetch():
    paddle.dataset.common.download(TRAIN_LABEL_URL, 'mnist', TRAIN_LABEL_MD5)
    paddle.dataset.common.download(TEST_IMAGE_URL, 'mnist', TEST_IMAGE_MD5)
    paddle.dataset.common.download(TEST_LABEL_URL, 'mnist', TEST_LABEL_MD5)
-def convert(path):
-    """
-    Converts dataset to recordio format
-    """
-    paddle.dataset.common.convert(path, train(), 1000, "minist_train")
-    paddle.dataset.common.convert(path, test(), 1000, "minist_test")
--- a/python/paddle/dataset/movielens.py
+++ b/python/paddle/dataset/movielens.py
@@ -35,8 +35,7 @@ import paddle.compat as cpt
 __all__ = [
    'train', 'test', 'get_movie_title_dict', 'max_movie_id', 'max_user_id',
-    'age_table', 'movie_categories', 'max_job_id', 'user_info', 'movie_info',
+    'age_table', 'movie_categories', 'max_job_id', 'user_info', 'movie_info'
-    'convert'
 ]
 age_table = [1, 18, 25, 35, 45, 50, 56]
@@ -259,13 +258,5 @@ def fetch():
    paddle.dataset.common.download(URL, "movielens", MD5)
-def convert(path):
-    """
-    Converts dataset to recordio format
-    """
-    paddle.dataset.common.convert(path, train(), 1000, "movielens_train")
-    paddle.dataset.common.convert(path, test(), 1000, "movielens_test")
 if __name__ == '__main__':
    unittest()
--- a/python/paddle/dataset/sentiment.py
+++ b/python/paddle/dataset/sentiment.py
@@ -31,7 +31,7 @@ from nltk.corpus import movie_reviews
 import paddle.dataset.common
-__all__ = ['train', 'test', 'get_word_dict', 'convert']
+__all__ = ['train', 'test', 'get_word_dict']
 NUM_TRAINING_INSTANCES = 1600
 NUM_TOTAL_INSTANCES = 2000
@@ -134,11 +134,3 @@ def test():
 def fetch():
    nltk.download('movie_reviews', download_dir=paddle.dataset.common.DATA_HOME)
-def convert(path):
-    """
-    Converts dataset to recordio format
-    """
-    paddle.dataset.common.convert(path, train, 1000, "sentiment_train")
-    paddle.dataset.common.convert(path, test, 1000, "sentiment_test")
--- a/python/paddle/dataset/uci_housing.py
+++ b/python/paddle/dataset/uci_housing.py
@@ -34,7 +34,7 @@ URL = 'http://paddlemodels.bj.bcebos.com/uci_housing/housing.data'
 MD5 = 'd4accdce7a25600298819f8e28e8d593'
 feature_names = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
-    'PTRATIO', 'B', 'LSTAT', 'convert'
+    'PTRATIO', 'B', 'LSTAT'
 ]
 UCI_TRAIN_DATA = None
@@ -147,11 +147,3 @@ def predict_reader():
 def fetch():
    paddle.dataset.common.download(URL, 'uci_housing', MD5)
-def convert(path):
-    """
-    Converts dataset to recordio format
-    """
-    paddle.dataset.common.convert(path, train(), 1000, "uci_housing_train")
-    paddle.dataset.common.convert(path, test(), 1000, "uci_houseing_test")
--- a/python/paddle/dataset/wmt14.py
+++ b/python/paddle/dataset/wmt14.py
@@ -33,7 +33,6 @@ __all__ = [
    'train',
    'test',
    'get_dict',
-    'convert',
 ]
 URL_DEV_TEST = ('http://www-lium.univ-lemans.fr/~schwenk/'
@@ -167,12 +166,3 @@ def get_dict(dict_size, reverse=True):
 def fetch():
    paddle.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN)
    paddle.dataset.common.download(URL_MODEL, 'wmt14', MD5_MODEL)
-def convert(path):
-    """
-    Converts dataset to recordio format
-    """
-    dict_size = 30000
-    paddle.dataset.common.convert(path, train(dict_size), 1000, "wmt14_train")
-    paddle.dataset.common.convert(path, test(dict_size), 1000, "wmt14_test")
--- a/python/paddle/dataset/wmt16.py
+++ b/python/paddle/dataset/wmt16.py
@@ -43,7 +43,6 @@ __all__ = [
    "train",
    "test",
    "validation",
-    "convert",
    "fetch",
    "get_dict",
 ]
@@ -325,33 +324,3 @@ def fetch():
    """
    paddle.v4.dataset.common.download(DATA_URL, "wmt16", DATA_MD5,
                                      "wmt16.tar.gz")
-def convert(path, src_dict_size, trg_dict_size, src_lang):
-    """Converts dataset to recordio format.
-    """
-    paddle.dataset.common.convert(
-        path,
-        train(
-            src_dict_size=src_dict_size,
-            trg_dict_size=trg_dict_size,
-            src_lang=src_lang),
-        1000,
-        "wmt16_train")
-    paddle.dataset.common.convert(
-        path,
-        test(
-            src_dict_size=src_dict_size,
-            trg_dict_size=trg_dict_size,
-            src_lang=src_lang),
-        1000,
-        "wmt16_test")
-    paddle.dataset.common.convert(
-        path,
-        validation(
-            src_dict_size=src_dict_size,
-            trg_dict_size=trg_dict_size,
-            src_lang=src_lang),
-        1000,
-        "wmt16_validation")