From e915aa9cf1784a82dce2b8cd0b77486c1219f6c3 Mon Sep 17 00:00:00 2001
From: Your Name
Date: Mon, 26 Jun 2017 20:27:07 +0800
Subject: [PATCH] fix bugs

---
 python/paddle/v2/dataset/cifar.py       |  2 +-
 python/paddle/v2/dataset/conll05.py     |  4 ++--
 python/paddle/v2/dataset/imdb.py        | 12 +++++-------
 python/paddle/v2/dataset/imikolov.py    | 12 +++++++-----
 python/paddle/v2/dataset/mnist.py       |  2 +-
 python/paddle/v2/dataset/movielens.py   |  3 ++-
 python/paddle/v2/dataset/sentiment.py   |  2 +-
 python/paddle/v2/dataset/uci_housing.py |  2 +-
 python/paddle/v2/dataset/wmt14.py       |  2 +-
 9 files changed, 21 insertions(+), 20 deletions(-)

diff --git a/python/paddle/v2/dataset/cifar.py b/python/paddle/v2/dataset/cifar.py
index 95984d980d..f885b2834e 100644
--- a/python/paddle/v2/dataset/cifar.py
+++ b/python/paddle/v2/dataset/cifar.py
@@ -34,7 +34,7 @@ import numpy
 import paddle.v2.dataset.common
 import tarfile
 
-__all__ = ['train100', 'test100', 'train10', 'test10']
+__all__ = ['train100', 'test100', 'train10', 'test10', 'convert']
 
 URL_PREFIX = 'https://www.cs.toronto.edu/~kriz/'
 CIFAR10_URL = URL_PREFIX + 'cifar-10-python.tar.gz'
diff --git a/python/paddle/v2/dataset/conll05.py b/python/paddle/v2/dataset/conll05.py
index d4c2276b1b..f8aae52e7c 100644
--- a/python/paddle/v2/dataset/conll05.py
+++ b/python/paddle/v2/dataset/conll05.py
@@ -25,7 +25,7 @@ import gzip
 import itertools
 import paddle.v2.dataset.common
 
-__all__ = ['test, get_dict', 'get_embedding']
+__all__ = ['test', 'get_dict', 'get_embedding', 'convert']
 
 DATA_URL = 'http://www.cs.upc.edu/~srlconll/conll05st-tests.tar.gz'
 DATA_MD5 = '387719152ae52d60422c016e92a742fc'
@@ -229,7 +229,7 @@ def fetch():
     paddle.v2.dataset.common.download(DATA_URL, 'conll05st', DATA_MD5)
 
 
-def convert():
+def convert(path):
     """
     Converts dataset to recordio format
     """
diff --git a/python/paddle/v2/dataset/imdb.py b/python/paddle/v2/dataset/imdb.py
index d939bc3065..c0ec5992e0 100644
--- a/python/paddle/v2/dataset/imdb.py
+++ b/python/paddle/v2/dataset/imdb.py
@@ -28,7 +28,7 @@ import re
 import string
 import threading
 
-__all__ = ['build_dict', 'train', 'test']
+__all__ = ['build_dict', 'train', 'test', 'convert']
 
 URL = 'http://ai.stanford.edu/%7Eamaas/data/sentiment/aclImdb_v1.tar.gz'
 MD5 = '7c2ac02c03563afcf9b574c7e56c153a'
@@ -168,12 +168,10 @@ def fetch():
     paddle.v2.dataset.common.download(URL, 'imdb', MD5)
 
 
-def convert():
+def convert(path):
     """
     Converts dataset to recordio format
     """
-    word_dict = ds.imdb.word_dict()
-    paddle.v2.dataset.common.convert(path, lambda: train(word_dict), 10,
-                                     "imdb_train")
-    paddle.v2.dataset.common.convert(path, lambda: test(word_dict), 10,
-                                     "imdb_test")
+    w = word_dict()
+    paddle.v2.dataset.common.convert(path, lambda: train(w), 10, "imdb_train")
+    paddle.v2.dataset.common.convert(path, lambda: test(w), 10, "imdb_test")
diff --git a/python/paddle/v2/dataset/imikolov.py b/python/paddle/v2/dataset/imikolov.py
index 034f58c2c8..b18ee8e9ba 100644
--- a/python/paddle/v2/dataset/imikolov.py
+++ b/python/paddle/v2/dataset/imikolov.py
@@ -18,11 +18,11 @@ This module will download dataset from
 http://www.fit.vutbr.cz/~imikolov/rnnlm/ and parse training set and test set
 into paddle reader creators.
""" -import paddle.v2.dataset.common as common +import paddle.v2.dataset.common import collections import tarfile -__all__ = ['train', 'test', 'build_dict'] +__all__ = ['train', 'test', 'build_dict', 'convert'] URL = 'http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz' MD5 = '30177ea32e27c525793142b6bf2c8e2d' @@ -145,7 +145,7 @@ def test(word_idx, n, data_type=DataType.NGRAM): def fetch(): - common.download(URL, "imikolov", MD5) + paddle.v2.dataset.common.download(URL, "imikolov", MD5) def convert(path): @@ -154,5 +154,7 @@ def convert(path): """ N = 5 word_dict = build_dict() - common.convert(path, train(word_dict, N), 10, "imikolov_train") - common.convert(path, test(word_dict, N), 10, "imikolov_test") + paddle.v2.dataset.common.convert(path, + train(word_dict, N), 10, "imikolov_train") + paddle.v2.dataset.common.convert(path, + test(word_dict, N), 10, "imikolov_test") diff --git a/python/paddle/v2/dataset/mnist.py b/python/paddle/v2/dataset/mnist.py index 92d7f69b8d..ea5891f4f3 100644 --- a/python/paddle/v2/dataset/mnist.py +++ b/python/paddle/v2/dataset/mnist.py @@ -21,7 +21,7 @@ import paddle.v2.dataset.common import subprocess import numpy import platform -__all__ = ['train', 'test'] +__all__ = ['train', 'test', 'convert'] URL_PREFIX = 'http://yann.lecun.com/exdb/mnist/' TEST_IMAGE_URL = URL_PREFIX + 't10k-images-idx3-ubyte.gz' diff --git a/python/paddle/v2/dataset/movielens.py b/python/paddle/v2/dataset/movielens.py index fb906cd4b6..d9372d422a 100644 --- a/python/paddle/v2/dataset/movielens.py +++ b/python/paddle/v2/dataset/movielens.py @@ -30,7 +30,8 @@ import functools __all__ = [ 'train', 'test', 'get_movie_title_dict', 'max_movie_id', 'max_user_id', - 'age_table', 'movie_categories', 'max_job_id', 'user_info', 'movie_info' + 'age_table', 'movie_categories', 'max_job_id', 'user_info', 'movie_info', + 'convert' ] age_table = [1, 18, 25, 35, 45, 50, 56] diff --git a/python/paddle/v2/dataset/sentiment.py b/python/paddle/v2/dataset/sentiment.py index 89683c2063..e33f120c87 100644 --- a/python/paddle/v2/dataset/sentiment.py +++ b/python/paddle/v2/dataset/sentiment.py @@ -28,7 +28,7 @@ from nltk.corpus import movie_reviews import paddle.v2.dataset.common -__all__ = ['train', 'test', 'get_word_dict'] +__all__ = ['train', 'test', 'get_word_dict', 'convert'] NUM_TRAINING_INSTANCES = 1600 NUM_TOTAL_INSTANCES = 2000 diff --git a/python/paddle/v2/dataset/uci_housing.py b/python/paddle/v2/dataset/uci_housing.py index 9e15000c02..c715ea9681 100644 --- a/python/paddle/v2/dataset/uci_housing.py +++ b/python/paddle/v2/dataset/uci_housing.py @@ -29,7 +29,7 @@ URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing MD5 = 'd4accdce7a25600298819f8e28e8d593' feature_names = [ 'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', - 'PTRATIO', 'B', 'LSTAT' + 'PTRATIO', 'B', 'LSTAT', 'convert' ] UCI_TRAIN_DATA = None diff --git a/python/paddle/v2/dataset/wmt14.py b/python/paddle/v2/dataset/wmt14.py index f29c9275f0..e1dc4f4c30 100644 --- a/python/paddle/v2/dataset/wmt14.py +++ b/python/paddle/v2/dataset/wmt14.py @@ -25,7 +25,7 @@ import gzip import paddle.v2.dataset.common from paddle.v2.parameters import Parameters -__all__ = ['train', 'test', 'build_dict'] +__all__ = ['train', 'test', 'build_dict', 'convert'] URL_DEV_TEST = 'http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/dev+test.tgz' MD5_DEV_TEST = '7d7897317ddd8ba0ae5c5fa7248d3ff5' -- GitLab