From 67d4d89cc411b529419bb5557228d115765bdbd2 Mon Sep 17 00:00:00 2001 From: qijun Date: Tue, 28 Mar 2017 21:34:06 +0800 Subject: [PATCH] add doc for some v2/dataset --- doc/api/v2/data.rst | 13 +++++--- doc/api/v2/run_logic.rst | 13 ++------ python/paddle/v2/data_feeder.py | 2 +- python/paddle/v2/dataset/cifar.py | 2 +- python/paddle/v2/dataset/conll05.py | 27 +++++++++++---- python/paddle/v2/dataset/imdb.py | 44 +++++++++++++++++++++++-- python/paddle/v2/dataset/imikolov.py | 34 +++++++++++++++++-- python/paddle/v2/dataset/mnist.py | 2 +- python/paddle/v2/dataset/movielens.py | 41 ++++++++++++++++++++--- python/paddle/v2/dataset/uci_housing.py | 22 ++++++++++++- python/paddle/v2/dataset/wmt14.py | 25 +++++++++++++- python/paddle/v2/event.py | 5 ++- python/paddle/v2/trainer.py | 8 ++--- 13 files changed, 197 insertions(+), 41 deletions(-) diff --git a/doc/api/v2/data.rst b/doc/api/v2/data.rst index 7fd71e743b8..69fdea79b16 100644 --- a/doc/api/v2/data.rst +++ b/doc/api/v2/data.rst @@ -49,7 +49,6 @@ mnist :members: :noindex: - cifar +++++ @@ -61,7 +60,7 @@ conll05 +++++++ .. automodule:: paddle.v2.dataset.conll05 - :members: + :members: get_dict,get_embedding,test :noindex: imdb @@ -79,12 +78,18 @@ imikolov :noindex: movielens -+++++++++ ++++++++++ .. automodule:: paddle.v2.dataset.movielens :members: :noindex: +.. autoclass:: paddle.v2.dataset.movielens.MovieInfo + :noindex: + +.. autoclass:: paddle.v2.dataset.movielens.UserInfo + :noindex: + sentiment +++++++++ @@ -102,7 +107,7 @@ uci_housing wmt14 +++++ -.. automodule:: paddle.v2.dataset.uci_housing +.. automodule:: paddle.v2.dataset.wmt14 :members: :noindex: diff --git a/doc/api/v2/run_logic.rst b/doc/api/v2/run_logic.rst index 9088e30b09f..1b3d23d1e8f 100644 --- a/doc/api/v2/run_logic.rst +++ b/doc/api/v2/run_logic.rst @@ -13,25 +13,18 @@ Trainer ======= .. automodule:: paddle.v2.trainer - :members: Trainer + :members: SGD :noindex: Event ===== .. automodule:: paddle.v2.event - :members: Event + :members: :noindex: Inference ========= -.. automodule:: paddle.v2.inference - :members: Inference - :noindex: - .. autofunction:: paddle.v2.infer - :members: - :noindex: - - + :noindex: \ No newline at end of file diff --git a/python/paddle/v2/data_feeder.py b/python/paddle/v2/data_feeder.py index bda8e22fd28..e3aac452a54 100644 --- a/python/paddle/v2/data_feeder.py +++ b/python/paddle/v2/data_feeder.py @@ -52,7 +52,7 @@ class DataFeeder(DataProviderConverter): # [ [1.0,2.0,3.0,4.0], 5, [6,7,8] ], # first sample # [ [1.0,2.0,3.0,4.0], 5, [6,7,8] ] # second sample # ] - arg = feeder(minibatch_data) + arg = feeder.convert(minibatch_data) .. note:: diff --git a/python/paddle/v2/dataset/cifar.py b/python/paddle/v2/dataset/cifar.py index d8554d4d8e5..adcf8fbe763 100644 --- a/python/paddle/v2/dataset/cifar.py +++ b/python/paddle/v2/dataset/cifar.py @@ -15,7 +15,7 @@ CIFAR dataset. This module will download dataset from https://www.cs.toronto.edu/~kriz/cifar.html and -parse train set and test set into paddle reader creators. +parse train/test set into paddle reader creators. The CIFAR-10 dataset consists of 60000 32x32 colour images in 10 classes, with 6000 images per class. There are 50000 training images and 10000 test images. 
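+
+A minimal usage sketch (illustrative only; it assumes this module exposes
+train10()/test10() reader creators for CIFAR-10, following the paddle v2
+dataset convention)::
+
+    import paddle.v2.dataset.cifar as cifar
+
+    # calling the reader creator yields (image, label) samples
+    for image, label in cifar.train10()():
+        break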
diff --git a/python/paddle/v2/dataset/conll05.py b/python/paddle/v2/dataset/conll05.py
index 854b20f0c35..d9ea2d027fb 100644
--- a/python/paddle/v2/dataset/conll05.py
+++ b/python/paddle/v2/dataset/conll05.py
@@ -12,12 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
-Conll 2005 dataset.  Paddle semantic role labeling Book and demo use this
-dataset as an example. Because Conll 2005 is not free in public, the default
-downloaded URL is test set of Conll 2005 (which is public). Users can change
-URL and MD5 to their Conll dataset.
-
-TODO(yuyang18): Complete comments.
+Conll05 dataset.
+Paddle semantic role labeling Book and demo use this dataset as an example.
+Because Conll05 is not freely available to the public, the default download
+URL is the Conll05 test set (which is public). Users can change the URL and
+MD5 to point to their own Conll dataset. A pre-trained word vector model
+based on the Wikipedia corpus is used to initialize the SRL model.
 """

 import tarfile
@@ -180,6 +179,9 @@


 def get_dict():
+    """
+    Get the word, verb and label dictionaries of the Wikipedia corpus.
+    """
     word_dict = load_dict(download(WORDDICT_URL, 'conll05st', WORDDICT_MD5))
     verb_dict = load_dict(download(VERBDICT_URL, 'conll05st', VERBDICT_MD5))
     label_dict = load_dict(download(TRGDICT_URL, 'conll05st', TRGDICT_MD5))
@@ -187,10 +189,23 @@


 def get_embedding():
+    """
+    Get the word vectors pre-trained on the Wikipedia corpus.
+    """
     return download(EMB_URL, 'conll05st', EMB_MD5)


 def test():
+    """
+    Conll05 test set creator.
+
+    Because the training dataset is not freely available, the test dataset
+    is used for training. It returns a reader creator; each sample in the
+    reader holds nine features, including the sentence sequence, predicate,
+    predicate context, predicate context flag and the tagged sequence.
+
+    :return: Train reader creator
+    :rtype: callable
+    """
     word_dict, verb_dict, label_dict = get_dict()
     reader = corpus_reader(
         download(DATA_URL, 'conll05st', DATA_MD5),
diff --git a/python/paddle/v2/dataset/imdb.py b/python/paddle/v2/dataset/imdb.py
index 5284017ce08..e363e21d953 100644
--- a/python/paddle/v2/dataset/imdb.py
+++ b/python/paddle/v2/dataset/imdb.py
@@ -12,9 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
-IMDB dataset: http://ai.stanford.edu/%7Eamaas/data/sentiment/aclImdb_v1.tar.gz
-
-TODO(yuyang18): Complete comments.
+IMDB dataset.
+
+This module downloads the IMDB dataset from
+http://ai.stanford.edu/%7Eamaas/data/sentiment/, which contains a set of
+25,000 highly polar movie reviews for training and 25,000 for testing.
+Besides, this module also provides an API for building the word dictionary
+and parsing the train/test set into paddle reader creators.
 """

 import paddle.v2.dataset.common
@@ -30,8 +34,11 @@
 URL = 'http://ai.stanford.edu/%7Eamaas/data/sentiment/aclImdb_v1.tar.gz'
 MD5 = '7c2ac02c03563afcf9b574c7e56c153a'

-# Read files that match pattern.  Tokenize and yield each file.
 def tokenize(pattern):
+    """
+    Read files that match the pattern. Tokenize and yield each file.
+    """
+
     with tarfile.open(paddle.v2.dataset.common.download(URL, 'imdb',
                                                         MD5)) as tarf:
         # Note that we should use tarfile.next(), which does
@@ -48,6 +55,9 @@


 def build_dict(pattern, cutoff):
+    """
+    Build a word dictionary; the key is a word and the value is its index.
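+
+    A minimal usage sketch (illustrative only; the pattern and cutoff below
+    mirror the ones word_dict() passes to this function later in this
+    module)::
+
+        import re
+        import paddle.v2.dataset.imdb as imdb
+
+        # the cutoff (150 here) is assumed to drop low-frequency words
+        word_idx = imdb.build_dict(
+            re.compile("aclImdb/((train)|(test))/((pos)|(neg))/.*\.txt$"), 150)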
+ """ word_freq = {} for doc in tokenize(pattern): for word in doc: @@ -109,18 +119,46 @@ def reader_creator(pos_pattern, neg_pattern, word_idx, buffer_size): def train(word_idx): + """ + IMDB train set creator. + + It returns a reader creator, each sample in the reader is an index + sequence and label in [0, 1]. + + :param word_idx: word dictionary + :type word_idx: dict + :return: Train reader creator + :rtype: callable + """ return reader_creator( re.compile("aclImdb/train/pos/.*\.txt$"), re.compile("aclImdb/train/neg/.*\.txt$"), word_idx, 1000) def test(word_idx): + """ + IMDB test set creator. + + It returns a reader creator, each sample in the reader is an index + sequence and label in [0, 1]. + + :param word_idx: word dictionary + :type word_idx: dict + :return: Test reader creator + :rtype: callable + """ return reader_creator( re.compile("aclImdb/test/pos/.*\.txt$"), re.compile("aclImdb/test/neg/.*\.txt$"), word_idx, 1000) def word_dict(): + """ + Build word dictionary. + + :return: Word dictionary + :rtype: dict + """ return build_dict( re.compile("aclImdb/((train)|(test))/((pos)|(neg))/.*\.txt$"), 150) diff --git a/python/paddle/v2/dataset/imikolov.py b/python/paddle/v2/dataset/imikolov.py index 2931d06e7eb..6de5abe1079 100644 --- a/python/paddle/v2/dataset/imikolov.py +++ b/python/paddle/v2/dataset/imikolov.py @@ -12,9 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -imikolov's simple dataset: http://www.fit.vutbr.cz/~imikolov/rnnlm/ +imikolov's simple dataset. -Complete comments. +This module will download dataset from http://www.fit.vutbr.cz/~imikolov/rnnlm/ and +parse train/test set into paddle reader creators. """ import paddle.v2.dataset.common import tarfile @@ -40,6 +41,9 @@ def word_count(f, word_freq=None): def build_dict(): + """ + Build a word dictionary, the key is word, and the value is index. + """ train_filename = './simple-examples/data/ptb.train.txt' test_filename = './simple-examples/data/ptb.valid.txt' with tarfile.open( @@ -84,10 +88,36 @@ def reader_creator(filename, word_idx, n): def train(word_idx, n): + """ + imikolov train set creator. + + It returns a reader creator, each sample in the reader is an index + tuple. + + :param word_idx: word dictionary + :type word_idx: dict + :param n: sliding window size + :type n: int + :return: Train reader creator + :rtype: callable + """ return reader_creator('./simple-examples/data/ptb.train.txt', word_idx, n) def test(word_idx, n): + """ + imikolov test set creator. + + It returns a reader creator, each sample in the reader is an index + tuple. + + :param word_idx: word dictionary + :type word_idx: dict + :param n: sliding window size + :type n: int + :return: Train reader creator + :rtype: callable + """ return reader_creator('./simple-examples/data/ptb.valid.txt', word_idx, n) diff --git a/python/paddle/v2/dataset/mnist.py b/python/paddle/v2/dataset/mnist.py index 48a39b5493a..2d6b3e376be 100644 --- a/python/paddle/v2/dataset/mnist.py +++ b/python/paddle/v2/dataset/mnist.py @@ -15,7 +15,7 @@ MNIST dataset. This module will download dataset from http://yann.lecun.com/exdb/mnist/ and -parse train set and test set into paddle reader creators. +parse train/test set into paddle reader creators. 
""" import paddle.v2.dataset.common import subprocess diff --git a/python/paddle/v2/dataset/movielens.py b/python/paddle/v2/dataset/movielens.py index e304c986ba9..571868d8356 100644 --- a/python/paddle/v2/dataset/movielens.py +++ b/python/paddle/v2/dataset/movielens.py @@ -14,11 +14,11 @@ """ Movielens 1-M dataset. -GroupLens Research collected and made available rating data sets from the -MovieLens web site (http://movielens.org). Movielens 1-M dataset contains 1 million -ratings from 6000 users on 4000 movies. +Movielens 1-M dataset contains 1 million ratings from 6000 users on 4000 movies, which was +collected by GroupLens Research. This module will download Movielens 1-M dataset from +http://files.grouplens.org/datasets/movielens/ml-1m.zip and parse train/test set +into paddle reader creators. -TODO(yuyang18): Complete comments. """ import zipfile @@ -39,12 +39,18 @@ MD5 = 'c4d9eecfca2ab87c1945afe126590906' class MovieInfo(object): + """ + Movie id, title and categories information are stored in MovieInfo. + """ def __init__(self, index, categories, title): self.index = int(index) self.categories = categories self.title = title def value(self): + """ + Get information of a movie. + """ return [ self.index, [CATEGORIES_DICT[c] for c in self.categories], [MOVIE_TITLE_DICT[w.lower()] for w in self.title.split()] @@ -59,6 +65,9 @@ class MovieInfo(object): class UserInfo(object): + """ + User id, gender, age, and job information are stored in UserInfo. + """ def __init__(self, index, gender, age, job_id): self.index = int(index) self.is_male = gender == 'M' @@ -66,6 +75,9 @@ class UserInfo(object): self.job_id = int(job_id) def value(self): + """ + Get information of a user. + """ return [self.index, 0 if self.is_male else 1, self.age, self.job_id] def __str__(self): @@ -152,6 +164,9 @@ test = functools.partial(__reader_creator__, is_test=True) def get_movie_title_dict(): + """ + Get movie title dictionary. + """ __initialize_meta_info__() return MOVIE_TITLE_DICT @@ -164,11 +179,17 @@ def __max_index_info__(a, b): def max_movie_id(): + """ + Get the maximum value of movie id. + """ __initialize_meta_info__() return reduce(__max_index_info__, MOVIE_INFO.viewvalues()).index def max_user_id(): + """ + Get the maximum value of user id. + """ __initialize_meta_info__() return reduce(__max_index_info__, USER_INFO.viewvalues()).index @@ -181,21 +202,33 @@ def __max_job_id_impl__(a, b): def max_job_id(): + """ + Get the maximum value of job id. + """ __initialize_meta_info__() return reduce(__max_job_id_impl__, USER_INFO.viewvalues()).job_id def movie_categories(): + """ + Get movie categoriges dictionary. + """ __initialize_meta_info__() return CATEGORIES_DICT def user_info(): + """ + Get user info dictionary. + """ __initialize_meta_info__() return USER_INFO def movie_info(): + """ + Get movie info dictionary. + """ __initialize_meta_info__() return MOVIE_INFO diff --git a/python/paddle/v2/dataset/uci_housing.py b/python/paddle/v2/dataset/uci_housing.py index dab8620441c..57dc4d223bb 100644 --- a/python/paddle/v2/dataset/uci_housing.py +++ b/python/paddle/v2/dataset/uci_housing.py @@ -14,7 +14,9 @@ """ UCI Housing dataset. -TODO(yuyang18): Complete comments. +This module will download dataset from +https://archive.ics.uci.edu/ml/machine-learning-databases/housing/ and +parse train/test set into paddle reader creators. """ import numpy as np @@ -70,6 +72,15 @@ def load_data(filename, feature_num=14, ratio=0.8): def train(): + """ + UCI_HOUSING train set creator. 
+
+    :return: Train reader creator
+    :rtype: callable
+    """
     global UCI_TRAIN_DATA
     load_data(download(URL, 'uci_housing', MD5))
@@ -81,6 +92,15 @@


 def test():
+    """
+    UCI housing test set creator.
+
+    It returns a reader creator; each sample in the reader is a pair of the
+    normalized feature vector and the corresponding house price.
+
+    :return: Test reader creator
+    :rtype: callable
+    """
     global UCI_TEST_DATA
     load_data(download(URL, 'uci_housing', MD5))
diff --git a/python/paddle/v2/dataset/wmt14.py b/python/paddle/v2/dataset/wmt14.py
index ee63a93f5ad..48c39547fd1 100644
--- a/python/paddle/v2/dataset/wmt14.py
+++ b/python/paddle/v2/dataset/wmt14.py
@@ -12,7 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
-wmt14 dataset
+WMT14 dataset.
+The original WMT14 dataset is too large, so a shrunken subset is provided
+instead. This module will download the subset from
+http://paddlepaddle.cdn.bcebos.com/demo/wmt_shrinked_data/wmt14.tgz and
+parse the train/test set into paddle reader creators.
+
 """
 import tarfile

@@ -94,11 +99,29 @@


 def train(dict_size):
+    """
+    WMT14 train set creator.
+
+    It returns a reader creator; each sample in the reader is a triple of the
+    source language word index sequence, the target language word index
+    sequence and the next-word index sequence.
+
+    :param dict_size: size of the word dictionary
+    :type dict_size: int
+    :return: Train reader creator
+    :rtype: callable
+    """
     return reader_creator(
         download(URL_TRAIN, 'wmt14', MD5_TRAIN), 'train/train', dict_size)


 def test(dict_size):
+    """
+    WMT14 test set creator.
+
+    It returns a reader creator; each sample in the reader is a triple of the
+    source language word index sequence, the target language word index
+    sequence and the next-word index sequence.
+
+    :param dict_size: size of the word dictionary
+    :type dict_size: int
+    :return: Test reader creator
+    :rtype: callable
+    """
     return reader_creator(
         download(URL_TRAIN, 'wmt14', MD5_TRAIN), 'test/test', dict_size)
diff --git a/python/paddle/v2/event.py b/python/paddle/v2/event.py
index 1ad52b8baa4..fd6050fa339 100644
--- a/python/paddle/v2/event.py
+++ b/python/paddle/v2/event.py
@@ -1,14 +1,13 @@
 """
-All training events.
+Testing and training events.

 There are:

+* TestResult
 * BeginIteration
 * EndIteration
 * BeginPass
 * EndPass
-
-TODO(yuyang18): Complete it!
 """

 import py_paddle.swig_paddle as api
diff --git a/python/paddle/v2/trainer.py b/python/paddle/v2/trainer.py
index 1a7b6790ac4..265f031532f 100644
--- a/python/paddle/v2/trainer.py
+++ b/python/paddle/v2/trainer.py
@@ -1,3 +1,6 @@
+"""
+Trainer package.
+"""
 import collections

 import py_paddle.swig_paddle as api
@@ -9,10 +12,7 @@
 from . import optimizer as v2_optimizer
 from . import parameters as v2_parameters

 __all__ = ['SGD']
-"""
-Trainer package
-
-TODO(yuyang18): Complete comments.
-"""
+


 def default_event_handler(event):
--
GitLab