diff --git a/python/paddle/v2/dataset/cifar.py b/python/paddle/v2/dataset/cifar.py index 8a2642d8dd832530f64499ab60e220bd8bb0dc02..95a0eeeef5d65565a3bf6e4737dbe5b15b5e8f9a 100644 --- a/python/paddle/v2/dataset/cifar.py +++ b/python/paddle/v2/dataset/cifar.py @@ -14,14 +14,17 @@ """ CIFAR dataset. -This module will download dataset from https://www.cs.toronto.edu/~kriz/cifar.html and -parse train/test set into paddle reader creators. +This module will download dataset from +https://www.cs.toronto.edu/~kriz/cifar.html and parse train/test set into +paddle reader creators. -The CIFAR-10 dataset consists of 60000 32x32 colour images in 10 classes, with 6000 -images per class. There are 50000 training images and 10000 test images. +The CIFAR-10 dataset consists of 60000 32x32 colour images in 10 classes, +with 6000 images per class. There are 50000 training images and 10000 test +images. -The CIFAR-100 dataset is just like the CIFAR-10, except it has 100 classes containing -600 images each. There are 500 training images and 100 testing images per class. +The CIFAR-100 dataset is just like the CIFAR-10, except it has 100 classes +containing 600 images each. There are 500 training images and 100 testing +images per class. """ diff --git a/python/paddle/v2/dataset/conll05.py b/python/paddle/v2/dataset/conll05.py index 2638b63989cc4af5981280dc91c9b0c34735acb2..2da12dc8fd51b9b497678abbf50b8d9628c3bcb0 100644 --- a/python/paddle/v2/dataset/conll05.py +++ b/python/paddle/v2/dataset/conll05.py @@ -13,10 +13,11 @@ # limitations under the License. """ Conll05 dataset. -Paddle semantic role labeling Book and demo use this dataset as an example. Because -Conll05 is not free in public, the default downloaded URL is test set of -Conll05 (which is public). Users can change URL and MD5 to their Conll dataset. -And a pre-trained word vector model based on Wikipedia corpus is used to initialize SRL model. +Paddle semantic role labeling Book and demo use this dataset as an example. 
+Because Conll05 is not freely available to the public, the default download +URL is the test set of Conll05 (which is public). Users can change URL and +MD5 to their Conll dataset. And a pre-trained word vector model based on +Wikipedia corpus is used to initialize SRL model. """ import tarfile @@ -198,9 +199,10 @@ def test(): """ Conll05 test set creator. - Because the train dataset is not free, the test dataset is used for training. - It returns a reader creator, each sample in the reader is nine features, including sentence - sequence, predicate, predicate context, predicate context flag and tagged sequence. + Because the train dataset is not free, the test dataset is used for + training. It returns a reader creator, each sample in the reader is nine + features, including sentence sequence, predicate, predicate context, + predicate context flag and tagged sequence. :return: Train reader creator :rtype: callable diff --git a/python/paddle/v2/dataset/imdb.py b/python/paddle/v2/dataset/imdb.py index a3af22f5be419cea999fa12cf7588c79d4069b15..cc07b53ef4b437a850dc5ed6d2a58edda59c7e34 100644 --- a/python/paddle/v2/dataset/imdb.py +++ b/python/paddle/v2/dataset/imdb.py @@ -14,11 +14,10 @@ """ IMDB dataset. -This module download IMDB dataset from -http://ai.stanford.edu/%7Eamaas/data/sentiment/, which contains a set of 25,000 -highly polar movie reviews for training, and 25,000 for testing. Besides, this -module also provides API for build dictionary and parse train set and test set -into paddle reader creators. +This module downloads IMDB dataset from +http://ai.stanford.edu/%7Eamaas/data/sentiment/. This dataset contains a set +of 25,000 highly polar movie reviews for training, and 25,000 for testing. +Besides, this module also provides API for building dictionary. """ import paddle.v2.dataset.common @@ -37,7 +36,7 @@ MD5 = '7c2ac02c03563afcf9b574c7e56c153a' def tokenize(pattern): """ - Read files that match pattern. Tokenize and yield each file. 
+ Read files that match the given pattern. Tokenize and yield each file. """ with tarfile.open(paddle.v2.dataset.common.download(URL, 'imdb', @@ -57,7 +56,8 @@ def tokenize(pattern): def build_dict(pattern, cutoff): """ - Build a word dictionary, the key is word, and the value is index. + Build a word dictionary from the corpus. Keys of the dictionary are words, + and values are zero-based IDs of these words. """ word_freq = collections.defaultdict(int) for doc in tokenize(pattern): @@ -123,7 +123,7 @@ def train(word_idx): """ IMDB train set creator. - It returns a reader creator, each sample in the reader is an index + It returns a reader creator, each sample in the reader is a zero-based ID sequence and label in [0, 1]. :param word_idx: word dictionary @@ -140,7 +140,7 @@ def test(word_idx): """ IMDB test set creator. - It returns a reader creator, each sample in the reader is an index + It returns a reader creator, each sample in the reader is a zero-based ID sequence and label in [0, 1]. :param word_idx: word dictionary @@ -155,7 +155,7 @@ def test(word_idx): def word_dict(): """ - Build word dictionary. + Build a word dictionary from the corpus. :return: Word dictionary :rtype: dict diff --git a/python/paddle/v2/dataset/imikolov.py b/python/paddle/v2/dataset/imikolov.py index 4d3c0d59246d33dd2d9a360cf26495a23908cd11..ddf0dbce22158612d052b835ee018780165040a8 100644 --- a/python/paddle/v2/dataset/imikolov.py +++ b/python/paddle/v2/dataset/imikolov.py @@ -14,8 +14,9 @@ """ imikolov's simple dataset. -This module will download dataset from http://www.fit.vutbr.cz/~imikolov/rnnlm/ and -parse train/test set into paddle reader creators. +This module will download dataset from +http://www.fit.vutbr.cz/~imikolov/rnnlm/ and parse train/test set into paddle +reader creators. """ import paddle.v2.dataset.common import collections @@ -42,7 +43,8 @@ def word_count(f, word_freq=None): def build_dict(): """ - Build a word dictionary, the key is word, and the value is index. 
+ Build a word dictionary from the corpus. Keys of the dictionary are words, + and values are zero-based IDs of these words. """ train_filename = './simple-examples/data/ptb.train.txt' test_filename = './simple-examples/data/ptb.valid.txt' @@ -91,7 +93,7 @@ def train(word_idx, n): """ imikolov train set creator. - It returns a reader creator, each sample in the reader is an index + It returns a reader creator, each sample in the reader is a word ID tuple. :param word_idx: word dictionary @@ -108,7 +110,7 @@ def test(word_idx, n): """ imikolov test set creator. - It returns a reader creator, each sample in the reader is an index + It returns a reader creator, each sample in the reader is a word ID tuple. :param word_idx: word dictionary diff --git a/python/paddle/v2/dataset/movielens.py b/python/paddle/v2/dataset/movielens.py index ea0f83b8d405eda6a4c55f849235367149e7da20..d6b57ec58559e505d092edac540276ecbc989996 100644 --- a/python/paddle/v2/dataset/movielens.py +++ b/python/paddle/v2/dataset/movielens.py @@ -14,10 +14,11 @@ """ Movielens 1-M dataset. -Movielens 1-M dataset contains 1 million ratings from 6000 users on 4000 movies, which was -collected by GroupLens Research. This module will download Movielens 1-M dataset from -http://files.grouplens.org/datasets/movielens/ml-1m.zip and parse train/test set -into paddle reader creators. +Movielens 1-M dataset contains 1 million ratings from 6000 users on 4000 +movies, which was collected by GroupLens Research. This module will download +Movielens 1-M dataset from +http://files.grouplens.org/datasets/movielens/ml-1m.zip and parse train/test +set into paddle reader creators. """ @@ -50,7 +51,7 @@ class MovieInfo(object): def value(self): """ - Get information of a movie. + Get information from a movie. """ return [ self.index, [CATEGORIES_DICT[c] for c in self.categories], @@ -78,7 +79,7 @@ class UserInfo(object): def value(self): """ - Get information of a user. + Get information from a user. 
""" return [self.index, 0 if self.is_male else 1, self.age, self.job_id] diff --git a/python/paddle/v2/dataset/uci_housing.py b/python/paddle/v2/dataset/uci_housing.py index 3e358e4e8aa24f66b85e3b7f94d145ea7dd5641b..5da4df898473f90715a3d89d9b02bca46c675d51 100644 --- a/python/paddle/v2/dataset/uci_housing.py +++ b/python/paddle/v2/dataset/uci_housing.py @@ -75,8 +75,8 @@ def train(): """ UCI_HOUSING train set creator. - It returns a reader creator, each sample in the reader is features after normalization - and price number. + It returns a reader creator, each sample in the reader is features after + normalization and price number. :return: Train reader creator :rtype: callable @@ -95,8 +95,8 @@ def test(): """ UCI_HOUSING test set creator. - It returns a reader creator, each sample in the reader is features after normalization - and price number. + It returns a reader creator, each sample in the reader is features after + normalization and price number. :return: Test reader creator :rtype: callable diff --git a/python/paddle/v2/dataset/wmt14.py b/python/paddle/v2/dataset/wmt14.py index 34757e3baf239b3206c2983e7cdec790948ae907..5cb523864cafd01c96386d0f50c747eceb22ae47 100644 --- a/python/paddle/v2/dataset/wmt14.py +++ b/python/paddle/v2/dataset/wmt14.py @@ -13,8 +13,8 @@ # limitations under the License. """ WMT14 dataset. -The original WMT14 dataset is too large and a small set of data for set is provided. -This module will download dataset from +The original WMT14 dataset is too large and a small set of data for set is +provided. This module will download dataset from http://paddlepaddle.cdn.bcebos.com/demo/wmt_shrinked_data/wmt14.tgz and parse train/test set into paddle reader creators. @@ -107,8 +107,9 @@ def train(dict_size): """ WMT14 train set creator. - It returns a reader creator, each sample in the reader is source language word index - sequence, target language word index sequence and next word index sequence. 
+ It returns a reader creator, each sample in the reader is source language + word ID sequence, target language word ID sequence and next word ID + sequence. :return: Train reader creator :rtype: callable @@ -121,8 +122,9 @@ def test(dict_size): """ WMT14 test set creator. - It returns a reader creator, each sample in the reader is source language word index - sequence, target language word index sequence and next word index sequence. + It returns a reader creator, each sample in the reader is source language + word ID sequence, target language word ID sequence and next word ID + sequence. :return: Train reader creator :rtype: callable diff --git a/python/paddle/v2/trainer.py b/python/paddle/v2/trainer.py index 31222a89e4923e8c20f42a08f7b4b681dc552db3..2711c8bd71980f542a0dbb049ee007d6a6248424 100644 --- a/python/paddle/v2/trainer.py +++ b/python/paddle/v2/trainer.py @@ -1,5 +1,5 @@ """ -Trainer package +Module Trainer """ import collections