Commit 67d4d89c authored by qijun

add doc for some v2/dataset

Parent 9f417f12
......@@ -49,7 +49,6 @@ mnist
:members:
:noindex:
cifar
+++++
......@@ -61,7 +60,7 @@ conll05
+++++++
.. automodule:: paddle.v2.dataset.conll05
:members:
:members: get_dict,get_embedding,test
:noindex:
imdb
......@@ -85,6 +84,12 @@ movielens
:members:
:noindex:
.. autoclass:: paddle.v2.dataset.movielens.MovieInfo
:noindex:
.. autoclass:: paddle.v2.dataset.movielens.UserInfo
:noindex:
sentiment
+++++++++
......@@ -102,7 +107,7 @@ uci_housing
wmt14
+++++
.. automodule:: paddle.v2.dataset.uci_housing
.. automodule:: paddle.v2.dataset.wmt14
:members:
:noindex:
......@@ -13,25 +13,18 @@ Trainer
=======
.. automodule:: paddle.v2.trainer
:members: Trainer
:members: SGD
:noindex:
Event
=====
.. automodule:: paddle.v2.event
:members: Event
:members:
:noindex:
Inference
=========
.. automodule:: paddle.v2.inference
:members: Inference
:noindex:
.. autofunction:: paddle.v2.infer
:members:
:noindex:
\ No newline at end of file
......@@ -52,7 +52,7 @@ class DataFeeder(DataProviderConverter):
# [ [1.0,2.0,3.0,4.0], 5, [6,7,8] ], # first sample
# [ [1.0,2.0,3.0,4.0], 5, [6,7,8] ] # second sample
# ]
arg = feeder(minibatch_data)
arg = feeder.convert(minibatch_data)
.. note::
......
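For orientation, a minimal sketch of how a converter of this shape might be built and used; the import path, data_types and feeding values here are illustrative assumptions, not taken from this commit:

    import paddle.v2 as paddle
    from paddle.v2.data_feeder import DataFeeder  # assumed import path

    # Assumed: DataFeeder takes (name, type) pairs plus a feeding map from
    # input name to its position inside each sample.
    data_types = [('image', paddle.data_type.dense_vector(4)),
                  ('label', paddle.data_type.integer_value(10)),
                  ('ids', paddle.data_type.integer_value_sequence(10))]
    feeding = {'image': 0, 'label': 1, 'ids': 2}
    feeder = DataFeeder(data_types=data_types, feeding=feeding)

    minibatch_data = [
        [[1.0, 2.0, 3.0, 4.0], 5, [6, 7, 8]],  # first sample
        [[1.0, 2.0, 3.0, 4.0], 5, [6, 7, 8]],  # second sample
    ]
    arg = feeder.convert(minibatch_data)  # convert(), not feeder(...)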
......@@ -15,7 +15,7 @@
CIFAR dataset.
This module will download dataset from https://www.cs.toronto.edu/~kriz/cifar.html and
parse train set and test set into paddle reader creators.
parse train/test set into paddle reader creators.
The CIFAR-10 dataset consists of 60000 32x32 colour images in 10 classes, with 6000
images per class. There are 50000 training images and 10000 test images.
......
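As a usage illustration for the reader-creator pattern this docstring describes, a minimal sketch; the train10() entry point and the sample layout are assumptions, since neither appears in this hunk:

    import paddle.v2.dataset.cifar as cifar

    train_reader = cifar.train10()  # assumed CIFAR-10 train reader creator
    for image, label in train_reader():
        # image: assumed 3072 floats (3 x 32 x 32); label: int in [0, 9]
        pass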
......@@ -12,12 +12,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Conll 2005 dataset. Paddle semantic role labeling Book and demo use this
dataset as an example. Because Conll 2005 is not free in public, the default
downloaded URL is test set of Conll 2005 (which is public). Users can change
URL and MD5 to their Conll dataset.
TODO(yuyang18): Complete comments.
Conll05 dataset.
Paddle semantic role labeling Book and demo use this dataset as an example.
Because Conll05 is not free to the public, the default download URL is the
test set of Conll05 (which is public). Users can change the URL and MD5 to
their own Conll dataset. A pre-trained word vector model based on the
Wikipedia corpus is used to initialize the SRL model.
"""
import tarfile
......@@ -180,6 +179,9 @@ def reader_creator(corpus_reader,
def get_dict():
"""
Get the word, verb and label dictionary of Wikipedia corpus.
"""
word_dict = load_dict(download(WORDDICT_URL, 'conll05st', WORDDICT_MD5))
verb_dict = load_dict(download(VERBDICT_URL, 'conll05st', VERBDICT_MD5))
label_dict = load_dict(download(TRGDICT_URL, 'conll05st', TRGDICT_MD5))
......@@ -187,10 +189,23 @@ def get_dict():
def get_embedding():
"""
Get the trained word vector based on Wikipedia corpus.
"""
return download(EMB_URL, 'conll05st', EMB_MD5)
def test():
"""
Conll05 test set creator.
Because the training dataset is not free, the test dataset is used for training.
It returns a reader creator; each sample in the reader contains nine features,
including sentence sequence, predicate, predicate context, predicate context
flag and tagged sequence.
:return: Training reader creator
:rtype: callable
"""
word_dict, verb_dict, label_dict = get_dict()
reader = corpus_reader(
download(DATA_URL, 'conll05st', DATA_MD5),
......
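Putting the documented entry points together, a minimal usage sketch (the nine-feature sample layout follows the docstring above):

    import paddle.v2.dataset.conll05 as conll05

    word_dict, verb_dict, label_dict = conll05.get_dict()
    emb_path = conll05.get_embedding()  # path to the pre-trained embedding
    reader = conll05.test()             # reader creator over the test set
    for sample in reader():
        # nine features per sample: sentence sequence, predicate, predicate
        # context, predicate context flag and tagged sequence
        assert len(sample) == 9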
......@@ -12,9 +12,13 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""
IMDB dataset: http://ai.stanford.edu/%7Eamaas/data/sentiment/aclImdb_v1.tar.gz
IMDB dataset.
TODO(yuyang18): Complete comments.
This module downloads the IMDB dataset from
http://ai.stanford.edu/%7Eamaas/data/sentiment/, which contains a set of 25,000
highly polar movie reviews for training and 25,000 for testing. It also
provides an API for building the word dictionary and parsing the train/test
set into paddle reader creators.
"""
import paddle.v2.dataset.common
......@@ -30,8 +34,11 @@ URL = 'http://ai.stanford.edu/%7Eamaas/data/sentiment/aclImdb_v1.tar.gz'
MD5 = '7c2ac02c03563afcf9b574c7e56c153a'
# Read files that match pattern. Tokenize and yield each file.
def tokenize(pattern):
"""
Read files that match pattern. Tokenize and yield each file.
"""
with tarfile.open(paddle.v2.dataset.common.download(URL, 'imdb',
MD5)) as tarf:
# Note that we should use tarfile.next(), which does
......@@ -48,6 +55,9 @@ def tokenize(pattern):
def build_dict(pattern, cutoff):
"""
Build a word dictionary: the key is the word and the value is its index.
"""
word_freq = {}
for doc in tokenize(pattern):
for word in doc:
......@@ -109,18 +119,46 @@ def reader_creator(pos_pattern, neg_pattern, word_idx, buffer_size):
def train(word_idx):
"""
IMDB train set creator.
It returns a reader creator; each sample in the reader is a word index
sequence and a label in [0, 1].
:param word_idx: word dictionary
:type word_idx: dict
:return: Train reader creator
:rtype: callable
"""
return reader_creator(
re.compile("aclImdb/train/pos/.*\.txt$"),
re.compile("aclImdb/train/neg/.*\.txt$"), word_idx, 1000)
def test(word_idx):
"""
IMDB test set creator.
It returns a reader creator; each sample in the reader is a word index
sequence and a label in [0, 1].
:param word_idx: word dictionary
:type word_idx: dict
:return: Test reader creator
:rtype: callable
"""
return reader_creator(
re.compile("aclImdb/test/pos/.*\.txt$"),
re.compile("aclImdb/test/neg/.*\.txt$"), word_idx, 1000)
def word_dict():
"""
Build word dictionary.
:return: Word dictionary
:rtype: dict
"""
return build_dict(
re.compile("aclImdb/((train)|(test))/((pos)|(neg))/.*\.txt$"), 150)
......
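A minimal sketch tying the documented IMDB entry points together:

    import paddle.v2.dataset.imdb as imdb

    word_idx = imdb.word_dict()          # build the word dictionary
    train_reader = imdb.train(word_idx)  # train reader creator
    for ids, label in train_reader():
        # ids: word index sequence; label: 0 or 1 (review polarity)
        pass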
......@@ -12,9 +12,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""
imikolov's simple dataset: http://www.fit.vutbr.cz/~imikolov/rnnlm/
imikolov's simple dataset.
Complete comments.
This module will download the dataset from http://www.fit.vutbr.cz/~imikolov/rnnlm/ and
parse the train/test set into paddle reader creators.
"""
import paddle.v2.dataset.common
import tarfile
......@@ -40,6 +41,9 @@ def word_count(f, word_freq=None):
def build_dict():
"""
Build a word dictionary: the key is the word and the value is its index.
"""
train_filename = './simple-examples/data/ptb.train.txt'
test_filename = './simple-examples/data/ptb.valid.txt'
with tarfile.open(
......@@ -84,10 +88,36 @@ def reader_creator(filename, word_idx, n):
def train(word_idx, n):
"""
imikolov train set creator.
It returns a reader creator; each sample in the reader is a word index
tuple.
:param word_idx: word dictionary
:type word_idx: dict
:param n: sliding window size
:type n: int
:return: Train reader creator
:rtype: callable
"""
return reader_creator('./simple-examples/data/ptb.train.txt', word_idx, n)
def test(word_idx, n):
"""
imikolov test set creator.
It returns a reader creator; each sample in the reader is a word index
tuple.
:param word_idx: word dictionary
:type word_idx: dict
:param n: sliding window size
:type n: int
:return: Test reader creator
:rtype: callable
"""
return reader_creator('./simple-examples/data/ptb.valid.txt', word_idx, n)
......
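A minimal sketch of the documented imikolov entry points:

    import paddle.v2.dataset.imikolov as imikolov

    word_idx = imikolov.build_dict()
    n = 5  # sliding window size
    for sample in imikolov.train(word_idx, n)():
        # each sample is expected to be an n-word index tuple
        pass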
......@@ -15,7 +15,7 @@
MNIST dataset.
This module will download dataset from http://yann.lecun.com/exdb/mnist/ and
parse train set and test set into paddle reader creators.
parse train/test set into paddle reader creators.
"""
import paddle.v2.dataset.common
import subprocess
......
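For completeness, a minimal usage sketch; train() and the (image, label) sample layout are assumptions based on the module description, since the readers themselves are not shown in this hunk:

    import paddle.v2.dataset.mnist as mnist

    for image, label in mnist.train()():  # assumed train reader creator
        # image: assumed 784 float pixel values; label: digit in [0, 9]
        pass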
......@@ -14,11 +14,11 @@
"""
Movielens 1-M dataset.
GroupLens Research collected and made available rating data sets from the
MovieLens web site (http://movielens.org). Movielens 1-M dataset contains 1 million
ratings from 6000 users on 4000 movies.
The Movielens 1-M dataset contains 1 million ratings from 6000 users on 4000
movies, collected by GroupLens Research. This module will download the dataset
from http://files.grouplens.org/datasets/movielens/ml-1m.zip and parse the
train/test set into paddle reader creators.
TODO(yuyang18): Complete comments.
"""
import zipfile
......@@ -39,12 +39,18 @@ MD5 = 'c4d9eecfca2ab87c1945afe126590906'
class MovieInfo(object):
"""
Movie id, title and categories information are stored in MovieInfo.
"""
def __init__(self, index, categories, title):
self.index = int(index)
self.categories = categories
self.title = title
def value(self):
"""
Get information of a movie.
"""
return [
self.index, [CATEGORIES_DICT[c] for c in self.categories],
[MOVIE_TITLE_DICT[w.lower()] for w in self.title.split()]
......@@ -59,6 +65,9 @@ class MovieInfo(object):
class UserInfo(object):
"""
User id, gender, age, and job information are stored in UserInfo.
"""
def __init__(self, index, gender, age, job_id):
self.index = int(index)
self.is_male = gender == 'M'
......@@ -66,6 +75,9 @@ class UserInfo(object):
self.job_id = int(job_id)
def value(self):
"""
Get information of a user.
"""
return [self.index, 0 if self.is_male else 1, self.age, self.job_id]
def __str__(self):
......@@ -152,6 +164,9 @@ test = functools.partial(__reader_creator__, is_test=True)
def get_movie_title_dict():
"""
Get movie title dictionary.
"""
__initialize_meta_info__()
return MOVIE_TITLE_DICT
......@@ -164,11 +179,17 @@ def __max_index_info__(a, b):
def max_movie_id():
"""
Get the maximum value of movie id.
"""
__initialize_meta_info__()
return reduce(__max_index_info__, MOVIE_INFO.viewvalues()).index
def max_user_id():
"""
Get the maximum value of user id.
"""
__initialize_meta_info__()
return reduce(__max_index_info__, USER_INFO.viewvalues()).index
......@@ -181,21 +202,33 @@ def __max_job_id_impl__(a, b):
def max_job_id():
"""
Get the maximum value of job id.
"""
__initialize_meta_info__()
return reduce(__max_job_id_impl__, USER_INFO.viewvalues()).job_id
def movie_categories():
"""
Get movie categories dictionary.
"""
__initialize_meta_info__()
return CATEGORIES_DICT
def user_info():
"""
Get user info dictionary.
"""
__initialize_meta_info__()
return USER_INFO
def movie_info():
"""
Get movie info dictionary.
"""
__initialize_meta_info__()
return MOVIE_INFO
......
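A minimal sketch of the helpers documented above; only test() is shown as a functools.partial in this hunk, so its sample layout is left unspecified:

    import paddle.v2.dataset.movielens as movielens

    print(movielens.max_user_id())   # maximum user id
    print(movielens.max_movie_id())  # maximum movie id
    print(movielens.max_job_id())    # maximum job id
    categories = movielens.movie_categories()
    for sample in movielens.test()():  # test reader built via functools.partial
        pass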
......@@ -14,7 +14,9 @@
"""
UCI Housing dataset.
TODO(yuyang18): Complete comments.
This module will download the dataset from
https://archive.ics.uci.edu/ml/machine-learning-databases/housing/ and
parse the train/test set into paddle reader creators.
"""
import numpy as np
......@@ -70,6 +72,15 @@ def load_data(filename, feature_num=14, ratio=0.8):
def train():
"""
UCI_HOUSING train set creator.
It returns a reader creator; each sample in the reader is the normalized
feature vector and the corresponding house price.
:return: Train reader creator
:rtype: callable
"""
global UCI_TRAIN_DATA
load_data(download(URL, 'uci_housing', MD5))
......@@ -81,6 +92,15 @@ def train():
def test():
"""
UCI_HOUSING test set creator.
It returns a reader creator; each sample in the reader is the normalized
feature vector and the corresponding house price.
:return: Test reader creator
:rtype: callable
"""
global UCI_TEST_DATA
load_data(download(URL, 'uci_housing', MD5))
......
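A minimal sketch of the documented UCI housing readers:

    import paddle.v2.dataset.uci_housing as uci_housing

    for features, price in uci_housing.train()():
        # features: normalized attribute vector; price: house price
        pass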
......@@ -12,7 +12,12 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""
wmt14 dataset
WMT14 dataset.
The original WMT14 dataset is too large, so a shrunken subset is provided
instead. This module will download the dataset from
http://paddlepaddle.cdn.bcebos.com/demo/wmt_shrinked_data/wmt14.tgz and
parse the train/test set into paddle reader creators.
"""
import tarfile
......@@ -94,11 +99,29 @@ def reader_creator(tar_file, file_name, dict_size):
def train(dict_size):
"""
WMT14 train set creator.
It returns a reader creator; each sample in the reader is a source language
word index sequence, a target language word index sequence and a next word
index sequence.
:return: Train reader creator
:rtype: callable
"""
return reader_creator(
download(URL_TRAIN, 'wmt14', MD5_TRAIN), 'train/train', dict_size)
def test(dict_size):
"""
WMT14 test set creator.
It returns a reader creator; each sample in the reader is a source language
word index sequence, a target language word index sequence and a next word
index sequence.
:return: Test reader creator
:rtype: callable
"""
return reader_creator(
download(URL_TRAIN, 'wmt14', MD5_TRAIN), 'test/test', dict_size)
......
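A minimal sketch of the documented WMT14 readers; the dict_size value is illustrative:

    import paddle.v2.dataset.wmt14 as wmt14

    dict_size = 30000  # illustrative vocabulary size
    for src_ids, trg_ids, trg_next_ids in wmt14.train(dict_size)():
        # source, target and next-word index sequences, per the docstring
        pass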
"""
All training events.
Testing and training events.
There are:
* TestResult
* BeginIteration
* EndIteration
* BeginPass
* EndPass
TODO(yuyang18): Complete it!
"""
import py_paddle.swig_paddle as api
......
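For reference, a minimal event-handler sketch dispatching on the event types listed above; the pass_id, batch_id and cost attributes are assumptions from common v2 usage, not defined in this hunk:

    import paddle.v2 as paddle

    def event_handler(event):
        if isinstance(event, paddle.event.EndIteration):
            if event.batch_id % 100 == 0:  # batch_id/cost: assumed attributes
                print("Pass %d, Batch %d, Cost %f" %
                      (event.pass_id, event.batch_id, event.cost))
        elif isinstance(event, paddle.event.EndPass):
            print("Pass %d finished." % event.pass_id)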
"""
Trainer package
"""
import collections
import py_paddle.swig_paddle as api
......@@ -9,10 +12,7 @@ from . import optimizer as v2_optimizer
from . import parameters as v2_parameters
__all__ = ['SGD']
"""
Trainer package
TODO(yuyang18): Complete comments.
"""
def default_event_handler(event):
......
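Since __all__ now exports SGD, a minimal sketch of driving training through it; the constructor and train() signatures are assumptions from typical v2 usage, not confirmed by this hunk:

    import paddle.v2 as paddle

    # cost, parameters and optimizer are placeholders built elsewhere, e.g.
    #   cost = paddle.layer.classification_cost(input=prediction, label=label)
    #   parameters = paddle.parameters.create(cost)
    #   optimizer = paddle.optimizer.Momentum(momentum=0)
    trainer = paddle.trainer.SGD(cost=cost,
                                 parameters=parameters,
                                 update_equation=optimizer)
    trainer.train(reader=train_reader,        # assumed batched data reader
                  num_passes=10,
                  event_handler=event_handler)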