提交 67d4d89c 编写于 作者: Q qijun

add doc for some v2/dataset

上级 9f417f12
...@@ -49,7 +49,6 @@ mnist ...@@ -49,7 +49,6 @@ mnist
:members: :members:
:noindex: :noindex:
cifar cifar
+++++ +++++
...@@ -61,7 +60,7 @@ conll05 ...@@ -61,7 +60,7 @@ conll05
+++++++ +++++++
.. automodule:: paddle.v2.dataset.conll05 .. automodule:: paddle.v2.dataset.conll05
:members: :members: get_dict,get_embedding,test
:noindex: :noindex:
imdb imdb
...@@ -79,12 +78,18 @@ imikolov ...@@ -79,12 +78,18 @@ imikolov
:noindex: :noindex:
movielens movielens
+++++++++ +++++++++
.. automodule:: paddle.v2.dataset.movielens .. automodule:: paddle.v2.dataset.movielens
:members: :members:
:noindex: :noindex:
.. autoclass:: paddle.v2.dataset.movielens.MovieInfo
:noindex:
.. autoclass:: paddle.v2.dataset.movielens.UserInfo
:noindex:
sentiment sentiment
+++++++++ +++++++++
...@@ -102,7 +107,7 @@ uci_housing ...@@ -102,7 +107,7 @@ uci_housing
wmt14 wmt14
+++++ +++++
.. automodule:: paddle.v2.dataset.uci_housing .. automodule:: paddle.v2.dataset.wmt14
:members: :members:
:noindex: :noindex:
...@@ -13,25 +13,18 @@ Trainer ...@@ -13,25 +13,18 @@ Trainer
======= =======
.. automodule:: paddle.v2.trainer .. automodule:: paddle.v2.trainer
:members: Trainer :members: SGD
:noindex: :noindex:
Event Event
===== =====
.. automodule:: paddle.v2.event .. automodule:: paddle.v2.event
:members: Event :members:
:noindex: :noindex:
Inference Inference
========= =========
.. automodule:: paddle.v2.inference
:members: Inference
:noindex:
.. autofunction:: paddle.v2.infer .. autofunction:: paddle.v2.infer
:members: :noindex:
:noindex: \ No newline at end of file
...@@ -52,7 +52,7 @@ class DataFeeder(DataProviderConverter): ...@@ -52,7 +52,7 @@ class DataFeeder(DataProviderConverter):
# [ [1.0,2.0,3.0,4.0], 5, [6,7,8] ], # first sample # [ [1.0,2.0,3.0,4.0], 5, [6,7,8] ], # first sample
# [ [1.0,2.0,3.0,4.0], 5, [6,7,8] ] # second sample # [ [1.0,2.0,3.0,4.0], 5, [6,7,8] ] # second sample
# ] # ]
arg = feeder(minibatch_data) arg = feeder.convert(minibatch_data)
.. note:: .. note::
......
...@@ -15,7 +15,7 @@ ...@@ -15,7 +15,7 @@
CIFAR dataset. CIFAR dataset.
This module will download dataset from https://www.cs.toronto.edu/~kriz/cifar.html and This module will download dataset from https://www.cs.toronto.edu/~kriz/cifar.html and
parse train set and test set into paddle reader creators. parse train/test set into paddle reader creators.
The CIFAR-10 dataset consists of 60000 32x32 colour images in 10 classes, with 6000 The CIFAR-10 dataset consists of 60000 32x32 colour images in 10 classes, with 6000
images per class. There are 50000 training images and 10000 test images. images per class. There are 50000 training images and 10000 test images.
......
...@@ -12,12 +12,11 @@ ...@@ -12,12 +12,11 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
""" """
Conll 2005 dataset. Paddle semantic role labeling Book and demo use this Conll05 dataset.
dataset as an example. Because Conll 2005 is not free in public, the default Paddle semantic role labeling Book and demo use this dataset as an example. Because
downloaded URL is test set of Conll 2005 (which is public). Users can change Conll05 is not free in public, the default downloaded URL is test set of
URL and MD5 to their Conll dataset. Conll05 (which is public). Users can change URL and MD5 to their Conll dataset.
And a pre-trained word vector model based on Wikipedia corpus is used to initialize SRL model.
TODO(yuyang18): Complete comments.
""" """
import tarfile import tarfile
...@@ -180,6 +179,9 @@ def reader_creator(corpus_reader, ...@@ -180,6 +179,9 @@ def reader_creator(corpus_reader,
def get_dict(): def get_dict():
"""
Get the word, verb and label dictionary of Wikipedia corpus.
"""
word_dict = load_dict(download(WORDDICT_URL, 'conll05st', WORDDICT_MD5)) word_dict = load_dict(download(WORDDICT_URL, 'conll05st', WORDDICT_MD5))
verb_dict = load_dict(download(VERBDICT_URL, 'conll05st', VERBDICT_MD5)) verb_dict = load_dict(download(VERBDICT_URL, 'conll05st', VERBDICT_MD5))
label_dict = load_dict(download(TRGDICT_URL, 'conll05st', TRGDICT_MD5)) label_dict = load_dict(download(TRGDICT_URL, 'conll05st', TRGDICT_MD5))
...@@ -187,10 +189,23 @@ def get_dict(): ...@@ -187,10 +189,23 @@ def get_dict():
def get_embedding(): def get_embedding():
"""
Get the trained word vector based on Wikipedia corpus.
"""
return download(EMB_URL, 'conll05st', EMB_MD5) return download(EMB_URL, 'conll05st', EMB_MD5)
def test(): def test():
"""
Conll05 test set creator.
Because the train dataset is not free, the test dataset is used for training.
It returns a reader creator, each sample in the reader is nine features, including sentence
sequence, predicate, predicate context, predicate context flag and tagged sequence.
:return: Train reader creator
:rtype: callable
"""
word_dict, verb_dict, label_dict = get_dict() word_dict, verb_dict, label_dict = get_dict()
reader = corpus_reader( reader = corpus_reader(
download(DATA_URL, 'conll05st', DATA_MD5), download(DATA_URL, 'conll05st', DATA_MD5),
......
...@@ -12,9 +12,13 @@ ...@@ -12,9 +12,13 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
""" """
IMDB dataset: http://ai.stanford.edu/%7Eamaas/data/sentiment/aclImdb_v1.tar.gz IMDB dataset.
TODO(yuyang18): Complete comments. This module downloads the IMDB dataset from
http://ai.stanford.edu/%7Eamaas/data/sentiment/, which contains a set of 25,000
highly polar movie reviews for training, and 25,000 for testing. Besides, this
module also provides APIs for building the dictionary and parsing the train set and test set
into paddle reader creators.
""" """
import paddle.v2.dataset.common import paddle.v2.dataset.common
...@@ -30,8 +34,11 @@ URL = 'http://ai.stanford.edu/%7Eamaas/data/sentiment/aclImdb_v1.tar.gz' ...@@ -30,8 +34,11 @@ URL = 'http://ai.stanford.edu/%7Eamaas/data/sentiment/aclImdb_v1.tar.gz'
MD5 = '7c2ac02c03563afcf9b574c7e56c153a' MD5 = '7c2ac02c03563afcf9b574c7e56c153a'
# Read files that match pattern. Tokenize and yield each file.
def tokenize(pattern): def tokenize(pattern):
"""
Read files that match pattern. Tokenize and yield each file.
"""
with tarfile.open(paddle.v2.dataset.common.download(URL, 'imdb', with tarfile.open(paddle.v2.dataset.common.download(URL, 'imdb',
MD5)) as tarf: MD5)) as tarf:
# Note that we should use tarfile.next(), which does # Note that we should use tarfile.next(), which does
...@@ -48,6 +55,9 @@ def tokenize(pattern): ...@@ -48,6 +55,9 @@ def tokenize(pattern):
def build_dict(pattern, cutoff): def build_dict(pattern, cutoff):
"""
Build a word dictionary, the key is word, and the value is index.
"""
word_freq = {} word_freq = {}
for doc in tokenize(pattern): for doc in tokenize(pattern):
for word in doc: for word in doc:
...@@ -109,18 +119,46 @@ def reader_creator(pos_pattern, neg_pattern, word_idx, buffer_size): ...@@ -109,18 +119,46 @@ def reader_creator(pos_pattern, neg_pattern, word_idx, buffer_size):
def train(word_idx): def train(word_idx):
"""
IMDB train set creator.
It returns a reader creator, each sample in the reader is an index
sequence and label in [0, 1].
:param word_idx: word dictionary
:type word_idx: dict
:return: Train reader creator
:rtype: callable
"""
return reader_creator( return reader_creator(
re.compile("aclImdb/train/pos/.*\.txt$"), re.compile("aclImdb/train/pos/.*\.txt$"),
re.compile("aclImdb/train/neg/.*\.txt$"), word_idx, 1000) re.compile("aclImdb/train/neg/.*\.txt$"), word_idx, 1000)
def test(word_idx): def test(word_idx):
"""
IMDB test set creator.
It returns a reader creator, each sample in the reader is an index
sequence and label in [0, 1].
:param word_idx: word dictionary
:type word_idx: dict
:return: Test reader creator
:rtype: callable
"""
return reader_creator( return reader_creator(
re.compile("aclImdb/test/pos/.*\.txt$"), re.compile("aclImdb/test/pos/.*\.txt$"),
re.compile("aclImdb/test/neg/.*\.txt$"), word_idx, 1000) re.compile("aclImdb/test/neg/.*\.txt$"), word_idx, 1000)
def word_dict(): def word_dict():
"""
Build word dictionary.
:return: Word dictionary
:rtype: dict
"""
return build_dict( return build_dict(
re.compile("aclImdb/((train)|(test))/((pos)|(neg))/.*\.txt$"), 150) re.compile("aclImdb/((train)|(test))/((pos)|(neg))/.*\.txt$"), 150)
......
...@@ -12,9 +12,10 @@ ...@@ -12,9 +12,10 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
""" """
imikolov's simple dataset: http://www.fit.vutbr.cz/~imikolov/rnnlm/ imikolov's simple dataset.
Complete comments. This module will download dataset from http://www.fit.vutbr.cz/~imikolov/rnnlm/ and
parse train/test set into paddle reader creators.
""" """
import paddle.v2.dataset.common import paddle.v2.dataset.common
import tarfile import tarfile
...@@ -40,6 +41,9 @@ def word_count(f, word_freq=None): ...@@ -40,6 +41,9 @@ def word_count(f, word_freq=None):
def build_dict(): def build_dict():
"""
Build a word dictionary, the key is word, and the value is index.
"""
train_filename = './simple-examples/data/ptb.train.txt' train_filename = './simple-examples/data/ptb.train.txt'
test_filename = './simple-examples/data/ptb.valid.txt' test_filename = './simple-examples/data/ptb.valid.txt'
with tarfile.open( with tarfile.open(
...@@ -84,10 +88,36 @@ def reader_creator(filename, word_idx, n): ...@@ -84,10 +88,36 @@ def reader_creator(filename, word_idx, n):
def train(word_idx, n): def train(word_idx, n):
"""
imikolov train set creator.
It returns a reader creator, each sample in the reader is an index
tuple.
:param word_idx: word dictionary
:type word_idx: dict
:param n: sliding window size
:type n: int
:return: Train reader creator
:rtype: callable
"""
return reader_creator('./simple-examples/data/ptb.train.txt', word_idx, n) return reader_creator('./simple-examples/data/ptb.train.txt', word_idx, n)
def test(word_idx, n): def test(word_idx, n):
"""
imikolov test set creator.
It returns a reader creator, each sample in the reader is an index
tuple.
:param word_idx: word dictionary
:type word_idx: dict
:param n: sliding window size
:type n: int
:return: Test reader creator
:rtype: callable
"""
return reader_creator('./simple-examples/data/ptb.valid.txt', word_idx, n) return reader_creator('./simple-examples/data/ptb.valid.txt', word_idx, n)
......
...@@ -15,7 +15,7 @@ ...@@ -15,7 +15,7 @@
MNIST dataset. MNIST dataset.
This module will download dataset from http://yann.lecun.com/exdb/mnist/ and This module will download dataset from http://yann.lecun.com/exdb/mnist/ and
parse train set and test set into paddle reader creators. parse train/test set into paddle reader creators.
""" """
import paddle.v2.dataset.common import paddle.v2.dataset.common
import subprocess import subprocess
......
...@@ -14,11 +14,11 @@ ...@@ -14,11 +14,11 @@
""" """
Movielens 1-M dataset. Movielens 1-M dataset.
GroupLens Research collected and made available rating data sets from the Movielens 1-M dataset contains 1 million ratings from 6000 users on 4000 movies, which was
MovieLens web site (http://movielens.org). Movielens 1-M dataset contains 1 million collected by GroupLens Research. This module will download Movielens 1-M dataset from
ratings from 6000 users on 4000 movies. http://files.grouplens.org/datasets/movielens/ml-1m.zip and parse train/test set
into paddle reader creators.
TODO(yuyang18): Complete comments.
""" """
import zipfile import zipfile
...@@ -39,12 +39,18 @@ MD5 = 'c4d9eecfca2ab87c1945afe126590906' ...@@ -39,12 +39,18 @@ MD5 = 'c4d9eecfca2ab87c1945afe126590906'
class MovieInfo(object): class MovieInfo(object):
"""
Movie id, title and categories information are stored in MovieInfo.
"""
def __init__(self, index, categories, title): def __init__(self, index, categories, title):
self.index = int(index) self.index = int(index)
self.categories = categories self.categories = categories
self.title = title self.title = title
def value(self): def value(self):
"""
Get information of a movie.
"""
return [ return [
self.index, [CATEGORIES_DICT[c] for c in self.categories], self.index, [CATEGORIES_DICT[c] for c in self.categories],
[MOVIE_TITLE_DICT[w.lower()] for w in self.title.split()] [MOVIE_TITLE_DICT[w.lower()] for w in self.title.split()]
...@@ -59,6 +65,9 @@ class MovieInfo(object): ...@@ -59,6 +65,9 @@ class MovieInfo(object):
class UserInfo(object): class UserInfo(object):
"""
User id, gender, age, and job information are stored in UserInfo.
"""
def __init__(self, index, gender, age, job_id): def __init__(self, index, gender, age, job_id):
self.index = int(index) self.index = int(index)
self.is_male = gender == 'M' self.is_male = gender == 'M'
...@@ -66,6 +75,9 @@ class UserInfo(object): ...@@ -66,6 +75,9 @@ class UserInfo(object):
self.job_id = int(job_id) self.job_id = int(job_id)
def value(self): def value(self):
"""
Get information of a user.
"""
return [self.index, 0 if self.is_male else 1, self.age, self.job_id] return [self.index, 0 if self.is_male else 1, self.age, self.job_id]
def __str__(self): def __str__(self):
...@@ -152,6 +164,9 @@ test = functools.partial(__reader_creator__, is_test=True) ...@@ -152,6 +164,9 @@ test = functools.partial(__reader_creator__, is_test=True)
def get_movie_title_dict(): def get_movie_title_dict():
"""
Get movie title dictionary.
"""
__initialize_meta_info__() __initialize_meta_info__()
return MOVIE_TITLE_DICT return MOVIE_TITLE_DICT
...@@ -164,11 +179,17 @@ def __max_index_info__(a, b): ...@@ -164,11 +179,17 @@ def __max_index_info__(a, b):
def max_movie_id(): def max_movie_id():
"""
Get the maximum value of movie id.
"""
__initialize_meta_info__() __initialize_meta_info__()
return reduce(__max_index_info__, MOVIE_INFO.viewvalues()).index return reduce(__max_index_info__, MOVIE_INFO.viewvalues()).index
def max_user_id(): def max_user_id():
"""
Get the maximum value of user id.
"""
__initialize_meta_info__() __initialize_meta_info__()
return reduce(__max_index_info__, USER_INFO.viewvalues()).index return reduce(__max_index_info__, USER_INFO.viewvalues()).index
...@@ -181,21 +202,33 @@ def __max_job_id_impl__(a, b): ...@@ -181,21 +202,33 @@ def __max_job_id_impl__(a, b):
def max_job_id(): def max_job_id():
"""
Get the maximum value of job id.
"""
__initialize_meta_info__() __initialize_meta_info__()
return reduce(__max_job_id_impl__, USER_INFO.viewvalues()).job_id return reduce(__max_job_id_impl__, USER_INFO.viewvalues()).job_id
def movie_categories(): def movie_categories():
"""
Get movie categories dictionary.
"""
__initialize_meta_info__() __initialize_meta_info__()
return CATEGORIES_DICT return CATEGORIES_DICT
def user_info(): def user_info():
"""
Get user info dictionary.
"""
__initialize_meta_info__() __initialize_meta_info__()
return USER_INFO return USER_INFO
def movie_info(): def movie_info():
"""
Get movie info dictionary.
"""
__initialize_meta_info__() __initialize_meta_info__()
return MOVIE_INFO return MOVIE_INFO
......
...@@ -14,7 +14,9 @@ ...@@ -14,7 +14,9 @@
""" """
UCI Housing dataset. UCI Housing dataset.
TODO(yuyang18): Complete comments. This module will download dataset from
https://archive.ics.uci.edu/ml/machine-learning-databases/housing/ and
parse train/test set into paddle reader creators.
""" """
import numpy as np import numpy as np
...@@ -70,6 +72,15 @@ def load_data(filename, feature_num=14, ratio=0.8): ...@@ -70,6 +72,15 @@ def load_data(filename, feature_num=14, ratio=0.8):
def train(): def train():
"""
UCI_HOUSING train set creator.
It returns a reader creator, each sample in the reader is features after normalization
and price number.
:return: Train reader creator
:rtype: callable
"""
global UCI_TRAIN_DATA global UCI_TRAIN_DATA
load_data(download(URL, 'uci_housing', MD5)) load_data(download(URL, 'uci_housing', MD5))
...@@ -81,6 +92,15 @@ def train(): ...@@ -81,6 +92,15 @@ def train():
def test(): def test():
"""
UCI_HOUSING test set creator.
It returns a reader creator, each sample in the reader is features after normalization
and price number.
:return: Test reader creator
:rtype: callable
"""
global UCI_TEST_DATA global UCI_TEST_DATA
load_data(download(URL, 'uci_housing', MD5)) load_data(download(URL, 'uci_housing', MD5))
......
...@@ -12,7 +12,12 @@ ...@@ -12,7 +12,12 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
""" """
wmt14 dataset WMT14 dataset.
The original WMT14 dataset is too large, so a small subset of the data is provided.
This module will download dataset from
http://paddlepaddle.cdn.bcebos.com/demo/wmt_shrinked_data/wmt14.tgz and
parse train/test set into paddle reader creators.
""" """
import tarfile import tarfile
...@@ -94,11 +99,29 @@ def reader_creator(tar_file, file_name, dict_size): ...@@ -94,11 +99,29 @@ def reader_creator(tar_file, file_name, dict_size):
def train(dict_size): def train(dict_size):
"""
WMT14 train set creator.
It returns a reader creator, each sample in the reader is source language word index
sequence, target language word index sequence and next word index sequence.
:return: Train reader creator
:rtype: callable
"""
return reader_creator( return reader_creator(
download(URL_TRAIN, 'wmt14', MD5_TRAIN), 'train/train', dict_size) download(URL_TRAIN, 'wmt14', MD5_TRAIN), 'train/train', dict_size)
def test(dict_size): def test(dict_size):
"""
WMT14 test set creator.
It returns a reader creator, each sample in the reader is source language word index
sequence, target language word index sequence and next word index sequence.
:return: Test reader creator
:rtype: callable
"""
return reader_creator( return reader_creator(
download(URL_TRAIN, 'wmt14', MD5_TRAIN), 'test/test', dict_size) download(URL_TRAIN, 'wmt14', MD5_TRAIN), 'test/test', dict_size)
......
""" """
All training events. Testing and training events.
There are: There are:
* TestResult
* BeginIteration * BeginIteration
* EndIteration * EndIteration
* BeginPass * BeginPass
* EndPass * EndPass
TODO(yuyang18): Complete it!
""" """
import py_paddle.swig_paddle as api import py_paddle.swig_paddle as api
......
"""
Trainer package
"""
import collections import collections
import py_paddle.swig_paddle as api import py_paddle.swig_paddle as api
...@@ -9,10 +12,7 @@ from . import optimizer as v2_optimizer ...@@ -9,10 +12,7 @@ from . import optimizer as v2_optimizer
from . import parameters as v2_parameters from . import parameters as v2_parameters
__all__ = ['SGD'] __all__ = ['SGD']
"""
Trainer package
TODO(yuyang18): Complete comments.
"""
def default_event_handler(event): def default_event_handler(event):
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册