diff --git a/doc/api/v2/config/optimizer.rst b/doc/api/v2/config/optimizer.rst
index ec6ba0aa46239f3806ca950e8863b953d0c4150b..b32373fdef52a7aa9d64b12cda3f76cb2abf351b 100644
--- a/doc/api/v2/config/optimizer.rst
+++ b/doc/api/v2/config/optimizer.rst
@@ -1,5 +1,3 @@
-.. _api_v2.optimizer:
-
 ==========
 Optimizer
 ==========
diff --git a/doc/api/v2/data.rst b/doc/api/v2/data.rst
index b042320bc2922a1ddfa06b5d8479ac9134ae9d89..fef87c4fbdb452771ecdb361c6eeae5b32bcee14 100644
--- a/doc/api/v2/data.rst
+++ b/doc/api/v2/data.rst
@@ -1,6 +1,6 @@
-========
-Datasets
-========
+==================================
+Data Reader Interface and DataSets
+==================================
 
 
 DataTypes
@@ -49,7 +49,6 @@ mnist
     :members:
     :noindex:
 
-
 cifar
 +++++
 
@@ -61,7 +60,7 @@ conll05
 +++++++
 
 .. automodule:: paddle.v2.dataset.conll05
-    :members:
+    :members: get_dict,get_embedding,test
     :noindex:
 
 imdb
@@ -85,6 +84,12 @@ movielens
     :members:
     :noindex:
 
+.. autoclass:: paddle.v2.dataset.movielens.MovieInfo
+    :noindex:
+
+.. autoclass:: paddle.v2.dataset.movielens.UserInfo
+    :noindex:
+
 sentiment
 +++++++++
 
@@ -102,7 +107,7 @@ uci_housing
 wmt14
 +++++
 
-.. automodule:: paddle.v2.dataset.uci_housing
+.. automodule:: paddle.v2.dataset.wmt14
     :members:
     :noindex:
 
diff --git a/doc/api/v2/run_logic.rst b/doc/api/v2/run_logic.rst
index 94921e1a7b9c0a95931136bfb65d2560dba8b8ee..5c97651f6536d89d2b5926d4b2907a547aa86b55 100644
--- a/doc/api/v2/run_logic.rst
+++ b/doc/api/v2/run_logic.rst
@@ -6,18 +6,21 @@ Parameters
 ==========
 
 .. automodule:: paddle.v2.parameters
+    :members: Parameters
     :noindex:
 
 Trainer
 =======
 
 .. automodule:: paddle.v2.trainer
+    :members: SGD
     :noindex:
 
 Event
 =====
 
 .. automodule:: paddle.v2.event
+    :members:
     :noindex:
 
 Inference
@@ -25,3 +28,4 @@ Inference
 
 .. autofunction:: paddle.v2.infer
     :noindex:
+
\ No newline at end of file
diff --git a/doc/tutorials/embedding_model/index_cn.md b/doc/tutorials/embedding_model/index_cn.md
index fe800308d8d7a03619ec8e13fd8dc4aa7a8ed8be..2b4a79fbbfc0c4af74aa73c540919f5d9cf2635b 100644
--- a/doc/tutorials/embedding_model/index_cn.md
+++ b/doc/tutorials/embedding_model/index_cn.md
@@ -6,9 +6,10 @@
 ## 介绍 ###
 ### 中文字典 ###
-我们的字典使用内部的分词工具对百度知道和百度百科的语料进行分词后产生。分词风格如下: "《红楼梦》"将被分为 "《","红楼梦","》",和 "《红楼梦》"。字典采用UTF8编码,输出有2列:词本身和词频。字典共包含 3206325个词和3个特殊标记:
+我们的字典使用内部的分词工具对百度知道和百度百科的语料进行分词后产生。分词风格如下: "《红楼梦》"将被分为 "《","红楼梦","》",和 "《红楼梦》"。字典采用UTF8编码,输出有2列:词本身和词频。字典共包含 3206326个词和4个特殊标记:
 - `<s>`: 分词序列的开始
 - `<e>`: 分词序列的结束
+ - `PALCEHOLDER_JUST_IGNORE_THE_EMBEDDING`: 占位符,没有实际意义
 - `<unk>`: 未知词
 
 ### 中文词向量的预训练模型 ###
diff --git a/doc/tutorials/embedding_model/index_en.md b/doc/tutorials/embedding_model/index_en.md
index d793a50f488e464bcd90a2fb506a8dcc3c760433..9525f64f9b5384c8e44690fb0887fb2293108e0a 100644
--- a/doc/tutorials/embedding_model/index_en.md
+++ b/doc/tutorials/embedding_model/index_en.md
@@ -6,9 +6,10 @@ We thank @lipeng for the pull request that defined the model schemas and pretrai
 ## Introduction ###
 ### Chinese Word Dictionary ###
-Our Chinese-word dictionary is created on Baidu ZhiDao and Baidu Baike by using in-house word segmentor. For example, the participle of "《红楼梦》" is "《","红楼梦","》",and "《红楼梦》". Our dictionary (using UTF-8 format) has has two columns: word and its frequency. The total word count is 3206325, including 3 special token:
+Our Chinese-word dictionary is created from the Baidu ZhiDao and Baidu Baike corpora using an in-house word segmenter. For example, "《红楼梦》" is segmented into "《", "红楼梦", "》", and "《红楼梦》". Our dictionary (in UTF-8 format) has two columns: the word and its frequency.
+The total word count is 3206326, including 4 special tokens:
 - `<s>`: the start of a sequence
 - `<e>`: the end of a sequence
+ - `PALCEHOLDER_JUST_IGNORE_THE_EMBEDDING`: a placeholder; just ignore it and its embedding
 - `<unk>`: a word not included in dictionary
 
 ### Pretrained Chinese Word Embedding Model ###
diff --git a/paddle/gserver/gradientmachines/GradientMachine.h b/paddle/gserver/gradientmachines/GradientMachine.h
index bc2f2f8563526aa045ea89f15152ee2d639b5774..f9c82a2bef82b4e6bcbf0c73583505d2692f3926 100644
--- a/paddle/gserver/gradientmachines/GradientMachine.h
+++ b/paddle/gserver/gradientmachines/GradientMachine.h
@@ -134,9 +134,7 @@ public:
     backward(callback);
   }
 
-  virtual Argument getLayerOutput(const std::string& layerName) {
-    return *((Argument*)nullptr);
-  }
+  virtual Argument getLayerOutput(const std::string& layerName) = 0;
 
   // see comment in Layer.h for the function with the same name
   virtual void resetState() {}
diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh
index 176b8278f1102b240d02a494388a18229a682d55..a4b63f90ec85100fc1e8c05258b008173c255508 100644
--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
@@ -9,6 +9,9 @@ if [ ${WITH_GPU} == "ON" ]; then
   GPU_DOCKER_PKG="python-pip python-dev"
 else
   BASE_IMAGE="python:2.7.13-slim"
+  # FIXME: the python base image uses a different python version than WITH_GPU,
+  # so PYTHONHOME must be changed to /usr/local when using the python base image
+  CPU_DOCKER_PYTHON_HOME_ENV="ENV PYTHONHOME /usr/local"
 fi
 
 DOCKERFILE_GPU_ENV=""
@@ -97,7 +100,8 @@ ADD build/*.deb /usr/local/opt/paddle/deb/
 RUN dpkg -i /usr/local/opt/paddle/deb/*.deb && \
     rm -f /usr/local/opt/paddle/deb/*.deb && \
     paddle version
-${DOCKERFILE_CUDNN_DSO}
+${CPU_DOCKER_PYTHON_HOME_ENV}
+${DOCKERFILE_CUDNN_DSO}
 ${DOCKERFILE_GPU_ENV}
 # default command shows the paddle version and exit
 CMD ["paddle", "version"]
diff --git a/paddle/utils/.gitignore b/paddle/utils/.gitignore
index 956b606a18cae1bb11322accfa174ae5ce1580de..f2cfd7409412de68f4183daebcb48e7a3ae37672 100644
--- a/paddle/utils/.gitignore
+++ b/paddle/utils/.gitignore
@@ -1,2 +1 @@
 enable_virtualenv.c
-PythonUtil.cpp
diff --git a/paddle/utils/CMakeLists.txt b/paddle/utils/CMakeLists.txt
index 10d906ee16656a808122b81d8b2fef55b8e7b7e9..171eae381af70e9e76210a77a409e56a527cab06 100644
--- a/paddle/utils/CMakeLists.txt
+++ b/paddle/utils/CMakeLists.txt
@@ -1,7 +1,4 @@
 # The utilities for paddle
-
-configure_file(PythonUtil.cpp.in ${PROJ_ROOT}/paddle/utils/PythonUtil.cpp)
-
 file(GLOB UTIL_HEADERS . *.h)
 file(GLOB UTIL_SOURCES . *.cpp)
 create_resources(enable_virtualenv.py enable_virtualenv.c)
diff --git a/paddle/utils/PythonUtil.cpp.in b/paddle/utils/PythonUtil.cpp
similarity index 95%
rename from paddle/utils/PythonUtil.cpp.in
rename to paddle/utils/PythonUtil.cpp
index a51b8f765f41f6febb86002f371b14e8797e7e4d..7faeff55c28b9065179ad27b3b604a9f411249e5 100644
--- a/paddle/utils/PythonUtil.cpp.in
+++ b/paddle/utils/PythonUtil.cpp
@@ -195,15 +195,6 @@ extern const char enable_virtualenv_py[];
 }
 
 void initPython(int argc, char** argv) {
 #ifndef PADDLE_NO_PYTHON
-  std::string pyHome;
-#if defined(__APPLE__) || defined(__OSX__)
-  pyHome = "/usr/local/Frameworks/Python.framework/Versions/2.7";
-  Py_SetPythonHome(const_cast<char*>(pyHome.c_str()));
-#endif
-  pyHome = "@PYTHON_INSTALL_DIR@";  // NOLINT
-  if (!pyHome.empty()) {
-    Py_SetPythonHome(const_cast<char*>(pyHome.c_str()));
-  }
   Py_SetProgramName(argv[0]);
   Py_Initialize();
   PySys_SetArgv(argc, argv);
diff --git a/python/paddle/v2/data_feeder.py b/python/paddle/v2/data_feeder.py
index ca3e44e5a0187da33654f4955197196b150da196..2698251b9e15046eb14f71c3f5b0546ecbb4a5dd 100644
--- a/python/paddle/v2/data_feeder.py
+++ b/python/paddle/v2/data_feeder.py
@@ -67,7 +67,7 @@ class DataFeeder(DataProviderConverter):
         #   [ [1.0,2.0,3.0,4.0], 5, [6,7,8] ],  # first sample
         #   [ [1.0,2.0,3.0,4.0], 5, [6,7,8] ]   # second sample
         # ]
-        arg = feeder(minibatch_data)
+        arg = feeder.convert(minibatch_data)
 
         .. note::
diff --git a/python/paddle/v2/dataset/cifar.py b/python/paddle/v2/dataset/cifar.py
index 3a8b98b8f045b0eb58be69649486cbd0a571f118..41fda1e8f24cdef13d8ab3645862814100a1cd4c 100644
--- a/python/paddle/v2/dataset/cifar.py
+++ b/python/paddle/v2/dataset/cifar.py
@@ -12,9 +12,20 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
-CIFAR dataset: https://www.cs.toronto.edu/~kriz/cifar.html
+CIFAR dataset.
+
+This module will download the dataset from
+https://www.cs.toronto.edu/~kriz/cifar.html and parse the train/test set
+into paddle reader creators.
+
+The CIFAR-10 dataset consists of 60000 32x32 colour images in 10 classes,
+with 6000 images per class. There are 50000 training images and 10000 test
+images.
+
+The CIFAR-100 dataset is just like the CIFAR-10, except it has 100 classes
+containing 600 images each. There are 500 training images and 100 testing
+images per class.
 
-TODO(yuyang18): Complete the comments.
 """
 
 import cPickle
@@ -54,20 +65,56 @@ def reader_creator(filename, sub_name):
 
 
 def train100():
+    """
+    CIFAR-100 training set creator.
+
+    It returns a reader creator, each sample in the reader is image pixels in
+    [0, 1] and label in [0, 99].
+
+    :return: Training reader creator
+    :rtype: callable
+    """
     return reader_creator(
         download(CIFAR100_URL, 'cifar', CIFAR100_MD5), 'train')
 
 
 def test100():
+    """
+    CIFAR-100 test set creator.
+
+    It returns a reader creator, each sample in the reader is image pixels in
+    [0, 1] and label in [0, 99].
+
+    :return: Test reader creator.
+    :rtype: callable
+    """
     return reader_creator(download(CIFAR100_URL, 'cifar', CIFAR100_MD5), 'test')
 
 
 def train10():
+    """
+    CIFAR-10 training set creator.
+
+    It returns a reader creator, each sample in the reader is image pixels in
+    [0, 1] and label in [0, 9].
+
+    :return: Training reader creator
+    :rtype: callable
+    """
     return reader_creator(
         download(CIFAR10_URL, 'cifar', CIFAR10_MD5), 'data_batch')
 
 
 def test10():
+    """
+    CIFAR-10 test set creator.
+
+    It returns a reader creator, each sample in the reader is image pixels in
+    [0, 1] and label in [0, 9].
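+
+    For illustration, a minimal sketch of how the returned reader creator is
+    typically consumed (the loop below is an example of use, not part of
+    this module):
+
+    .. code-block:: python
+
+        import paddle.v2.dataset.cifar as cifar
+
+        reader = cifar.test10()
+        for image, label in reader():
+            # image: pixel values in [0, 1]; label: an int in [0, 9]
+            pass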
+
+    :return: Test reader creator.
+    :rtype: callable
+    """
     return reader_creator(
         download(CIFAR10_URL, 'cifar', CIFAR10_MD5), 'test_batch')
diff --git a/python/paddle/v2/dataset/conll05.py b/python/paddle/v2/dataset/conll05.py
index f1b0ce16f21ad13d4564242c2359355236093032..12d648bf6557ed6e437320e56a80294abac29f18 100644
--- a/python/paddle/v2/dataset/conll05.py
+++ b/python/paddle/v2/dataset/conll05.py
@@ -11,19 +11,19 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+"""
+Conll05 dataset.
+Paddle's semantic role labeling Book and demo use this dataset as an example.
+Because Conll05 is not freely available to the public, the default download
+URL is the Conll05 test set (which is public). Users can change the URL and
+MD5 to point to their own Conll dataset. A pre-trained word vector model
+based on the Wikipedia corpus is used to initialize the SRL model.
+"""
 
 import tarfile
 import gzip
 import itertools
 from common import download
-"""
-Conll 2005 dataset. Paddle semantic role labeling Book and demo use this
-dataset as an example. Because Conll 2005 is not free in public, the default
-downloaded URL is test set of Conll 2005 (which is public). Users can change
-URL and MD5 to their Conll dataset.
-
-TODO(yuyang18): Complete comments.
-"""
 
 __all__ = ['test', 'get_dict', 'get_embedding']
@@ -179,6 +179,9 @@ def reader_creator(corpus_reader,
 
 
 def get_dict():
+    """
+    Get the word, verb and label dictionaries of the Wikipedia corpus.
+    """
     word_dict = load_dict(download(WORDDICT_URL, 'conll05st', WORDDICT_MD5))
     verb_dict = load_dict(download(VERBDICT_URL, 'conll05st', VERBDICT_MD5))
     label_dict = load_dict(download(TRGDICT_URL, 'conll05st', TRGDICT_MD5))
@@ -186,10 +189,24 @@ def get_dict():
 
 
 def get_embedding():
+    """
+    Get the trained word vectors based on the Wikipedia corpus.
+    """
     return download(EMB_URL, 'conll05st', EMB_MD5)
 
 
 def test():
+    """
+    Conll05 test set creator.
+
+    Because the training dataset is not freely available, the test dataset is
+    used for training. It returns a reader creator; each sample in the reader
+    is nine features, including sentence sequence, predicate, predicate
+    context, predicate context flag and tagged sequence.
+
+    :return: Training reader creator
+    :rtype: callable
+    """
     word_dict, verb_dict, label_dict = get_dict()
     reader = corpus_reader(
         download(DATA_URL, 'conll05st', DATA_MD5),
diff --git a/python/paddle/v2/dataset/imdb.py b/python/paddle/v2/dataset/imdb.py
index 9a7ccff4d5cd2563053adb0aae95fc6d10ad2a50..5dc5abfe53d90ec3adc9a27a49ed086953146497 100644
--- a/python/paddle/v2/dataset/imdb.py
+++ b/python/paddle/v2/dataset/imdb.py
@@ -12,9 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
-IMDB dataset: http://ai.stanford.edu/%7Eamaas/data/sentiment/aclImdb_v1.tar.gz
+IMDB dataset.
 
-TODO(yuyang18): Complete comments.
+This module downloads the IMDB dataset from
+http://ai.stanford.edu/%7Eamaas/data/sentiment/. The dataset contains a set
+of 25,000 highly polar movie reviews for training, and 25,000 for testing.
+It also provides an API for building a word dictionary.
 """
 
 import paddle.v2.dataset.common
@@ -31,8 +34,11 @@ URL = 'http://ai.stanford.edu/%7Eamaas/data/sentiment/aclImdb_v1.tar.gz'
 MD5 = '7c2ac02c03563afcf9b574c7e56c153a'
 
 
-# Read files that match pattern. Tokenize and yield each file.
 def tokenize(pattern):
+    """
+    Read files that match the given pattern. Tokenize and yield each file.
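+
+    A small usage sketch (the pattern below is only illustrative; any regex
+    matching files inside the downloaded tarball works):
+
+    .. code-block:: python
+
+        import re
+
+        for words in tokenize(re.compile("aclImdb/train/pos/.*\.txt$")):
+            pass  # words is the token list of one review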
+    """
+
    with tarfile.open(paddle.v2.dataset.common.download(URL, 'imdb',
                                                        MD5)) as tarf:
        # Note that we should use tarfile.next(), which does
@@ -49,6 +55,10 @@ def tokenize(pattern):
 
 
 def build_dict(pattern, cutoff):
+    """
+    Build a word dictionary from the corpus. Keys of the dictionary are words,
+    and values are zero-based IDs of these words.
+    """
     word_freq = collections.defaultdict(int)
     for doc in tokenize(pattern):
         for word in doc:
@@ -110,18 +120,46 @@ def reader_creator(pos_pattern, neg_pattern, word_idx, buffer_size):
 
 
 def train(word_idx):
+    """
+    IMDB training set creator.
+
+    It returns a reader creator, each sample in the reader is a zero-based ID
+    sequence and label in [0, 1].
+
+    :param word_idx: word dictionary
+    :type word_idx: dict
+    :return: Training reader creator
+    :rtype: callable
+    """
     return reader_creator(
         re.compile("aclImdb/train/pos/.*\.txt$"),
         re.compile("aclImdb/train/neg/.*\.txt$"), word_idx, 1000)
 
 
 def test(word_idx):
+    """
+    IMDB test set creator.
+
+    It returns a reader creator, each sample in the reader is a zero-based ID
+    sequence and label in [0, 1].
+
+    :param word_idx: word dictionary
+    :type word_idx: dict
+    :return: Test reader creator
+    :rtype: callable
+    """
     return reader_creator(
         re.compile("aclImdb/test/pos/.*\.txt$"),
         re.compile("aclImdb/test/neg/.*\.txt$"), word_idx, 1000)
 
 
 def word_dict():
+    """
+    Build a word dictionary from the corpus.
+
+    :return: Word dictionary
+    :rtype: dict
+    """
     return build_dict(
         re.compile("aclImdb/((train)|(test))/((pos)|(neg))/.*\.txt$"), 150)
diff --git a/python/paddle/v2/dataset/imikolov.py b/python/paddle/v2/dataset/imikolov.py
index 5d7e0282b4db639e6590ade66241328d6ab8b5e3..41ca27e23632bea7e410f9d91920bbc539d38279 100644
--- a/python/paddle/v2/dataset/imikolov.py
+++ b/python/paddle/v2/dataset/imikolov.py
@@ -12,9 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
-imikolov's simple dataset: http://www.fit.vutbr.cz/~imikolov/rnnlm/
+imikolov's simple dataset.
 
-Complete comments.
+This module will download the dataset from
+http://www.fit.vutbr.cz/~imikolov/rnnlm/ and parse the training set and test
+set into paddle reader creators.
 """
 import paddle.v2.dataset.common
 import collections
@@ -40,6 +42,10 @@ def word_count(f, word_freq=None):
 
 
 def build_dict():
+    """
+    Build a word dictionary from the corpus. Keys of the dictionary are words,
+    and values are zero-based IDs of these words.
+    """
     train_filename = './simple-examples/data/ptb.train.txt'
     test_filename = './simple-examples/data/ptb.valid.txt'
     with tarfile.open(
@@ -84,10 +90,36 @@ def reader_creator(filename, word_idx, n):
 
 
 def train(word_idx, n):
+    """
+    imikolov training set creator.
+
+    It returns a reader creator, each sample in the reader is a word ID
+    tuple.
+
+    :param word_idx: word dictionary
+    :type word_idx: dict
+    :param n: sliding window size
+    :type n: int
+    :return: Training reader creator
+    :rtype: callable
+    """
     return reader_creator('./simple-examples/data/ptb.train.txt', word_idx, n)
 
 
 def test(word_idx, n):
+    """
+    imikolov test set creator.
+
+    It returns a reader creator, each sample in the reader is a word ID
+    tuple.
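+
+    A minimal sketch of typical use (the window size 5 is illustrative):
+
+    .. code-block:: python
+
+        import paddle.v2.dataset.imikolov as imikolov
+
+        word_idx = imikolov.build_dict()
+        for sample in imikolov.test(word_idx, 5)():
+            pass  # sample is a tuple of 5 word IDs from a sliding window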
+
+    :param word_idx: word dictionary
+    :type word_idx: dict
+    :param n: sliding window size
+    :type n: int
+    :return: Test reader creator
+    :rtype: callable
+    """
     return reader_creator('./simple-examples/data/ptb.valid.txt', word_idx, n)
diff --git a/python/paddle/v2/dataset/mnist.py b/python/paddle/v2/dataset/mnist.py
index 48a39b5493a8004d6eb034498a797af9c662bd19..c1347d3c66da858104858bfb6739d84051322146 100644
--- a/python/paddle/v2/dataset/mnist.py
+++ b/python/paddle/v2/dataset/mnist.py
@@ -15,7 +15,7 @@
 MNIST dataset.
 
 This module will download dataset from http://yann.lecun.com/exdb/mnist/ and
-parse train set and test set into paddle reader creators.
+parse training set and test set into paddle reader creators.
 """
 import paddle.v2.dataset.common
 import subprocess
@@ -76,12 +76,12 @@ def reader_creator(image_filename, label_filename, buffer_size):
 
 def train():
     """
-    MNIST train set creator.
+    MNIST training set creator.
 
     It returns a reader creator, each sample in the reader is image pixels in
     [0, 1] and label in [0, 9].
 
-    :return: Train reader creator
+    :return: Training reader creator
     :rtype: callable
     """
     return reader_creator(
diff --git a/python/paddle/v2/dataset/movielens.py b/python/paddle/v2/dataset/movielens.py
index e148ddeca0370cd76128a31ce3a4d488e9737d98..837a85912663826f0483aff4f6a38f3945375d82 100644
--- a/python/paddle/v2/dataset/movielens.py
+++ b/python/paddle/v2/dataset/movielens.py
@@ -14,7 +14,12 @@
 """
 Movielens 1-M dataset.
 
-TODO(yuyang18): Complete comments.
+The Movielens 1-M dataset contains 1 million ratings from 6000 users on 4000
+movies, collected by GroupLens Research. This module will download the
+Movielens 1-M dataset from
+http://files.grouplens.org/datasets/movielens/ml-1m.zip and parse the
+training set and test set into paddle reader creators.
+
 """
 
 import zipfile
@@ -35,12 +40,19 @@ MD5 = 'c4d9eecfca2ab87c1945afe126590906'
 
 
 class MovieInfo(object):
+    """
+    Movie id, title and categories information are stored in MovieInfo.
+    """
+
     def __init__(self, index, categories, title):
         self.index = int(index)
         self.categories = categories
         self.title = title
 
     def value(self):
+        """
+        Get the feature values of a movie.
+        """
         return [
             self.index, [CATEGORIES_DICT[c] for c in self.categories],
             [MOVIE_TITLE_DICT[w.lower()] for w in self.title.split()]
@@ -55,6 +67,10 @@ class MovieInfo(object):
 
 
 class UserInfo(object):
+    """
+    User id, gender, age, and job information are stored in UserInfo.
+    """
+
     def __init__(self, index, gender, age, job_id):
         self.index = int(index)
         self.is_male = gender == 'M'
@@ -62,6 +78,9 @@ class UserInfo(object):
         self.job_id = int(job_id)
 
     def value(self):
+        """
+        Get the feature values of a user.
+        """
         return [self.index, 0 if self.is_male else 1, self.age, self.job_id]
 
     def __str__(self):
@@ -148,6 +167,9 @@ test = functools.partial(__reader_creator__, is_test=True)
 
 
 def get_movie_title_dict():
+    """
+    Get movie title dictionary.
+    """
     __initialize_meta_info__()
     return MOVIE_TITLE_DICT
 
@@ -160,11 +182,17 @@ def __max_index_info__(a, b):
 
 
 def max_movie_id():
+    """
+    Get the maximum value of movie id.
+    """
     __initialize_meta_info__()
     return reduce(__max_index_info__, MOVIE_INFO.viewvalues()).index
 
 
 def max_user_id():
+    """
+    Get the maximum value of user id.
+    """
     __initialize_meta_info__()
     return reduce(__max_index_info__, USER_INFO.viewvalues()).index
 
@@ -177,21 +205,33 @@ def __max_job_id_impl__(a, b):
 
 
 def max_job_id():
+    """
+    Get the maximum value of job id.
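+
+    For example, a sketch of how this maximum is typically used to size an
+    input dimension (illustrative only):
+
+    .. code-block:: python
+
+        import paddle.v2.dataset.movielens as movielens
+
+        job_dim = movielens.max_job_id() + 1  # ids are zero-based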
+    """
     __initialize_meta_info__()
     return reduce(__max_job_id_impl__, USER_INFO.viewvalues()).job_id
 
 
 def movie_categories():
+    """
+    Get movie categories dictionary.
+    """
     __initialize_meta_info__()
     return CATEGORIES_DICT
 
 
 def user_info():
+    """
+    Get user info dictionary.
+    """
     __initialize_meta_info__()
     return USER_INFO
 
 
 def movie_info():
+    """
+    Get movie info dictionary.
+    """
     __initialize_meta_info__()
     return MOVIE_INFO
diff --git a/python/paddle/v2/dataset/sentiment.py b/python/paddle/v2/dataset/sentiment.py
index 0eeb6d5affd8c280fb74edc82cf24bf418ca8ef9..4dd34e7383fe2a290fcf61474914183a383e2b9c 100644
--- a/python/paddle/v2/dataset/sentiment.py
+++ b/python/paddle/v2/dataset/sentiment.py
@@ -113,7 +113,7 @@ def reader_creator(data):
 
 def train():
     """
-    Default train set reader creator
+    Default training set reader creator
     """
     data_set = load_sentiment_data()
     return reader_creator(data_set[0:NUM_TRAINING_INSTANCES])
diff --git a/python/paddle/v2/dataset/uci_housing.py b/python/paddle/v2/dataset/uci_housing.py
index dab8620441c966b19d8218025f8d8fa5b40d1c2c..3469fd9ce12dd4d934004f90286979b73048a5c8 100644
--- a/python/paddle/v2/dataset/uci_housing.py
+++ b/python/paddle/v2/dataset/uci_housing.py
@@ -14,7 +14,9 @@
 """
 UCI Housing dataset.
 
-TODO(yuyang18): Complete comments.
+This module will download the dataset from
+https://archive.ics.uci.edu/ml/machine-learning-databases/housing/ and
+parse the training set and test set into paddle reader creators.
 """
 
 import numpy as np
@@ -70,6 +72,15 @@ def load_data(filename, feature_num=14, ratio=0.8):
 
 
 def train():
+    """
+    UCI_HOUSING training set creator.
+
+    It returns a reader creator, each sample in the reader is the normalized
+    feature values and the corresponding house price.
+
+    :return: Training reader creator
+    :rtype: callable
+    """
     global UCI_TRAIN_DATA
     load_data(download(URL, 'uci_housing', MD5))
 
@@ -81,6 +92,15 @@ def train():
 
 
 def test():
+    """
+    UCI_HOUSING test set creator.
+
+    It returns a reader creator, each sample in the reader is the normalized
+    feature values and the corresponding house price.
+
+    :return: Test reader creator
+    :rtype: callable
+    """
     global UCI_TEST_DATA
     load_data(download(URL, 'uci_housing', MD5))
diff --git a/python/paddle/v2/dataset/wmt14.py b/python/paddle/v2/dataset/wmt14.py
index 23ca8036281b16447403c1bfcec5e11f839ab94e..0902f87741c342b237439081703081b467dc6f35 100644
--- a/python/paddle/v2/dataset/wmt14.py
+++ b/python/paddle/v2/dataset/wmt14.py
@@ -12,7 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
-wmt14 dataset
+WMT14 dataset.
+The original WMT14 dataset is too large, so a shrunken subset is provided
+instead. This module will download the dataset from
+http://paddlepaddle.cdn.bcebos.com/demo/wmt_shrinked_data/wmt14.tgz and
+parse the training set and test set into paddle reader creators.
+
 """
 import tarfile
 import gzip
@@ -99,11 +104,31 @@ def reader_creator(tar_file, file_name, dict_size):
 
 
 def train(dict_size):
+    """
+    WMT14 training set creator.
+
+    It returns a reader creator, each sample in the reader is a source
+    language word ID sequence, a target language word ID sequence and a
+    next-word ID sequence.
+
+    :return: Training reader creator
+    :rtype: callable
+    """
     return reader_creator(
         download(URL_TRAIN, 'wmt14', MD5_TRAIN), 'train/train', dict_size)
 
 
 def test(dict_size):
+    """
+    WMT14 test set creator.
+
+    It returns a reader creator, each sample in the reader is a source
+    language word ID sequence, a target language word ID sequence and a
+    next-word ID sequence.
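+
+    A minimal usage sketch (the dictionary size 30000 is illustrative):
+
+    .. code-block:: python
+
+        import paddle.v2.dataset.wmt14 as wmt14
+
+        for src_ids, trg_ids, trg_ids_next in wmt14.test(30000)():
+            pass  # three parallel word ID sequences per sample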
+
+    :return: Test reader creator
+    :rtype: callable
+    """
     return reader_creator(
         download(URL_TRAIN, 'wmt14', MD5_TRAIN), 'test/test', dict_size)
diff --git a/python/paddle/v2/event.py b/python/paddle/v2/event.py
index 1ad52b8baa411269d29732685871a875df5185cc..fd6050fa339d280ad54e40128ea6bae25132c873 100644
--- a/python/paddle/v2/event.py
+++ b/python/paddle/v2/event.py
@@ -1,14 +1,13 @@
 """
-All training events.
+Testing and training events.
 
 There are:
 
+* TestResult
 * BeginIteration
 * EndIteration
 * BeginPass
 * EndPass
-
-TODO(yuyang18): Complete it!
 """
 import py_paddle.swig_paddle as api
diff --git a/python/paddle/v2/inference.py b/python/paddle/v2/inference.py
index 2860f18e195d9ce05a23e2bb1c24ba8924348caa..c178336303f53769863063922868cd2a22e4b957 100644
--- a/python/paddle/v2/inference.py
+++ b/python/paddle/v2/inference.py
@@ -9,6 +9,17 @@ __all__ = ['infer']
 
 
 class Inference(object):
+    """
+    Inference combines a neural network output and parameters together
+    to do inference.
+
+    :param output_layer: The neural network output to run inference on.
+    :type output_layer: paddle.v2.config_base.Layer or a sequence
+                        of paddle.v2.config_base.Layer
+    :param parameters: The parameters dictionary.
+    :type parameters: paddle.v2.parameters.Parameters
+    """
+
     def __init__(self, output_layer, parameters):
         topo = topology.Topology(output_layer)
         gm = api.GradientMachine.createFromConfigProto(
@@ -49,7 +60,7 @@ class Inference(object):
         retv = None
         for result in self.iter_infer_field(field=field, **kwargs):
             if retv is None:
-                retv = [[]] * len(result)
+                retv = [[] for i in xrange(len(result))]
             for i, item in enumerate(result):
                 retv[i].append(item)
         retv = [numpy.concatenate(out) for out in retv]
diff --git a/python/paddle/v2/optimizer.py b/python/paddle/v2/optimizer.py
index 1a01d95c205c0626374e1814a170ce2d58f23a60..feefd7d758ba09f5d8f818ca1b12b00c5f0e9797 100644
--- a/python/paddle/v2/optimizer.py
+++ b/python/paddle/v2/optimizer.py
@@ -47,6 +47,35 @@ class Optimizer(object):
 
 
 class Momentum(Optimizer):
+    """
+    SGD Optimizer.
+
+    SGD is an optimization method that tries to find a neural network that
+    minimizes its "cost/error" by iteration. In paddle's implementation the
+    SGD Optimizer is synchronized: all gradients are computed and reduced
+    into one gradient before the optimization step is applied.
+
+    The neural network considers the learning problem of minimizing an
+    objective function that has the form of a sum
+
+    .. math::
+
+        Q(w) = \\sum_{i}^{n} Q_i(w)
+
+    The value of the function Q is often the cost of the neural network (for
+    example, the mean square error between prediction and label). The
+    function Q is parametrised by w, the weights/biases of the neural
+    network, and the weights are what is to be learned; i is the i-th
+    observation in the (training) data.
+
+    So, the SGD method optimizes the weights by
+
+    .. math::
+
+        w = w - \\eta \\nabla Q(w) = w - \\eta \\sum_{i}^{n} \\nabla Q_i(w)
+
+    where :math:`\\eta` is the learning rate and :math:`n` is the batch size.
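+
+    For illustration, a sketch of constructing this optimizer (the
+    hyper-parameter values are arbitrary):
+
+    .. code-block:: python
+
+        import paddle.v2 as paddle
+
+        optimizer = paddle.optimizer.Momentum(
+            momentum=0.9, learning_rate=1e-3)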
+    """
+
     def __init__(self, momentum=None, sparse=False, **kwargs):
         learning_method = v1_optimizers.MomentumOptimizer(
             momentum=momentum, sparse=sparse)
@@ -55,6 +84,26 @@
 
 
 class Adam(Optimizer):
+    """
+    Adam optimizer.
+
+    For details, please refer to `Adam: A Method for Stochastic Optimization
+    <https://arxiv.org/abs/1412.6980>`_.
+
+    .. math::
+
+        m(w, t) & = \\beta_1 m(w, t-1) + (1 - \\beta_1) \\nabla Q_i(w) \\\\
+        v(w, t) & = \\beta_2 v(w, t-1) + (1 - \\beta_2)(\\nabla Q_i(w)) ^2 \\\\
+        w & = w - \\frac{\\eta m(w, t)}{\\sqrt{v(w,t) + \\epsilon}}
+
+    :param beta1: the :math:`\\beta_1` in the equation.
+    :type beta1: float
+    :param beta2: the :math:`\\beta_2` in the equation.
+    :type beta2: float
+    :param epsilon: the :math:`\\epsilon` in the equation. It is used to
+                    prevent division by zero.
+    :type epsilon: float
+    """
+
     def __init__(self, beta1=0.9, beta2=0.999, epsilon=1e-8, **kwargs):
         learning_method = v1_optimizers.AdamOptimizer(
             beta1=beta1, beta2=beta2, epsilon=epsilon)
@@ -62,6 +111,24 @@
 
 
 class Adamax(Optimizer):
+    """
+    Adamax optimizer.
+
+    For details, please refer to `Adam: A Method for Stochastic Optimization
+    <https://arxiv.org/abs/1412.6980>`_.
+
+    .. math::
+
+        m_t & = \\beta_1 * m_{t-1} + (1-\\beta_1)* \\nabla Q_i(w) \\\\
+        u_t & = max(\\beta_2*u_{t-1}, abs(\\nabla Q_i(w))) \\\\
+        w_t & = w_{t-1} - (\\eta/(1-\\beta_1^t))*m_t/u_t
+
+    :param beta1: the :math:`\\beta_1` in the equation.
+    :type beta1: float
+    :param beta2: the :math:`\\beta_2` in the equation.
+    :type beta2: float
+    """
+
     def __init__(self, beta1=0.9, beta2=0.999, **kwargs):
         learning_method = v1_optimizers.AdamaxOptimizer(
             beta1=beta1, beta2=beta2)
@@ -69,12 +136,40 @@
 
 
 class AdaGrad(Optimizer):
+    """
+    AdaGrad (for ADAptive GRAdient algorithm) optimizer.
+
+    For details, please refer to `Adaptive Subgradient Methods for
+    Online Learning and Stochastic Optimization
+    <http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf>`_.
+
+    .. math::
+
+        G &= \\sum_{\\tau=1}^{t} g_{\\tau} g_{\\tau}^T \\\\
+        w & = w - \\eta diag(G)^{-\\frac{1}{2}} \\circ g
+    """
+
     def __init__(self, **kwargs):
         learning_method = v1_optimizers.AdaGradOptimizer()
         super(AdaGrad, self).__init__(learning_method=learning_method, **kwargs)
 
 
 class DecayedAdaGrad(Optimizer):
+    """
+    AdaGrad method with decayed sum gradients. The equations of this method
+    are as follows.
+
+    .. math::
+
+        E(g_t^2) &= \\rho * E(g_{t-1}^2) + (1-\\rho) * g^2 \\\\
+        learning\\_rate &= 1/sqrt( E(g_t^2) + \\epsilon )
+
+    :param rho: the :math:`\\rho` parameter in that equation
+    :type rho: float
+    :param epsilon: the :math:`\\epsilon` parameter in that equation.
+    :type epsilon: float
+    """
+
     def __init__(self, rho=0.95, epsilon=1e-06, **kwargs):
         learning_method = v1_optimizers.DecayedAdaGradOptimizer(
             rho=rho, epsilon=epsilon)
@@ -83,6 +178,24 @@
 
 
 class AdaDelta(Optimizer):
+    """
+    AdaDelta method. For details, please refer to
+    `ADADELTA: AN ADAPTIVE LEARNING RATE METHOD
+    <https://arxiv.org/abs/1212.5701>`_.
+
+    .. math::
+
+        E(g_t^2) &= \\rho * E(g_{t-1}^2) + (1-\\rho) * g^2 \\\\
+        learning\\_rate &= sqrt( ( E(dx_{t-1}^2) + \\epsilon ) / ( \\
+                          E(g_t^2) + \\epsilon ) ) \\\\
+        E(dx_t^2) &= \\rho * E(dx_{t-1}^2) + (1-\\rho) * (-g*learning\\_rate)^2
+
+    :param rho: the :math:`\\rho` in the equation
+    :type rho: float
+    :param epsilon: the :math:`\\epsilon` in the equation
+    :type epsilon: float
+    """
+
     def __init__(self, rho=0.95, epsilon=1e-06, **kwargs):
         learning_method = v1_optimizers.AdaDeltaOptimizer(
             rho=rho, epsilon=epsilon)
@@ -91,6 +204,24 @@
 
 
 class RMSProp(Optimizer):
+    """
+    RMSProp (for Root Mean Square Propagation) optimizer. For details, please
+    refer to this `slide
+    <http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf>`_.
+
+    The equations of this method are as follows:
+
+    .. math::
+
+        v(w, t) & = \\rho v(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2 \\\\
+        w & = w - \\frac{\\eta} {\\sqrt{v(w,t) + \\epsilon}} \\nabla Q_{i}(w)
+
+    :param rho: the :math:`\\rho` in the equation; the forgetting factor.
+    :type rho: float
+    :param epsilon: the :math:`\\epsilon` in the equation.
+    :type epsilon: float
+    """
+
     def __init__(self, rho=0.95, epsilon=1e-6, **kwargs):
         learning_method = v1_optimizers.RMSPropOptimizer(
             rho=rho, epsilon=epsilon)
diff --git a/python/paddle/v2/trainer.py b/python/paddle/v2/trainer.py
index f5797a86c2b71502a7791453ff86c6a486c9f185..68b4967cc031dfa2dd164d822aff97585f923e48 100644
--- a/python/paddle/v2/trainer.py
+++ b/python/paddle/v2/trainer.py
@@ -1,3 +1,6 @@
+"""
+Trainer module.
+"""
 import collections
 
 import py_paddle.swig_paddle as api
@@ -9,10 +12,6 @@ from . import optimizer as v2_optimizer
 from . import parameters as v2_parameters
 
 __all__ = ['SGD']
-"""
-Trainer package
-TODO(yuyang18): Complete comments.
-"""
 
 
 def default_event_handler(event):
@@ -29,7 +28,8 @@ def default_event_handler(event):
 class SGD(object):
     """
     Simple SGD Trainer.
-    TODO(yuyang18): Complete comments
+    SGD Trainer combines a data reader, a network topology and the
+    update_equation together to train/test a neural network.
 
     :param update_equation: The optimizer object.
     :type update_equation: paddle.v2.optimizer.Optimizer
@@ -74,7 +74,9 @@ class SGD(object):
         """
         Training method. Will train num_passes of input data.
 
-        :param reader:
+        :param reader: A reader that reads and yields data items. Usually we
+                       use a batched reader to do mini-batch training.
+        :type reader: collections.Iterable
         :param num_passes: The total train passes.
         :param event_handler: Event handler. A method will be invoked when event
                               occurred.
@@ -132,6 +134,16 @@ class SGD(object):
         self.__gradient_machine__.finish()
 
     def test(self, reader, feeding=None):
+        """
+        Testing method. Will test input data.
+
+        :param reader: A reader that reads and yields data items.
+        :type reader: collections.Iterable
+        :param feeding: Feeding is a map of neural network input name and array
+                        index that reader returns.
+        :type feeding: dict
+        """
         feeder = DataFeeder(self.__data_types__, feeding)
         evaluator = self.__gradient_machine__.makeEvaluator()
         out_args = api.Arguments.createArguments(0)