diff --git a/CMakeLists.txt b/CMakeLists.txt index 99c6c0d373052fa1be528ebb82c3d2f248e64bb0..92c866da8fc7c711fa0e983d4d31c9b0485ae760 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,7 +3,7 @@ cmake_minimum_required(VERSION 2.8) project(paddle CXX C) set(PADDLE_MAJOR_VERSION 0) set(PADDLE_MINOR_VERSION 8) -set(PADDLE_PATCH_VERSION 0b0) +set(PADDLE_PATCH_VERSION 0b1) set(PADDLE_VERSION ${PADDLE_MAJOR_VERSION}.${PADDLE_MINOR_VERSION}.${PADDLE_PATCH_VERSION}) set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake") @@ -15,7 +15,7 @@ find_package(Protobuf REQUIRED) find_package(PythonLibs 2.7 REQUIRED) find_package(PythonInterp 2.7 REQUIRED) find_package(ZLIB REQUIRED) -find_package(NumPy) +find_package(NumPy REQUIRED) find_package(Threads REQUIRED) find_package(Glog) find_package(Gflags QUIET) diff --git a/cmake/util.cmake b/cmake/util.cmake index 5f2f4a075cc579fac827fefbfc30f6743d2e4cc9..4e9efd3c187b0979dc042371b8ba5f256a484a9c 100644 --- a/cmake/util.cmake +++ b/cmake/util.cmake @@ -104,10 +104,9 @@ function(link_paddle_exe TARGET_NAME) ${PROTOBUF_LIBRARY} ${CMAKE_THREAD_LIBS_INIT} ${CBLAS_LIBS} - ${INTERAL_LIBS} ${ZLIB_LIBRARIES} - ${CMAKE_DL_LIBS} - ) + ${INTERAL_LIBS} + ${CMAKE_DL_LIBS}) if(WITH_PYTHON) target_link_libraries(${TARGET_NAME} diff --git a/demo/seqToseq/seqToseq_net.py b/demo/seqToseq/seqToseq_net.py index 479a64fa00d5fa5dd34220500251990b0baa1500..a9c0dd4af600c6a08b65f5f7f955380804deef3e 100644 --- a/demo/seqToseq/seqToseq_net.py +++ b/demo/seqToseq/seqToseq_net.py @@ -128,12 +128,16 @@ def gru_encoder_decoder(data_conf, return out decoder_group_name = "decoder_group" + group_inputs=[StaticInput(input=encoded_vector,is_seq=True), + StaticInput(input=encoded_proj,is_seq=True)] + if not is_generating: trg_embedding = embedding_layer( input=data_layer(name='target_language_word', size=target_dict_dim), size=word_vector_dim, param_attr=ParamAttr(name='_target_language_embedding')) + group_inputs.append(trg_embedding) # For decoder equipped with attention mechanism, in training, # target embedding (the ground truth) is the data input, @@ -142,22 +146,13 @@ def gru_encoder_decoder(data_conf, # for the recurrent_group. decoder = recurrent_group(name=decoder_group_name, step=gru_decoder_with_attention, - input=[ - StaticInput(input=encoded_vector, - is_seq=True), - StaticInput(input=encoded_proj, - is_seq=True), trg_embedding - ]) + input=group_inputs) lbl = data_layer(name='target_language_next_word', size=target_dict_dim) - cost = classification_cost(input=decoder, label=lbl, ) + cost = classification_cost(input=decoder, label=lbl) outputs(cost) else: - gen_inputs = [StaticInput(input=encoded_vector, - is_seq=True), - StaticInput(input=encoded_proj, - is_seq=True), ] # In generation, the decoder predicts a next target word based on # the encoded source sequence and the last generated target word.
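# (Added note, not part of the original config: both branches reuse the same
# group_inputs -- training appends the ground-truth trg_embedding, while the
# generation branch below appends a GeneratedInput that feeds back the last
# predicted word -- so recurrent_group and beam_search receive identical
# static encoder inputs.)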
@@ -171,10 +166,11 @@ def gru_encoder_decoder(data_conf, size=target_dict_dim, embedding_name='_target_language_embedding', embedding_size=word_vector_dim) - gen_inputs.append(trg_embedding) + group_inputs.append(trg_embedding) + beam_gen = beam_search(name=decoder_group_name, step=gru_decoder_with_attention, - input=gen_inputs, + input=group_inputs, id_input=data_layer(name="sent_id", size=1), dict_file=trg_dict_path, diff --git a/doc/build/contribute_to_paddle.md b/doc/build/contribute_to_paddle.md index 10d5d86311333c223d1024f520fccddcb4c5050d..06fcff61720755432c5618500ac509c5b3f867df 100644 --- a/doc/build/contribute_to_paddle.md +++ b/doc/build/contribute_to_paddle.md @@ -25,9 +25,12 @@ repo or just head straight to the command line: ```shell # Clone your fork to your local machine -git clone git@github.com:USERNAME/Paddle.git +git clone https://github.com/USERNAME/Paddle.git +``` +Then you can start to develop by making a local development branch: +```shell +git checkout -b MY_COOL_STUFF_BRANCH origin/master ``` -Then you can start to develop. ## Commit @@ -45,7 +48,7 @@ are the details if any. ## Keeping Fork Up to Date -Before pull your request, you shold sync you code from the latest PaddlePaddle. +Before issuing your pull request, you should sync your code with the latest PaddlePaddle. To do this, you'll need to add a remote at first: ```shell @@ -60,8 +63,7 @@ git remote -v Update your fork with the latest upstream changes: ```shell -git fetch upstream -git pull upstream master +git pull --rebase upstream HEAD ``` If there are no unique commits locally, git will simply perform a fast-forward. @@ -74,10 +76,26 @@ Now, your local master branch is up-to-date with everything modified upstream. ```shell # push to your repository in Github -git push origin master +git push origin HEAD ``` ## Pull Request Go to the page for your fork on GitHub, select your development branch, and click the **pull request button**. + +## Update your pull request with the latest version + +During the code review, your pull request may become stale because of new commits in +baidu/Paddle. GitHub allows an automatic update if there is no conflict. You can do this +by clicking the "Update Branch" button on your pull request page. However, in the case +of a conflict, you need to do the update manually on +your local repository: +```shell +git checkout MY_COOL_STUFF_BRANCH +git pull --rebase upstream HEAD +# You may need to resolve the conflict according to the git prompt. +# Make and test your code. +git push -f origin HEAD +``` +Now your Pull Request is updated with the latest version. diff --git a/doc/ui/api/trainer_config_helpers/activations.rst b/doc/ui/api/trainer_config_helpers/activations.rst index 294f6e4d3127e4324a30f5e7c2f2be27db639e8f..c4e14ed779efb6f6601d2c5fa41764f318c82848 100644 --- a/doc/ui/api/trainer_config_helpers/activations.rst +++ b/doc/ui/api/trainer_config_helpers/activations.rst @@ -12,6 +12,13 @@ AbsActivation :members: AbsActivation :noindex: +ExpActivation +=============== + +.. automodule:: paddle.trainer_config_helpers.activations + :members: ExpActivation + :noindex: +
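A minimal usage sketch (mirroring how sample_trainer_rnn_gen.conf later in this change uses the new activation; the layer and parameter names here are illustrative only):

.. code-block:: python

    with mixed_layer(size=num_words, act=ExpActivation()) as out:
        out += trans_full_matrix_projection(input=state,
                                            param_attr=ParamAttr(name="wordvec"))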
IdentityActivation ================== diff --git a/doc/ui/data_provider/pydataprovider2.rst b/doc/ui/data_provider/pydataprovider2.rst index 152f8a6df6634c6292b4f219f216881c7024f4e4..e105d3be308705d228c0b188e15742a0f7325ab6 100644 --- a/doc/ui/data_provider/pydataprovider2.rst +++ b/doc/ui/data_provider/pydataprovider2.rst @@ -24,7 +24,7 @@ A small part of the original data as an example is shown below: .. literalinclude:: ../../../doc_cn/ui/data_provider/mnist_train.txt -Each line of the data contains two parts, separated by ';'. The first part is +Each line of the data contains two parts, separated by :code:`;`. The first part is the label of an image. The second part contains 28x28 pixel float values. Just write the path of the above data into train.list. It looks like this: @@ -74,7 +74,20 @@ you can take this as an example. .. literalinclude:: ../../../doc_cn/ui/data_provider/mnist_config.py -Here we specify training data by 'train.list', and no testing data is specified. +Here we specify training data by :code:`train.list`, and no testing data is specified. +The method that actually provides the data is :code:`process`. + +Users can also use another style to provide data, which defines the +:code:`data_layer`'s name explicitly on `yield`. For example, +such a :code:`dataprovider` is shown below. + +.. literalinclude:: ../../../doc_cn/ui/data_provider/mnist_provider.dict.py + :linenos: + +If the user does not give the :code:`data_layer`'s name, PaddlePaddle will +roughly use the order of :code:`data_layer` definitions to determine which feature goes to +which :code:`data_layer`. This order may not be correct, so TO DEFINE THE +:code:`data_layer`'s NAMES EXPLICITLY IS THE RECOMMENDED WAY TO PROVIDE DATA. Now, this simple example of using PyDataProvider is finished. The only thing that the user should know is how to generate **one sample** from @@ -93,7 +106,7 @@ DataProvider for the sequential model ------------------------------------- A sequence model takes sequences as its input. A sequence is made up of several timesteps. The so-called timestep does not necessarily have anything to do -with 'time'. It can also be explained to that the order of data are taken into +with time. It can also be understood to mean that the order of the data is taken into consideration in model design and training. For example, the sentence can be interpreted as a kind of sequence data in NLP tasks. @@ -155,23 +168,7 @@ Reference @provider +++++++++ -'@provider' is a Python `Decorator`_, it can construct a PyDataProvider in -PaddlePaddle from a user defined function. Its parameters are: - -* `input_types`_ defines format of the data input. -* should_shuffle defines whether to shuffle data or not. By default, it is set - true during training, and false during testing. -* pool_size is the memory pool size (in sample number) in DataProvider. - -1 means no limit. -* can_over_batch_size defines whether PaddlePaddle can store little more - samples than pool_size. It is better to set True to avoid some deadlocks. -* calc_batch_size is a function define how to calculate batch size. This is - usefull in sequential model, that defines batch size is counted upon sequence - or token. By default, each sample or sequence counts to 1 when calculating - batch size. -* cache is a data cache strategy, see `cache`_. -* Init_hook function is invoked once the data provider is initialized, - see `init_hook`_. +.. autofunction:: paddle.trainer.PyDataProvider2.provider
input_types +++++++++++ diff --git a/doc_cn/ui/data_provider/mnist_config.py b/doc_cn/ui/data_provider/mnist_config.py index 0f9094cd2776fc36490b8314a760820251d4cc64..7ba344338c374a7f9e7e4faa804e2e124577c0be 100644 --- a/doc_cn/ui/data_provider/mnist_config.py +++ b/doc_cn/ui/data_provider/mnist_config.py @@ -4,3 +4,5 @@ define_py_data_sources2(train_list='train.list', test_list=None, module='mnist_provider', obj='process') +img = data_layer(name='pixel', size=784) +label = data_layer(name='label', size=10) diff --git a/doc_cn/ui/data_provider/mnist_provider.dict.py b/doc_cn/ui/data_provider/mnist_provider.dict.py new file mode 100644 index 0000000000000000000000000000000000000000..4eab5b1fd3b50a67a9cfee92883cce71ee1a2c87 --- /dev/null +++ b/doc_cn/ui/data_provider/mnist_provider.dict.py @@ -0,0 +1,25 @@ +from paddle.trainer.PyDataProvider2 import * + + +# Define a py data provider +@provider(input_types=[ + dense_vector(28 * 28), + integer_value(10) +]) +def process(settings, filename): # settings is not used currently. + f = open(filename, 'r') # open one of the training files + + for line in f: # read each line + label, pixel = line.split(';') + + # get features and label + pixels_str = pixel.split(' ') + + pixels_float = [] + for each_pixel_str in pixels_str: + pixels_float.append(float(each_pixel_str)) + + # give data to paddle. + yield { "pixel": pixels_float, 'label': int(label) } + + f.close() # close file diff --git a/doc_cn/ui/data_provider/pydataprovider2.rst b/doc_cn/ui/data_provider/pydataprovider2.rst index e743e4168821ff4713ddb015d03586ce82da4969..9e1d8c531f5ba2101d0f4d9506361e058b168181 100644 --- a/doc_cn/ui/data_provider/pydataprovider2.rst +++ b/doc_cn/ui/data_provider/pydataprovider2.rst @@ -56,6 +56,14 @@ just call :code:`yield` multiple times in the process function; :code:`yield` is one of Python's This states that the training data is 'train.list' and that there is no testing data. The referenced DataProvider is the 'process' function in the 'mnist_provider' module. +Meanwhile, based on the names of the :code:`data_layer` s in the model configuration file, users can also explicitly specify the correspondence of the returned data. For example: + +..
literalinclude:: mnist_provider.dict.py + :linenos: + +If the user does not specify the correspondence of the returned data, PaddlePaddle will roughly determine +the correspondence from the declaration order of the layers. This correspondence may be incorrect, so explicitly specifying the mapping between returned values and data is recommended. + At this point, the simple PyDataProvider example is fully explained. For the user, sending data to PaddlePaddle only requires knowing how to read **one** sample from **one file**, while the PaddlePaddle process helps the user with @@ -119,11 +127,13 @@ executed when the DataProvider is created. This initialization function has the following parameters: @provider +++++++++ -'@provider' is a Python `Decorator`_ that can mark a function as a PyDataProvider. Its parameters include: +:code:`@provider` is a Python `Decorator`_ that can mark a function as a PyDataProvider. Its parameters include: * `input_types`_ is the input data format. For the available formats, see `input_types`_. * should_shuffle controls whether this DataProvider shuffles. If not set, it shuffles by default during training, - and does not shuffle by default during testing + and does not shuffle by default during testing. +* min_pool_size sets the minimum number of samples the DataProvider buffers in memory. This is also the shuffle granularity PaddlePaddle can guarantee. + Set to -1, it reads all data into memory in advance. * pool_size sets the number of samples the DataProvider buffers in memory. Set to -1, it does not limit how many samples are buffered. * can_over_batch_size indicates whether Paddle may buffer slightly more than pool_size samples. Doing so avoids many deadlock problems. Setting it to True is generally recommended @@ -131,6 +141,11 @@ executed when the DataProvider is created. This initialization function has the following parameters: counts as one batch size, but sometimes, for balanced computation, one sample can be set to count as multiple batch sizes * cache is the data caching strategy; see `cache`_ * init_hook is the function called at initialization time; see `init_hook`_ +* use_dynamic_order: if true, a dict may be returned whose keys are data_layer names and whose values are the feature values; a list or tuple may + also be returned. If false, only a list or tuple can be returned +* check: if set to true, the validity of the data is checked against input_types. +* check_fail_continue: if set to true, a sample that fails the check is discarded and training continues. It has + no effect if check is false. input_types +++++++++++ @@ -190,3 +205,55 @@ DataProvider offers two simple cache strategies. They are * CacheType.NO_CACHE caches no data; data is read from the Python side every time * CacheType.CACHE_PASS_IN_MEM reads data from the Python side during the first pass; the remaining passes read the data directly from memory. + + +Notes +----- + +Potential memory leaks ++++++++++++++++++++++++ + +PaddlePaddle passes every line of train.list to the process function, thereby creating multiple generators. +That is, if train.list contains 100 training files, 100 generators are created. This by itself is not a very +serious problem. + +However, if at training time every single training sample is its own file, and there are very many samples, then +many generators are created. A generator that has not been called yet takes up almost no memory, but once it has +been called it stores its current context (Context), and this Context can be very +large. Moreover, a generator has to be called at least twice before it knows whether to stop. So even if process +contains only a single yield, the same generator has to be randomly selected twice before that memory is released. + +.. code-block:: python + + def func(): + yield 0 + + f = func() # create the generator + tmp = next(f) # first call, returns 0 + tmp = next(f) # only the second call raises StopIteration + +If the generators are called in order, however, this problem does not occur. + +So the recommended best practice is not to put every sample into train.list, but to put the sample addresses into another text +file and write that text file's address into train.list. Alternatively, keep +as few variable references as possible in the Python generator's context. For example + +.. code-block:: python + + def real_process(fn): + # ...
read from fn + return result # when the function returns, Python can release the references to the internal variables. + + def process(fn): + yield real_process(fn) + +This problem comes from the logic with which PyDataProvider reads data, and it basically cannot be fixed wholesale. + + +Running out of memory ++++++++++++++++++++++ + +PyDataProvider2 uses as much memory as it can. So on machines with relatively little memory, setting the +:code:`pool_size` variable is recommended. This variable should be larger than the training batch size, and, +as long as memory allows, the larger the better. + diff --git a/paddle/gserver/dataproviders/DataProvider.cpp b/paddle/gserver/dataproviders/DataProvider.cpp index ba05b70fe9a3de486c1b568bd24f1fcca8a67389..c3b4769f7612b76f5c467fee66826f0e84a6e787 100644 --- a/paddle/gserver/dataproviders/DataProvider.cpp +++ b/paddle/gserver/dataproviders/DataProvider.cpp @@ -149,9 +149,13 @@ void DoubleBuffer::startAsyncLoad() { taskReadySem_.post(); } -ClassRegistrar<DataProvider, DataConfig, bool> DataProvider::registrar_; -DataProvider* DataProvider::create(const DataConfig& config, bool useGpu) { - return registrar_.createByType(config.type(), config, useGpu); +ClassRegistrar<DataProvider, DataConfig, ModelConfig, bool> +DataProvider::registrar_; + +DataProvider* DataProvider::create(const DataConfig& config, + const ModelConfig& modelConfig, + bool useGpu) { + return registrar_.createByType(config.type(), config, modelConfig, useGpu); } REGISTER_DATA_PROVIDER(simple, SimpleDataProvider); diff --git a/paddle/gserver/dataproviders/DataProvider.h b/paddle/gserver/dataproviders/DataProvider.h index aab5d93fcaa1e7286db7c2aeb60c6d10695a5ced..534491d70d546734f2197de5b04a85a56d00d732 100644 --- a/paddle/gserver/dataproviders/DataProvider.h +++ b/paddle/gserver/dataproviders/DataProvider.h @@ -39,15 +39,30 @@ limitations under the License. */ #include "paddle/parameter/Argument.h" namespace paddle { - /** * @def REGISTER_DATA_PROVIDER - * @brief Macro for registering a data provider + * @brief Macro for registering a data provider. The class type should contain + * a constructor with parameter (DataConfig, bool). */ -#define REGISTER_DATA_PROVIDER(__type_name, __class_name) \ - static InitFunction __reg_type_##__type_name([]() { \ - DataProvider::registrar_.registerClass<__class_name>(#__type_name); \ - }) +#define REGISTER_DATA_PROVIDER(__type_name, __class_name)\ + static InitFunction __reg_type_##__type_name([]() {\ + DataProvider::registrar_.registerClass(\ + #__type_name, \ + [](DataConfig conf, ModelConfig, bool useGpu) -> DataProvider* { \ + DataProvider* dp = new __class_name (conf, useGpu);\ + return dp;\ + });\ +}) + +/** + * @def REGISTER_DATA_PROVIDER_EX + * @brief Macro for registering a data provider whose class contains a constructor + * with parameter (DataConfig, ModelConfig, bool). + */ +#define REGISTER_DATA_PROVIDER_EX(__type_name, __class_name) \ + static InitFunction __reg_type_##__type_name([] { \ + DataProvider::registrar_.registerClass<__class_name>(#__type_name); \ +}) class DataBatch; class BufferBatch; @@ -285,10 +300,18 @@ protected: */ class DataProvider { public: - static ClassRegistrar<DataProvider, DataConfig, bool> registrar_; + static ClassRegistrar<DataProvider, DataConfig, ModelConfig, bool> registrar_; static DataProvider* create(const DataConfig& config, + const ModelConfig& modelConfig, bool useGpu = FLAGS_use_gpu); + /** + * @brief This create overload is only used for unit tests. + */ + inline static DataProvider* create(const DataConfig &config, bool useGpu) { + return create(config, ModelConfig(), useGpu); + }
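// (Added note: DataProvider::create now threads the ModelConfig through to
// providers. REGISTER_DATA_PROVIDER adapts old (DataConfig, bool) constructors
// with a lambda that ignores the ModelConfig, while REGISTER_DATA_PROVIDER_EX
// registers classes whose constructor takes (DataConfig, ModelConfig, bool)
// directly.)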
DataProvider(const DataConfig& config, bool useGpu) : config_(config), skipShuffle_(false), @@ -336,13 +359,13 @@ public: * @note return -1 to indicate unlimited number of samples. */ virtual int64_t getSize() = 0; + /** * @brief Get next batch training samples internally * @param[in] size size of training samples to get * @param[out] batch a batch of training samples * @return actual size of obtained training samples */ - virtual int64_t getNextBatchInternal(int64_t size, DataBatch* batch) = 0; protected: diff --git a/paddle/gserver/dataproviders/MultiDataProvider.cpp b/paddle/gserver/dataproviders/MultiDataProvider.cpp index c3d14a7069bd3dc240e343ab9b11e17d35065269..8e4f53978a0451f3bb6cd5da30f017708448f9ac 100644 --- a/paddle/gserver/dataproviders/MultiDataProvider.cpp +++ b/paddle/gserver/dataproviders/MultiDataProvider.cpp @@ -22,7 +22,9 @@ namespace paddle { using namespace std; -MultiDataProvider::MultiDataProvider(const DataConfig& config, bool useGpu) +MultiDataProvider::MultiDataProvider(const DataConfig& config, + const ModelConfig& modelConfig, + bool useGpu) : DataProvider(config, useGpu) { bool atLeastOneMainDataFlag = false; totalDataRatio_ = 0; @@ -58,7 +60,9 @@ MultiDataProvider::MultiDataProvider(const DataConfig& config, bool useGpu) subConfig.set_async_load_data(false); } subDataProviders_[i] = - std::unique_ptr<DataProvider>(DataProvider::create(subConfig, useGpu_)); + std::unique_ptr<DataProvider>(DataProvider::create(subConfig, + modelConfig, + useGpu_)); } } @@ -116,6 +120,6 @@ int64_t MultiDataProvider::getNextBatchInternal(int64_t size, return batch->getSize(); } -REGISTER_DATA_PROVIDER(multi, MultiDataProvider); +REGISTER_DATA_PROVIDER_EX(multi, MultiDataProvider); } // namespace paddle diff --git a/paddle/gserver/dataproviders/MultiDataProvider.h b/paddle/gserver/dataproviders/MultiDataProvider.h index 714421286376b4dc1c1485e0264540cfe38d8f65..b498ba6516c4320566b1b3cc2bd557ae016d7c39 100644 --- a/paddle/gserver/dataproviders/MultiDataProvider.h +++ b/paddle/gserver/dataproviders/MultiDataProvider.h @@ -24,7 +24,9 @@ protected: std::vector<std::unique_ptr<DataProvider>> subDataProviders_; public: - MultiDataProvider(const DataConfig& config, bool useGpu); + MultiDataProvider(const DataConfig& config, + const ModelConfig& modelConfig, + bool useGpu); ~MultiDataProvider() {} virtual void reset(); virtual void shuffle(); diff --git a/paddle/gserver/dataproviders/PyDataProvider2.cpp b/paddle/gserver/dataproviders/PyDataProvider2.cpp index 8e51752dc29ee317dd268d6b7d444958e75ef88e..0b41f6a02aecc6fe8dd3d305db3f1108191c08a9 100644 --- a/paddle/gserver/dataproviders/PyDataProvider2.cpp +++ b/paddle/gserver/dataproviders/PyDataProvider2.cpp @@ -24,6 +24,27 @@ limitations under the License. */ namespace paddle { +namespace unittest { + +static std::unique_ptr<std::function<void(size_t)>> + OnPoolFilled; + +namespace pydp2 { + +void setOnPoolFilledHook(const std::function<void(size_t)>& callback) { + OnPoolFilled.reset(new std::function<void(size_t)>()); + *OnPoolFilled = callback; +} + +void clearOnPoolFilledHook() { + OnPoolFilled.reset(); +} + +} // namespace pydp2 +} // namespace unittest + + + /** * Slot type */ @@ -179,6 +200,7 @@ public: * Ctor */ PyDataProvider2(const DataConfig& config, + const ModelConfig& modelConfig, bool useGpu) :DataProvider(config, useGpu), callingContextCreated_(2) { auto& args = config.load_data_args(); @@ -192,6 +214,12 @@ public: py::DictHelper kwargsDict(kwargs); kwargsDict.setBool("is_train", !config.for_test()); + std::vector<std::string> inputs; + inputs.reserve(modelConfig.input_layer_names().size()); + std::copy(modelConfig.input_layer_names().begin(), + modelConfig.input_layer_names().end(), + std::back_inserter(inputs)); + kwargsDict.setStringList("input_order", inputs); // kwargs are the keyword arguments used to create the object.
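// (Added commentary, not part of the original change: input_order is consumed
// by InputOrderWrapper on the Python side; when the user's generator yields a
// dict, the wrapper reorders its values to match these input layer names
// before the data reaches this C++ provider.)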
this->createPyDataObj(config.load_data_module(), @@ -199,7 +227,7 @@ config.load_data_object(), config.files(), std::move(kwargs)); DBG << "Instance " << instance_.get() << " loaded."; - this->readPyFields(); + this->readPyFields(config.for_test()); DBG << "Py Field Done"; } @@ -253,14 +281,28 @@ private: CHECK_PY(instance_) << "Cannot Create instance"; } - void readPyFields() { + void readPyFields(bool testing) { py::ObjectHelper self(this->instance_); - this->skipShuffle_ = !self.getBoolAttr("should_shuffle"); bool ok; + + this->skipShuffle_ = !self.getBoolAttr("should_shuffle", + &ok /*isBoolType*/); + if (!ok) { + this->skipShuffle_ = testing; // shuffle when training, skip shuffling + // when testing. + } + DBG << "Provider Skip Shuffle " << this->skipShuffle_; + this->poolSize_ = self.getIntAttr("pool_size", &ok); if (!ok) { this->poolSize_ = -1UL; } + this->minPoolSize_ = self.getIntAttr("min_pool_size", &ok); + if (!ok) { + this->minPoolSize_ = -1UL; + } + this->minPoolSize_ = std::min(this->poolSize_, this->minPoolSize_); + this->canOverBatchSize_ = self.getBoolAttr("can_over_batch_size"); calcBatchSize_.reset(self.getAttr("calc_batch_size")); @@ -307,7 +349,6 @@ private: } void loadThread() { - callingContexts_.reserve(fileLists_.size()); DBG << "Creating context"; for (auto& filename : fileLists_) { PyGuard g; @@ -332,7 +373,14 @@ private: bool atEnd; data = py::iterNext(callingContexts_[cid], &atEnd); if (atEnd || data == nullptr) { - callingContexts_.erase(callingContexts_.begin() + cid); + if (cid != 0) { + std::swap(callingContexts_[cid], callingContexts_[0]); + cid = 0; + } + { + PyGuard g; + callingContexts_.pop_front(); + } this->pullCV_.notify_all(); continue; } @@ -354,11 +402,7 @@ private: if (this->loadThread_){ // wait poolActualSize < poolSize; std::unique_lock<std::mutex> l(mtx_); pushCV_.wait(l, [this, additionalBatchSize] { - if (this->canOverBatchSize_) { - return this->poolActualSize_ < poolSize_; - } else { - return this->poolActualSize_ + additionalBatchSize < poolSize_; - } + return this->poolActualSize_ < poolSize_; }); } @@ -402,7 +446,7 @@ private: private: std::unique_ptr<std::thread> loadThread_; std::atomic<bool> exit_; - std::vector<PyObjectPtr> callingContexts_; + std::deque<PyObjectPtr> callingContexts_; std::deque<PyObjectPtr> dataPool_; size_t poolActualSize_; std::condition_variable pushCV_; @@ -413,6 +457,7 @@ private: PyObjectPtr instance_; size_t poolSize_; + size_t minPoolSize_; bool canOverBatchSize_; PyObjectPtr calcBatchSize_; PyObjectPtr generator_; @@ -478,8 +523,13 @@ public: // data pool ready. std::unique_lock<std::mutex> l(mtx_); pullCV_.wait(l, [this, &size] { - return this->poolActualSize_ >= size || callingContexts_.empty(); + return this->poolActualSize_ >= std::max(size, this->minPoolSize_) + || callingContexts_.empty(); }); + + if (unittest::OnPoolFilled) { + (*unittest::OnPoolFilled)(this->poolActualSize_); + } } std::deque<PyObjectPtr> data; size_t bsize = 0; @@ -495,7 +545,8 @@ public: std::deque<PyObjectPtr>& pool = *poolPtr; while (bsize < size && !pool.empty()) { - { // move data from pool to data + { + // move data from pool to data std::lock_guard<std::mutex> guard(mtx_); if (skipShuffle_) { size_t i = 0; @@ -505,14 +556,13 @@ public: } else { // when shuffle, use swap to drop only last pool element.
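// (Added note, inferred from this diff: pool is now a std::deque, so the
// randomly picked element is swapped to the front and taken with pop_front;
// this pairs with the push_front below that returns a rejected sample to the
// pool when calc_batch_size would overflow the batch and can_over_batch_size
// is false.)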
size_t i = ThreadLocalRand::rand() % pool.size(); CHECK(pool[i] != nullptr); - if (i != pool.size() - 1) { - std::swap(pool[i], pool.back()); + if (i != 0) { + std::swap(pool[i], pool.front()); } - data.emplace_back(std::move(pool.back())); - pool.pop_back(); + data.emplace_back(std::move(pool.front())); + pool.pop_front(); } - } - { + if (calcBatchSize_) { // custom calc batch size. PyGuard guard; Py_INCREF(data.back().get()); py::CallableHelper calcBatchSize(this->calcBatchSize_); calcBatchSize.getArgs().set(0, data.back()); PyObjectPtr customBatchSize(calcBatchSize()); bool ok; - bsize += py::castInt(customBatchSize.get(), &ok); + size_t tmp = py::castInt(customBatchSize.get(), &ok); CHECK(ok) << "calc_batch_size must return int"; + + if (bsize + tmp > size && !canOverBatchSize_) { + // Put data back. + pool.push_front(std::move(data.back())); + data.pop_back(); + break; + } else { + bsize += tmp; + } } else { bsize += 1; } @@ -598,7 +657,6 @@ public: } else { *batch = cpuBatch; } - return bsize; } }; @@ -606,7 +664,8 @@ public: std::unordered_set<uintptr_t> PyDataProvider2::gModuleClsPtrs_; PyObjectPtr PyDataProvider2::zeroTuple_(PyTuple_New(0)); -REGISTER_DATA_PROVIDER(py2, PyDataProvider2); +REGISTER_DATA_PROVIDER_EX(py2, PyDataProvider2); + /** * Scanner for dense slot. diff --git a/paddle/gserver/layers/CRFLayer.cpp b/paddle/gserver/layers/CRFLayer.cpp index fb0a0ddb3d45bac5339b6eb4a11ba3c01d0bd97f..c1dcad2b5f2a840ba06e8ef9833eee7a6e5e20cb 100644 --- a/paddle/gserver/layers/CRFLayer.cpp +++ b/paddle/gserver/layers/CRFLayer.cpp @@ -31,7 +31,7 @@ bool CRFLayer::init(const LayerMap& layerMap, } // coeff only affects bp, keep consistent with CostLayer - coeff_ = config_.has_coeff() ? config_.coeff() : real(1.0); + coeff_ = config_.coeff(); if (inputLayers_.size() == 3) { weightLayer_ = inputLayers_[2]; } diff --git a/paddle/gserver/layers/CostLayer.cpp b/paddle/gserver/layers/CostLayer.cpp index 0f99aee03200c3834c7c27343f41f77edc5a558e..14ff8510f7b19dc24b7b1ba603485488ddd4979d 100644 --- a/paddle/gserver/layers/CostLayer.cpp +++ b/paddle/gserver/layers/CostLayer.cpp @@ -26,11 +26,7 @@ namespace paddle { bool CostLayer::init(const LayerMap& layerMap, const ParameterMap& parameterMap) { bool ret = Layer::init(layerMap, parameterMap); - if (config_.has_coeff()) { - coeff_ = config_.coeff(); // coeff only affact bp - } else { - coeff_ = real(1.0); - } + coeff_ = config_.coeff(); if (!ret) return ret; CHECK_GE(inputLayers_.size(), 2UL); CHECK_LE(inputLayers_.size(), 3UL); diff --git a/paddle/gserver/tests/rnn_data_provider.py b/paddle/gserver/tests/rnn_data_provider.py index 85a83554c5c3045d144ee0250d2808237eccc9e0..347d5891b906b4b4779764af4e838b5d099b360b 100644 --- a/paddle/gserver/tests/rnn_data_provider.py +++ b/paddle/gserver/tests/rnn_data_provider.py @@ -19,14 +19,18 @@ data = [ [[[0, 2], [2, 5], [0, 1, 2]], 1], ] + @provider(input_types=[integer_value_sub_sequence(10), - integer_value(2)]) + integer_value(2)], + should_shuffle=False) def process_subseq(settings, file_name): for d in data: yield d + @provider(input_types=[integer_value_sequence(10), - integer_value(2)]) + integer_value(2)], + should_shuffle=False) def process_seq(settings, file_name): for d in data: seq = [] diff --git a/paddle/gserver/tests/sequenceGen.py b/paddle/gserver/tests/sequenceGen.py index cb83d79d78cc677d5ffeb77f5693d08da2a51668..cbed1f15fc4157ea29bddf5ba410d5e05271e04c 100644 --- a/paddle/gserver/tests/sequenceGen.py +++ b/paddle/gserver/tests/sequenceGen.py @@ -17,22 +17,26 @@ import sys from paddle.trainer.PyDataProvider2 import * + def
hook(settings, dict_file, **kwargs): settings.word_dict = dict_file - settings.input_types = [integer_value_sequence(len(settings.word_dict)), + settings.input_types = [integer_value_sequence(len(settings.word_dict)), integer_value_sequence(3)] settings.logger.info('dict len : %d' % (len(settings.word_dict))) -@provider(init_hook=hook) + +@provider(init_hook=hook, should_shuffle=False) def process(settings, file_name): with open(file_name, 'r') as fdata: for line in fdata: label, comment = line.strip().split('\t') label = int(''.join(label.split())) words = comment.split() - word_slot = [settings.word_dict[w] for w in words if w in settings.word_dict] + word_slot = [settings.word_dict[w] for w in words if + w in settings.word_dict] yield word_slot, [label] + ## for hierarchical sequence network def hook2(settings, dict_file, **kwargs): settings.word_dict = dict_file @@ -40,17 +44,19 @@ def hook2(settings, dict_file, **kwargs): integer_value_sub_sequence(3)] settings.logger.info('dict len : %d' % (len(settings.word_dict))) -@provider(init_hook=hook2) + +@provider(init_hook=hook2, should_shuffle=False) def process2(settings, file_name): with open(file_name) as fdata: label_list = [] word_slot_list = [] for line in fdata: if (len(line)) > 1: - label,comment = line.strip().split('\t') + label, comment = line.strip().split('\t') label = int(''.join(label.split())) words = comment.split() - word_slot = [settings.word_dict[w] for w in words if w in settings.word_dict] + word_slot = [settings.word_dict[w] for w in words if + w in settings.word_dict] label_list.append([label]) word_slot_list.append(word_slot) else: diff --git a/paddle/gserver/tests/test_PyDataProvider2.cpp b/paddle/gserver/tests/test_PyDataProvider2.cpp index 824295eb6e9f2461497e4151d7b0f1d603d93a32..c5fe31b29187f4a5b429a928d1870a06848691fa 100644 --- a/paddle/gserver/tests/test_PyDataProvider2.cpp +++ b/paddle/gserver/tests/test_PyDataProvider2.cpp @@ -20,6 +20,18 @@ limitations under the License. 
*/ #include "paddle/gserver/dataproviders/DataProvider.h" P_DEFINE_string(train_list, "unittest.list", "file list for unittest"); + +namespace paddle { +namespace unittest { +namespace pydp2 { +extern void setOnPoolFilledHook(const std::function<void(size_t)>& func); +extern void clearOnPoolFilledHook(); + +} // namespace pydp2 +} // namespace unittest +} // namespace paddle + + const paddle::real epsilon = 1e-5; static inline int64_t readDataBatch( @@ -235,6 +247,112 @@ TEST(PyDataProvider2, index_sub_seq) { } } +TEST(PyDataProvider2, min_pool_size) { + paddle::DataConfig config; + config.set_type("py2"); + config.set_files(FLAGS_train_list.c_str()); + config.set_load_data_module("test_PyDataProvider2"); + config.set_load_data_object("test_min_pool_size"); + config.set_load_data_args(""); + size_t totalData = 1 << 14; + constexpr size_t batchSize = 100; + constexpr size_t minPoolSize = 1000; + paddle::DataBatch batch; + std::unique_ptr<paddle::DataProvider> provider( + paddle::DataProvider::create(config, false)); + provider->reset(); + + paddle::unittest::pydp2::setOnPoolFilledHook([&](size_t poolSize) { + if (totalData > batchSize) { + CHECK_GE(poolSize, std::min(totalData-batchSize, minPoolSize)); + } + }); + while (true) { + size_t realBatchSize = provider->getNextBatchInternal(batchSize, &batch); + if (realBatchSize) { + totalData -= realBatchSize; + } else { + break; + } + } + paddle::unittest::pydp2::clearOnPoolFilledHook(); +} + +TEST(PyDataProvider2, can_over_batch_size) { + paddle::DataConfig config; + config.set_type("py2"); + config.set_files(FLAGS_train_list.c_str()); + config.set_load_data_module("test_PyDataProvider2"); + config.set_load_data_object("test_can_over_batch_size"); + config.set_load_data_args(""); + paddle::DataBatch batch; + std::unique_ptr<paddle::DataProvider> provider( + paddle::DataProvider::create(config, false)); + provider->reset(); + constexpr size_t batchSize = 100; + while (true) { + size_t realBatchSize = provider->getNextBatchInternal(batchSize, &batch); + if (realBatchSize) { + CHECK_LE(realBatchSize, batchSize); + } else { + break; + } + } +} + +TEST(PyDataProvider2, input_order) { + paddle::DataConfig config; + config.set_type("py2"); + config.set_files(FLAGS_train_list.c_str()); + config.set_load_data_module("test_PyDataProvider2"); + config.set_load_data_object("test_input_order"); + config.set_load_data_args(""); + + paddle::ModelConfig modelConfig; + *modelConfig.add_input_layer_names() = "input1"; + *modelConfig.add_input_layer_names() = "input2"; + paddle::DataBatch batch; + std::unique_ptr<paddle::DataProvider> provider( + paddle::DataProvider::create(config, modelConfig, false)); + provider->reset(); + constexpr size_t batchSize = 100; + while (true) { + size_t realBatchSize = provider->getNextBatchInternal(batchSize, &batch); + if (!realBatchSize) { + break; + } + ASSERT_EQ(batch.getStreams().size(), 2); + for (size_t i = 0; i < realBatchSize; ++i) { + ASSERT_EQ(batch.getStream(0).ids->getData()[i], 0); + ASSERT_EQ(batch.getStream(1).ids->getData()[i], 1); + } + } +} + +TEST(PyDataProvider2, test_check) { + paddle::DataConfig config; + config.set_type("py2"); + config.set_files(FLAGS_train_list.c_str()); + config.set_load_data_module("test_PyDataProvider2"); + config.set_load_data_object("test_check"); + config.set_load_data_args(""); + paddle::DataBatch batch; + std::unique_ptr<paddle::DataProvider> provider( + paddle::DataProvider::create(config, false)); + provider->reset(); + while (true) { + size_t realBatchSize = provider->getNextBatchInternal(100, &batch); + if (!realBatchSize) { + break; + } else { + auto& ivec =
batch.getStream(0).ids; + for (size_t i=0; i < ivec->getSize(); ++i) { + CHECK_LT(ivec->getData()[i], 10); + } + } + } +} + int main(int argc, char** argv) { testing::InitGoogleTest(&argc, argv); paddle::initMain(argc, argv); diff --git a/paddle/gserver/tests/test_PyDataProvider2.py b/paddle/gserver/tests/test_PyDataProvider2.py index a88c48cb4e295d52e69e770a8906fa857c878c22..145fe85cff7d88e73233068f956489a0c2259abe 100644 --- a/paddle/gserver/tests/test_PyDataProvider2.py +++ b/paddle/gserver/tests/test_PyDataProvider2.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +import random + from paddle.trainer.PyDataProvider2 import * @@ -39,7 +41,8 @@ def test_init_hook(setting, filename): @provider( - input_types=[sparse_binary_vector(30000, seq_type=SequenceType.NO_SEQUENCE)]) + input_types=[ + sparse_binary_vector(30000, seq_type=SequenceType.NO_SEQUENCE)]) def test_sparse_non_value_no_seq(setting, filename): for i in xrange(200): yield [(i + 1) * (j + 1) for j in xrange(10)] @@ -66,3 +69,43 @@ def test_index_sub_seq(setting, filename): for i in xrange(200): yield list(gen_sub_seq(i)) + + +@provider(input_types=[index_slot(100)], min_pool_size=1000) +def test_min_pool_size(setting, filename): + for _ in xrange(1 << 14): + yield random.randint(0, 100 - 1) + + +@provider(input_types=[index_slot(100, seq_type=SequenceType.SEQUENCE)], + can_over_batch_size=False, + calc_batch_size=lambda x: len(x[0])) +def test_can_over_batch_size(setting, filename): + for _ in xrange(1 << 10): + seq_len = random.randint(0, 99) + yield [random.randint(0, 100 - 1) for _ in xrange(seq_len)] + + +@provider(input_types=[index_slot(10), index_slot(10)]) +def test_input_order(setting, filename): + for _ in xrange(1000): + yield { + 'input1': 0, + 'input2': 1 + } + + +@provider(input_types=[index_slot(10)], + check=True, + check_fail_continue=True, + should_shuffle="123") # also test should_shuffle parsing +def test_check(settings, filename): + yield_good_value = False + + while not yield_good_value: + for _ in xrange(10000): + i = random.randint(0, 100) + if i < 10: + yield_good_value = True + yield i + diff --git a/paddle/trainer/Trainer.cpp b/paddle/trainer/Trainer.cpp index 84d2ee1e73a54ab96b2dd5d9885df366656b915d..275150e12d12b57550ce45355cb3c533b57b4b86 100644 --- a/paddle/trainer/Trainer.cpp +++ b/paddle/trainer/Trainer.cpp @@ -194,7 +194,7 @@ void Trainer::init(const std::shared_ptr<TrainerConfigHelper> &config, dataProvider_ = dataProvider; if (!dataProvider_ && config_->hasDataConfig()) { - dataProvider_.reset(DataProvider::create(*config_, gpuData)); + dataProvider_.reset(DataProvider::create(*config_, *config_, gpuData)); } if (dataProvider_) { evaluator_.reset(trainerInternal_.getGradientMachine()->makeEvaluator()); @@ -212,7 +212,7 @@ void Trainer::init(const std::shared_ptr<TrainerConfigHelper> &config, testDataProvider_ = testDataProvider; if (!testDataProvider_ && config_->hasTestDataConfig()) { testDataProvider_.reset( - DataProvider::create(config_->getTestDataConfig(), gpuData)); + DataProvider::create(config_->getTestDataConfig(), *config_, gpuData)); } if (testDataProvider_) { tester_.reset(new Tester(config_, createTesterConfig(), diff --git a/paddle/trainer/tests/.gitignore b/paddle/trainer/tests/.gitignore index 79f701203671cda6a295db4594e10a7df4332d29..aedb0ef22e02344af27d18dc3f500fab23f6686f 100644 --- a/paddle/trainer/tests/.gitignore +++ b/paddle/trainer/tests/.gitignore @@ -1,2 +1,3 @@ dump_text.test test_pydata_provider_wrapper.json +*proto.bin diff --git
a/paddle/trainer/tests/sample_trainer_rnn_gen.conf b/paddle/trainer/tests/sample_trainer_rnn_gen.conf index 5b65310e7649cba90682fcb60f808b01653876ba..abb6e9b179326ba6beb1509b6af9bf0a4e2d6338 100644 --- a/paddle/trainer/tests/sample_trainer_rnn_gen.conf +++ b/paddle/trainer/tests/sample_trainer_rnn_gen.conf @@ -13,96 +13,53 @@ # See the License for the specific language governing permissions and # limitations under the License. -#Todo(luotao02) This config is only used for unitest. It is out of date now, and will be updated later. -import math +from paddle.trainer_config_helpers import * -beam_search = get_config_arg('beam_search', bool, False) - -model_type("recurrent_nn") - -Settings(learning_rate=0, batch_size=15, algorithm='sgd') - -Inputs("sent_id", "dummy_data_input") -Outputs("predict_word") +settings(batch_size=15, learning_rate=0) num_words = 5 +beam_flag = get_config_arg('beam_search', bool, False) -DataLayer(name="sent_id", size=1, ) +sent_id = data_layer(name="sent_id", size=1) # This layer has no actual use, but only to decide batch_size in generation. # When generating, at least one Memory in RecurrentLayer MUST have a boot layer. -DataLayer(name="dummy_data_input", size=2, ) - -if beam_search: - RecurrentLayerGroupBegin("decoding_layer_group", - in_links=[], - out_links=["predict_word"], - generator=Generator(max_num_frames=10, - beam_size=2, - num_results_per_sample=2, )) -else: - RecurrentLayerGroupBegin("decoding_layer_group", - in_links=[], - out_links=["predict_word"], - generator=Generator(max_num_frames=10, )) -dummy_memory = Memory(name="dummy_memory", - size=2, - boot_layer="dummy_data_input") -MixedLayer(name="dummy_memory", - size=2, - bias=False, - inputs=[IdentityProjection(dummy_memory)], ) -state_memory = Memory(name="state", - size=num_words, - #boot_bias=True, - #boot_bias_active_type = "tanh", - ) - -predict_word_memory = Memory(name="predict_word", - size=num_words, - boot_with_const_id=0, ) - -MixedLayer( - name = "word_embedding", - size = num_words, # word embedding dim is the same as num_words in this test. 
- bias = False, - inputs = TableProjection(predict_word_memory, - initial_std=1, - learning_rate=0, - parameter_name="wordvec")) - -Layer( # simplified RNN for testing - name="state", - type="mixed", - size=num_words, - bias=False, - inputs=[FullMatrixProjection("word_embedding", - parameter_name="transtable")]) - -Layer(name="output", - type="mixed", - size=num_words, - active_type="exponential", - bias=False, - inputs=TransposedFullMatrixProjection("state", - initial_std=1, - learning_rate=0, - parameter_name="wordvec"), ) - -Layer(name="predict_word", type="maxid", inputs=["output"], ) - -Layer(name="eos_check", - type="eos_id", - eos_id=num_words - 1, - inputs=["predict_word"], ) -RecurrentLayerGroupEnd("decoding_layer_group") - -Evaluator(name="answer_printer", - type="seq_text_printer", - dict_file="./trainer/tests/test_gen_dict.txt", - result_file="./trainer/tests/dump_text.test", - inputs=[ - "sent_id", - "predict_word", - ], ) +dummy_data = data_layer(name="dummy_data_input", size=2) + +gen_inputs = [StaticInput(input=dummy_data, size=2), + GeneratedInput(size=num_words, + embedding_name="wordvec", + embedding_size=num_words)] + +def step(dummy_memory, predict_word): + + # simplified RNN for testing + with mixed_layer(size=num_words) as layer: + layer += full_matrix_projection(input=predict_word, + param_attr=ParamAttr(name="transtable")) + + with mixed_layer(size=num_words, act=ExpActivation()) as out: + out += trans_full_matrix_projection(input=layer, + param_attr=ParamAttr(name="wordvec")) + + return out + +beam_gen = beam_search(name="rnn_gen", + step=step, + input=gen_inputs, + id_input=sent_id, + dict_file="./trainer/tests/test_gen_dict.txt", + result_file="./trainer/tests/dump_text.test", + bos_id=0, + eos_id=num_words-1, + beam_size=2 if beam_flag else 1, + num_results_per_sample=2 if beam_flag else 1, + max_length=10) + +#outputs(beam_gen) +# In this config, as dummy_data_input doesn't work on beam_gen (we can see that dummy_memory +# is read-only memory and isn't used by other layers of the step), we show the Inputs and Outputs +# as follows. Note that "__beam_search_predict__" is the default output name of beam_search. +Inputs("sent_id","dummy_data_input") +Outputs("__beam_search_predict__") diff --git a/paddle/utils/PythonUtil.h b/paddle/utils/PythonUtil.h index 397229d803df9d750a440498ad5a90b779597ee9..2808338fbdf596c99a122d68c1ead2fe6de6a3c5 100644 --- a/paddle/utils/PythonUtil.h +++ b/paddle/utils/PythonUtil.h @@ -183,10 +183,21 @@ public: /** * Get bool attribute. * @param field + * @param [out] isBoolType returns true if the attribute is of bool type. If the + * attribute is not of bool type, an implicit + * conversion happens, and the conversion + * result is returned. + * + * For example, if the attribute is 1, the return + * value of the function will be true, but isBoolType + * will return false. * @return */ - bool getBoolAttr(const std::string& field) const { + bool getBoolAttr(const std::string& field, bool* isBoolType = nullptr) const { PyObjectPtr tmp(getAttr(field)); + if (isBoolType) { + *isBoolType = PyBool_Check(tmp.get()); + } return PyObject_IsTrue(tmp.get()); }
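// (Added note: PyDataProvider2::readPyFields above uses this overload -- it
// passes &ok as isBoolType and falls back to a default skipShuffle_ when the
// Python attribute is absent or not a real bool.)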
@@ -266,6 +277,15 @@ public: this->set(key, PyBool_FromLong(b)); } + void setStringList(const std::string& key, + const std::vector<std::string>& items) { + auto * list = PyList_New(items.size()); + for (size_t i=0; i < items.size(); ++i) { + PyList_SetItem(list, i, PyString_FromString(items[i].c_str())); + } + this->set(key, list); + } + private: inline void checkDict() { CHECK(PyDict_Check(this->dict_)); diff --git a/proto/ModelConfig.proto.m4 b/proto/ModelConfig.proto.m4 index a2b243a7869eaff120b25ece35e95be4d4284d18..b32f8b1ee90723e7bfdd4cbd5d93a35ac22b6b6d 100644 --- a/proto/ModelConfig.proto.m4 +++ b/proto/ModelConfig.proto.m4 @@ -299,7 +299,7 @@ sinclude(`ModelConfigLayer.proto.m4') optional bool norm_by_times = 25; // for CostLayers - optional real coeff = 26; + optional real coeff = 26 [default = 1.0]; // for AverageLayer // can be set to: 'average', 'sum' or 'squarerootn' diff --git a/proto/ParameterConfig.proto.m4 b/proto/ParameterConfig.proto.m4 index 222e070089116e68b0b29034280f12767ce21cd6..e8d512445e5025f5663fbe3e20b4425cf1633a2b 100644 --- a/proto/ParameterConfig.proto.m4 +++ b/proto/ParameterConfig.proto.m4 @@ -31,8 +31,8 @@ message ParameterUpdaterHookConfig { message ParameterConfig { required string name = 1; required uint64 size = 2; - required real learning_rate = 3; - required real momentum = 4; + optional real learning_rate = 3 [default = 1.0]; + optional real momentum = 4 [default = 0.0]; optional real initial_mean = 5 [default = 0.0]; optional real initial_std = 6 [default = 0.01]; // use L2-regularization if decay_rate set and decay_rate_l1 not set @@ -54,8 +54,8 @@ message ParameterConfig { optional int32 num_batches_regularization = 13 [default = 1]; // if is_sparse is true, para is sparse, else para is dense optional bool is_sparse = 14[default = false]; - // if para is sparse, format should be "csc" or "csr" - optional string format = 15[default = "csr"]; + // if para is sparse, format should be "csc" or "csr"; an empty string means it is not sparse + optional string format = 15 [default = ""]; // sparse remote update or not optional bool sparse_remote_update = 16 [default = false]; // gradient clipping threshold, no clipping by default
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index fd9a003bb018c87fb8e8e2992390f27edfd72f4b..dce0b909524369926eda54763e571706b79daeaf 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -1,6 +1,14 @@ set(OUTPUT_DIR "${CMAKE_CURRENT_BINARY_DIR}/build") +file(GLOB TRAINER_PY_FILES . ./paddle/trainer/*.py) +file(GLOB HELPERS_PY_FILES . ./paddle/trainer_config_helpers/*.py) +file(GLOB UTILS_PY_FILES . ./paddle/utils/*.py) + +set(PY_FILES paddle/__init__.py + ${TRAINER_PY_FILES} + ${HELPERS_PY_FILES} + ${UTILS_PY_FILES}) set(PADDLE_INTERNAL_PACKAGE "") if (PADDLE_WITH_INTERNAL) @@ -13,7 +21,7 @@ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in add_custom_command(OUTPUT ${OUTPUT_DIR}/.timestamp COMMAND ${PYTHON_EXECUTABLE} setup.py bdist_wheel COMMAND ${CMAKE_COMMAND} -E touch ${OUTPUT_DIR}/.timestamp - DEPENDS gen_proto_py) + DEPENDS gen_proto_py ${PY_FILES}) add_custom_target(paddle_python ALL DEPENDS ${OUTPUT_DIR}/.timestamp) diff --git a/python/paddle/trainer/PyDataProvider2.py b/python/paddle/trainer/PyDataProvider2.py index c4f61473933d04126541d0f95451c06601ba4c50..34f5dd41b7e683bbfa71e8a3e23ff3f542b39591 100644 --- a/python/paddle/trainer/PyDataProvider2.py +++ b/python/paddle/trainer/PyDataProvider2.py @@ -14,6 +14,13 @@ import cPickle import logging +import collections +import functools +import itertools + +logging.basicConfig( + format="[%(levelname)s %(asctime)s %(filename)s:%(lineno)s]" + " %(message)s") class SequenceType(object): @@ -68,30 +75,39 @@ sparse_binary_vector = sparse_non_value_slot sparse_vector = sparse_value_slot integer_value = index_slot + def dense_vector_sequence(dim): return dense_vector(dim, seq_type=SequenceType.SEQUENCE) + def dense_vector_sub_sequence(dim): return dense_vector(dim, seq_type=SequenceType.SUB_SEQUENCE) + def sparse_binary_vector_sequence(dim): return sparse_binary_vector(dim, seq_type=SequenceType.SEQUENCE) + def sparse_binary_vector_sub_sequence(dim): return sparse_binary_vector(dim, seq_type=SequenceType.SUB_SEQUENCE) + def sparse_vector_sequence(dim): return sparse_vector(dim, seq_type=SequenceType.SEQUENCE) + def sparse_vector_sub_sequence(dim): return sparse_vector(dim, seq_type=SequenceType.SUB_SEQUENCE) + def integer_value_sequence(dim): return integer_value(dim, seq_type=SequenceType.SEQUENCE) + def integer_value_sub_sequence(dim): return integer_value(dim, seq_type=SequenceType.SUB_SEQUENCE) + def integer_sequence(dim): return index_slot(dim, seq_type=SequenceType.SEQUENCE) @@ -102,13 +118,97 @@ class SingleSlotWrapper(object): def __call__(self, obj, filename): for item in self.generator(obj, filename): - yield [item] + if isinstance(item, dict): + yield item + else: + yield [item] -def provider(input_types=None, should_shuffle=True, pool_size=-1, +class InputOrderWrapper(object): + def __init__(self, generator, input_order): + self.generator = generator + self.input_order = input_order + + def __call__(self, obj, filename): + for item in self.generator(obj, filename): + if isinstance(item, dict): + yield [item.get(input_name, None) for input_name in + self.input_order] + else: + yield item + + +class CheckWrapper(object): + def __init__(self, generator, input_types, check_fail_continue, logger): + self.generator = generator + self.input_types = input_types + self.check_fail_continue = check_fail_continue + self.logger = logger + + def __call__(self, obj, filename): + for items in self.generator(obj, filename): + try: + assert len(items) == len(self.input_types) + assert len(filter(lambda x: x is None, items)) == 0 + for item, input_type in itertools.izip(items, self.input_types): + callback = functools.partial(CheckWrapper.loop_callback, + input_type) + + for _ in xrange(input_type.seq_type): + callback = functools.partial(CheckWrapper.loop_check, + callback) + callback(item) + + yield items + except AssertionError as e: + self.logger.warning( + "Item (%s) does not fit the input type, with error %s" + % (repr(item), repr(e))) + + if self.check_fail_continue: + continue + else: + raise
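# (Added note: loop_callback validates one value against its InputType, while
# loop_check recurses once per seq_type level, so sequences and sub-sequences
# are unwrapped before the per-value checks below run.)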
+ @staticmethod + def loop_callback(input_type, each): + assert isinstance(input_type, InputType) + if input_type.type == DataType.Dense: + assert isinstance(each, collections.Sequence) + for d in each: + assert isinstance(d, float) + assert len(each) == input_type.dim + elif input_type.type == DataType.Index: + assert isinstance(each, int) + assert each < input_type.dim + elif input_type.type == DataType.SparseNonValue \ + or input_type.type == DataType.SparseValue: + assert isinstance(each, collections.Sequence) + sparse_id = set() + for k in each: + if input_type.type == DataType.SparseValue: + k, v = k + assert isinstance(v, float) + assert isinstance(k, int) + assert k < input_type.dim + sparse_id.add(k) + assert len(sparse_id) == len(each) + else: + raise RuntimeError("Unsupported input type") + + @staticmethod + def loop_check(callback, item): + for each in item: + callback(each) + + +def provider(input_types=None, should_shuffle=None, pool_size=-1, + min_pool_size=-1, can_over_batch_size=True, calc_batch_size=None, cache=CacheType.NO_CACHE, + check=False, check_fail_continue=False, + use_dynamic_order=True, init_hook=None, **kwargs): """ Provider decorator. Use it to make a function into a PyDataProvider2 object. @@ -130,30 +230,63 @@ def provider(input_types=None, should_shuffle=True, pool_size=-1, :param input_types: Specify the input types, can also be set in init_hook. It is a list of InputType objects. For example, input_types= \ [dense_vector(9), integer_value(2)]. - :param should_shuffle: True if data should shuffle. + :type input_types: list|tuple + + :param should_shuffle: True if data should shuffle. Pass None to shuffle + when training and not to shuffle when testing. :type should_shuffle: bool + :param pool_size: Max number of samples in the data pool. :type pool_size: int + + :param min_pool_size: Set the minimal number of samples in the data pool. PaddlePaddle + randomly picks samples from the pool, so min_pool_size + affects how well the data are randomized. + :type min_pool_size: int + :param can_over_batch_size: True if paddle can return a mini-batch larger than the batch size in settings. It is useful when custom calculating one sample's batch_size. It is very dangerous to set it to false and use calc_batch_size together. Default is false. + :type can_over_batch_size: bool + :param calc_batch_size: a method to calculate each sample's batch size. By default each sample's batch size is 1. But you can customize each sample's batch size. + :type calc_batch_size: callable + :param cache: Cache strategy of Data Provider. Default is CacheType.NO_CACHE + :type cache: int :param init_hook: Initialize hook. Useful when the data provider needs to load some external data like a dictionary. The parameter is (settings, file_list, \*\*kwargs). - - settings\: Is the global settings. User can set - settings.input_types here. - - file_list\: All file names for passed to data provider. - - kwargs: Other keyword arguments passed from + - settings. It is the global settings object. User can set + settings.input_types here. + - file_list. All file names passed to the data provider. + - is_train. Is this data provider used for training or not. + - kwargs. Other keyword arguments passed from trainer_config's args parameter. + :type init_hook: callable + + :param check: Check that the yielded data format is the same as input_types. Enabling + this will make the data providing process slow, but it is very useful + for debugging. Default is disabled. + :type check: bool + + :param check_fail_continue: Whether to continue training when a check fails. If + True, data in the wrong format is simply dropped. Has + no effect when check is set to False. + :type check_fail_continue: bool + + :param use_dynamic_order: Allow the provider to yield a dictionary object, whose + key is an input data layer name, and whose value is the + feature value. Tuples are still allowed when + use_dynamic_order is True. + :type use_dynamic_order: bool """
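# (Added usage sketch, hypothetical file format and sizes -- it combines the
# dict-yield style from mnist_provider.dict.py above with the new checking
# options:
#
#     @provider(input_types=[dense_vector(784), integer_value(10)],
#               min_pool_size=1000, check=True, check_fail_continue=True)
#     def process(settings, filename):
#         with open(filename) as f:
#             for line in f:
#                 label, pixels = line.split(';')
#                 yield {'pixel': [float(x) for x in pixels.split()],
#                        'label': int(label)}
# )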
def __wrapper__(generator): @@ -168,12 +301,38 @@ def provider(input_types=None, should_shuffle=True, pool_size=-1, self.slots = kwargs['slots'] self.slots = input_types self.should_shuffle = should_shuffle + + true_table = [1, 't', 'true', 'on'] + false_table = [0, 'f', 'false', 'off'] + if not isinstance(self.should_shuffle, bool) and \ + self.should_shuffle is not None: + + if isinstance(self.should_shuffle, basestring): + self.should_shuffle = self.should_shuffle.lower() + + if self.should_shuffle in true_table: + self.should_shuffle = True + elif self.should_shuffle in false_table: + self.should_shuffle = False + else: + self.logger.warning( + "Could not recognize should_shuffle (%s); " + "using the default value of should_shuffle instead." + " Please set should_shuffle to a bool value or " + "something in %s" % ( + repr(self.should_shuffle), + repr(true_table + false_table))) + self.should_shuffle = None + self.pool_size = pool_size self.can_over_batch_size = can_over_batch_size self.calc_batch_size = calc_batch_size self.file_list = file_list self.generator = generator self.cache = cache + self.min_pool_size = min_pool_size + self.input_order = kwargs['input_order'] + self.check = check if init_hook is not None: init_hook(self, file_list=file_list, **kwargs) if self.input_types is not None: @@ -184,6 +343,15 @@ if len(self.slots) == 1: self.generator = SingleSlotWrapper(self.generator) + if use_dynamic_order: + self.generator = InputOrderWrapper(self.generator, + self.input_order) + if self.check: + self.generator = CheckWrapper(self.generator, + self.slots, + check_fail_continue, + self.logger) + return DataProvider return __wrapper__ @@ -196,3 +364,4 @@ def deserialize_args(args): :return: """ return cPickle.loads(args) + diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py index f2f67f9bd66a4ebab9b5ace7fb13a194959d6c10..4ce01e005ae3ca549bb39c149e4ebf3cb04f8c1c 100644 --- a/python/paddle/trainer/config_parser.py +++ b/python/paddle/trainer/config_parser.py @@ -114,15 +114,15 @@ g_layer_type_map = {} # Initialize global variables.
We use this function so that we can # call parse_config() multiple times def init_config_environment( - g_default_momentum = 0., - g_default_decay_rate = 0., + g_default_momentum = None, + g_default_decay_rate = None, g_default_initial_mean = 0., g_default_initial_std = 0.01, - g_default_num_batches_regularization = 1, + g_default_num_batches_regularization = None, g_default_initial_strategy = 0, g_default_initial_smart = False, - g_default_gradient_clipping_threshold = 0., - g_default_device = -1, + g_default_gradient_clipping_threshold = None, + g_default_device = None, g_default_update_hooks = None, g_default_compact_func = None, @@ -1099,12 +1099,12 @@ def Evaluator( inputs, chunk_scheme = None, num_chunk_types = None, - classification_threshold = 0.5, - positive_label = -1, - dict_file = "", - result_file = "", - num_results = 1, - delimited = True, + classification_threshold = None, + positive_label = None, + dict_file = None, + result_file = None, + num_results = None, + delimited = None, ): evaluator = g_config.model_config.evaluators.add() evaluator.type = type @@ -1120,12 +1120,19 @@ def Evaluator( evaluator.num_chunk_types = num_chunk_types g_current_submodel.evaluator_names.append(evaluator.name) - evaluator.classification_threshold = classification_threshold - evaluator.positive_label = positive_label - evaluator.dict_file = dict_file - evaluator.result_file = result_file - evaluator.num_results = num_results - evaluator.delimited = delimited + if classification_threshold is not None: + evaluator.classification_threshold = classification_threshold + if positive_label is not None: + evaluator.positive_label = positive_label + if dict_file is not None: + evaluator.dict_file = dict_file + + if result_file is not None: + evaluator.result_file = result_file + if num_results is not None: + evaluator.num_results = num_results + if delimited is not None: + evaluator.delimited = delimited class LayerBase(object): def __init__( @@ -1137,7 +1144,7 @@ class LayerBase(object): device=None, active_type="", drop_rate=0., - coeff=1.): + coeff=None): config_assert('@' not in name, "layer name: %s contain special character @" % name) global g_current_submodel @@ -1155,10 +1162,12 @@ class LayerBase(object): self.inputs = [self.inputs] self.config = g_config.model_config.layers.add() + assert isinstance(self.config, LayerConfig) self.config.name = name self.config.type = type self.config.active_type = active_type - self.config.coeff = coeff + if coeff is not None: + self.config.coeff = float(coeff) if size != 0: self.config.size = size if drop_rate != 0: @@ -1166,7 +1175,7 @@ class LayerBase(object): if device is not None: self.config.device = device - else: + elif g_default_device is not None: self.config.device = g_default_device for input_index in xrange(len(self.inputs)): @@ -1236,10 +1245,12 @@ class LayerBase(object): if bias.parameter_name is None: bias.parameter_name = gen_bias_parameter_name(self.config.name) if bias.parameter_name not in g_parameter_map: + assert isinstance(self.config, LayerConfig) + Parameter( bias.parameter_name, size, - self.config.device, + self.config.device if self.config.HasField('device') else None, dims, bias.learning_rate, bias.momentum, @@ -1265,7 +1276,7 @@ class LayerBase(object): input_index, size, dims=None, - sparse = False, + sparse = None, format = "csr"): if dims is None: # TODO(yuyang18): print warning and callstack here! 
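# (Added note: the config_parser hunks around here all apply one pattern --
# leave a protobuf field unset unless a value was given explicitly, e.g.
# "if coeff is not None: self.config.coeff = float(coeff)" -- so the
# [default = ...] values declared in the .proto files take effect.)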
@@ -1293,7 +1304,7 @@ class LayerBase(object): Parameter( input_config.parameter_name, size, - self.config.device, + self.config.device if self.config.HasField("device") else None, dims, input_config.learning_rate, input_config.momentum, @@ -1353,6 +1364,8 @@ class FCLayer(LayerBase): if sparse: psize = self.inputs[input_index].nnz + else: + sparse = None self.create_input_parameter(input_index, psize, dims, sparse, format) self.create_bias_parameter(bias, self.config.size) @@ -2430,7 +2443,6 @@ class MixedLayer(LayerBase): config_assert(inputs, 'inputs cannot be empty') super(MixedLayer, self).__init__( name, 'mixed', size, inputs=inputs, **xargs) - operator_input_index = [] for operator in self.operators: operator_conf = operator.operator_conf @@ -2445,21 +2457,31 @@ class MixedLayer(LayerBase): input_layer = self.get_input_layer(input_index) operator_conf.input_sizes.append(input_layer.size) operator_input_index.append(input_index) - if self.config.size == 0: + if self.config.size == 0: size = operator.calc_output_size(operator_conf.input_sizes) if size != 0: self.set_layer_size(size) - + else: + size = operator.calc_output_size(operator_conf.input_sizes) + if size != 0: + config_assert(size == self.config.size, + "different inputs have different size: %s vs. %s" % + (size, self.config.size)) for input_index in xrange(len(self.inputs)): input_layer = self.get_input_layer(input_index) input = self.inputs[input_index] if input_index not in operator_input_index: config_assert(isinstance(input, Projection), "input should be projection or operation") - if self.config.size == 0 and isinstance(input, Projection): + if self.config.size == 0 and isinstance(input, Projection): size = input.calc_output_size(input_layer) if size != 0: self.set_layer_size(size) - + elif isinstance(input, Projection): + sz = input.calc_output_size(input_layer) + if sz != 0: + config_assert(sz == self.config.size, + "different inputs have different size: %s vs. %s" % + (sz, self.config.size)) config_assert(size != 0, "size is not set") for input_index in xrange(len(self.inputs)): @@ -2827,27 +2849,44 @@ def Parameter( para = g_config.model_config.parameters.add() para.name = name para.size = size - para.device = device - para.dims.extend(dims); - para.learning_rate = default(learning_rate, 1.) 
- para.momentum = default(momentum, g_default_momentum) + if device is not None: + para.device = int(device) + para.dims.extend(dims) + + if learning_rate is not None: + para.learning_rate = float(learning_rate) + + momentum = default(momentum, g_default_momentum) + if momentum is not None: + para.momentum = float(momentum) + config_assert(not momentum or not decay_rate_l1, "momentum and decay_rate_l1 cannot both be non-zero") - para.decay_rate = default(decay_rate, g_default_decay_rate) + + decay_rate = default(decay_rate, g_default_decay_rate) + if decay_rate is not None: + para.decay_rate = decay_rate + if decay_rate_l1 is not None: para.decay_rate_l1 = decay_rate_l1 para.initial_std = default(initial_std, g_default_initial_std) para.initial_mean = default(initial_mean, g_default_initial_mean) - para.num_batches_regularization = default( + + num_batches_regularization = default( num_batches_regularization, g_default_num_batches_regularization) + if num_batches_regularization is not None: + para.num_batches_regularization = int(num_batches_regularization) + if sparse_remote_update is not None: para.sparse_remote_update = sparse_remote_update if sparse_remote_update: g_config.opt_config.use_sparse_remote_updater = True if sparse_update is not None: para.sparse_update = sparse_update - para.gradient_clipping_threshold = default( - gradient_clipping_threshold, g_default_gradient_clipping_threshold); + gradient_clipping_threshold = default( + gradient_clipping_threshold, g_default_gradient_clipping_threshold) + if gradient_clipping_threshold is not None: + para.gradient_clipping_threshold = gradient_clipping_threshold para.initial_strategy = default(initial_strategy, g_default_initial_strategy) para.initial_smart = default(initial_smart, g_default_initial_smart) if para.initial_smart: @@ -2860,15 +2899,19 @@ def Parameter( para.initial_std = 1. / math.sqrt(para.size) if g_default_compact_func is not None: sparse, format, need_compact = g_default_compact_func(para.name) - para.is_sparse = default(sparse, False) - para.format = default(format, "") - para.need_compact = default(need_compact, False) + + if sparse is not None: + para.is_sparse = sparse + if format is not None: + para.format = format + if need_compact is not None: + para.need_compact = need_compact if is_static is not None: para.is_static = is_static config_assert(not para.sparse_remote_update or not para.is_static, "sparse_remote_update and is_static cannot both be true") - - para.is_shared = default(is_shared, False) + if is_shared is not None: + para.is_shared = is_shared update_hooks = default(update_hooks, g_default_update_hooks) diff --git a/python/paddle/trainer_config_helpers/activations.py b/python/paddle/trainer_config_helpers/activations.py index 24defb06a6d66692e266c3102a5b3334d7493d38..85534675199e7627f9753e5d233f5208b14decfd 100644 --- a/python/paddle/trainer_config_helpers/activations.py +++ b/python/paddle/trainer_config_helpers/activations.py @@ -14,7 +14,7 @@ __all__ = ["TanhActivation", "SigmoidActivation", "SoftmaxActivation", "IdentityActivation", "LinearActivation", - 'SequenceSoftmaxActivation', + 'SequenceSoftmaxActivation', 'ExpActivation', "ReluActivation", "BReluActivation", "SoftReluActivation", "STanhActivation", "AbsActivation", "SquareActivation", "BaseActivation"] @@ -185,3 +185,12 @@ class SquareActivation(BaseActivation): """ def __init__(self): BaseActivation.__init__(self, 'square', False) + +class ExpActivation(BaseActivation): + """ + Exponential Activation. + + .. math:: + f(z) = e^z. 
+ """ + def __init__(self): BaseActivation.__init__(self, 'exponential', False) diff --git a/python/paddle/trainer_config_helpers/evaluators.py b/python/paddle/trainer_config_helpers/evaluators.py index 985fae9f955c950d861d4f1f2f98845562fb6fc9..7a00d0b7ec57af4eca90e76c3e5f955499205698 100644 --- a/python/paddle/trainer_config_helpers/evaluators.py +++ b/python/paddle/trainer_config_helpers/evaluators.py @@ -65,12 +65,12 @@ def evaluator_base( name=None, chunk_scheme=None, num_chunk_types=None, - classification_threshold=0.5, - positive_label=-1, - dict_file="", - result_file="", - num_results=1, - delimited=True): + classification_threshold=None, + positive_label=None, + dict_file=None, + result_file=None, + num_results=None, + delimited=None): """ Evaluator will evaluate the network status while training/testing. @@ -105,9 +105,10 @@ def evaluator_base( :type weight: LayerOutput. """ # inputs type assertions. - assert isinstance(classification_threshold, float) - assert isinstance(positive_label, int) - assert isinstance(num_results, int) + assert classification_threshold is None or isinstance( + classification_threshold, float) + assert positive_label is None or isinstance(positive_label, int) + assert num_results is None or isinstance(num_results, int) if not isinstance(input, list): input = [input] @@ -136,7 +137,7 @@ def classification_error_evaluator( label, name=None, weight=None, - threshold=0.5): + threshold=None): """ Classification Error Evaluator. It will print error rate for classification. @@ -253,7 +254,7 @@ def pnpair_evaluator( def precision_recall_evaluator( input, label, - positive_label=-1, + positive_label=None, weight=None, name=None, ): @@ -494,7 +495,7 @@ def gradient_printer_evaluator( @wrap_name_default() def maxid_printer_evaluator( input, - num_results=1, + num_results=None, name=None, ): """ @@ -518,13 +519,14 @@ def maxid_printer_evaluator( """ evaluator_base(name=name, type="max_id_printer", - input=input) + input=input, + num_results=num_results) @evaluator(EvaluatorAttribute.FOR_PRINT) @wrap_name_default() def maxframe_printer_evaluator( input, - num_results=1, + num_results=None, name=None, ): """ @@ -556,9 +558,9 @@ def maxframe_printer_evaluator( @wrap_name_default() def seqtext_printer_evaluator( input, - dict_file="", - result_file="", - delimited=True, + result_file, + dict_file=None, + delimited=None, name=None, ): """ @@ -616,6 +618,7 @@ def seqtext_printer_evaluator( :param name: Evaluator name. :type name: None|basestring """ + assert isinstance(result_file, basestring) evaluator_base(name=name, type="seq_text_printer", input=input, diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index bda0b4f5d60e82c1d577b0063fd5e164bf6117c3..fab7e6e091863fdad6d81f4c63f12132c2be5161 100644 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -28,7 +28,7 @@ except ImportError: import copy __all__ = ["full_matrix_projection", "AggregateLevel", "ExpandLevel", - "identity_projection", "dotmul_projection", + "identity_projection", "dotmul_projection", "dotmul_operator", "table_projection", "mixed_layer", "data_layer", "embedding_layer", "fc_layer", "grumemory", "pooling_layer", "lstmemory", "last_seq", "first_seq", @@ -389,7 +389,7 @@ def identity_projection(input, offset=None): @wrap_param_attr_default() def dotmul_projection(input, param_attr=None, scale=1): """ - 1. DotMulProjection if input is a layer. + DotMulProjection with a layer as input. 
     It performs element-wise multiplication with weight.
 
     .. math::
@@ -403,48 +403,55 @@ def dotmul_projection(input, param_attr=None, scale=1):
 
        proj = dotmul_projection(input=layer)
 
-    2. DotMulOperator if input is a list or tuple.
-       It takes two inputs, performs element-wise multiplication:
-
-    .. math::
-       out.row[i] += scale * (in1.row[i] .* in2.row[i])
-
-    where :math:`.*` means element-wise multiplication, and
-    scale is a config scalar, its default value is one.
-
-    The example usage is:
-
-    .. code-block:: python
-
-       op = dotmul_projection(input=[layer1, layer2],
-                              scale=2.0)
-
     :param input: Input layer.
-    :type input: LayerOutput|list|tuple
+    :type input: LayerOutput
     :param param_attr: Parameter config, None if use default.
     :type param_attr: ParameterAttribute
     :param scale: config scalar, default value is one.
     :type scale: float
-    :return: A DotMulProjection or DotMulOperator Object.
-    :rtype: DotMulProjection or DotMulOperator
+    :return: A DotMulProjection Object.
+    :rtype: DotMulProjection
     """
-    if isinstance(input, LayerOutput):
-        proj = DotMulProjection(input_layer_name=input.name,
+    proj = DotMulProjection(input_layer_name=input.name,
                             size=input.size,
                             **param_attr.attr)
-        proj.origin = input
-        proj.origin.projection = "dot_mul"
-        return proj
-    else:
-        assert isinstance(input, list) or isinstance(input, tuple)
-        assert len(input) == 2
-        assert param_attr is None
-        op = DotMulOperator(input_layer_name=[x.name for x in input],
-                            scale=scale)
-        op.origin = input
-        op.origin.operator = "dot_mul"
-        return op
+    proj.origin = input
+    return proj
+
+
+def dotmul_operator(x, y, scale=1):
+    """
+    DotMulOperator takes two inputs and performs element-wise multiplication:
+
+    .. math::
+       out.row[i] += scale * (x.row[i] .* y.row[i])
+
+    where :math:`.*` means element-wise multiplication, and
+    scale is a config scalar whose default value is one.
+
+    The example usage is:
+
+    .. code-block:: python
+
+       op = dotmul_operator(x, y,
+                            scale=1)
+
+    :param x: The first input layer.
+    :type x: LayerOutput
+    :param y: The second input layer.
+    :type y: LayerOutput
+    :param scale: config scalar, default value is one.
+    :type scale: float
+    :return: A DotMulOperator Object.
+    :rtype: DotMulOperator
+    """
+    assert isinstance(x, LayerOutput)
+    assert isinstance(y, LayerOutput)
+    op = DotMulOperator(input_layer_names=[x.name, y.name],
+                        scale=scale)
+    op.origin = [x, y]
+    return op
 
 @wrap_bias_attr_default(['padding_attr'])
 def context_projection(input, context_len, context_start=None,
@@ -539,7 +546,10 @@ class MixedLayerType(LayerOutput):
         if not self.finalized:
             assert isinstance(other, Projection) or isinstance(other, Operator)
             self.inputs.append(other)
-            self.parents.append(other.origin)
+            if isinstance(other, Projection):
+                self.parents.append(other.origin)
+            else:
+                self.parents.extend(other.origin)
             return self
         else:
             raise MixedLayerType.AddToSealedMixedLayerException()
@@ -565,7 +575,7 @@ class MixedLayerType(LayerOutput):
 @wrap_act_default(act=LinearActivation())
 @wrap_bias_attr_default(has_bias=False)
 @layer_support(ERROR_CLIPPING, DROPOUT)
-def mixed_layer(size, input=None, name=None, act=None, bias_attr=False,
+def mixed_layer(size=0, input=None, name=None, act=None, bias_attr=False,
                 layer_attr=None):
     """
     Mixed Layer. A mixed layer will add all inputs together, then activate.
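Since `dotmul_operator` is now a standalone two-input function and `mixed_layer` accepts `size=0`, a configuration can mix operators and projections and let the layer infer its output size. A hedged sketch of how the new API composes (layer names and sizes are illustrative), mirroring the updated test config further below:

```python
# Sketch only: assumes the usual trainer_config_helpers environment.
from paddle.trainer_config_helpers import *

a = data_layer(name='a', size=128)
b = data_layer(name='b', size=128)

# Element-wise a .* b via the new two-input operator, plus a learned
# element-wise scaling of b via dotmul_projection; mixed_layer sums both,
# and with the new size=0 default it infers its size from these inputs.
m = mixed_layer(input=[dotmul_operator(x=a, y=b),
                       dotmul_projection(input=b)])
```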
diff --git a/python/paddle/trainer_config_helpers/optimizers.py b/python/paddle/trainer_config_helpers/optimizers.py index ed676ac2152a6c08b90c18480ef0c69d5c0779f8..af85f745f63e59769e900e0fd64a10928affd654 100644 --- a/python/paddle/trainer_config_helpers/optimizers.py +++ b/python/paddle/trainer_config_helpers/optimizers.py @@ -79,7 +79,7 @@ class MomentumOptimizer(BaseSGDOptimizer): 'learning_method': 'momentum' } - def __init__(self, momentum=1e-3): + def __init__(self, momentum=None): self.momentum = momentum diff --git a/python/paddle/trainer_config_helpers/tests/layers_test_config.py b/python/paddle/trainer_config_helpers/tests/layers_test_config.py index 39c85c788eecad5c6bba6dbd2f2734725fa4fff6..27b22ecb701c52ab2a0a1f5f95d7b07186fbbb58 100644 --- a/python/paddle/trainer_config_helpers/tests/layers_test_config.py +++ b/python/paddle/trainer_config_helpers/tests/layers_test_config.py @@ -38,8 +38,11 @@ print_layer(input=[out]) outputs(classification_cost(out, data_layer(name="label", size=num_classes))) +dotmul = mixed_layer(input=[dotmul_operator(x=x1, y=y1), + dotmul_projection(input=y1)]) + # for ctc -tmp = fc_layer(input=x1, +tmp = fc_layer(input=[x1, dotmul], size=num_classes + 1, act=SoftmaxActivation()) ctc = ctc_layer(input=tmp,
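With `momentum=None` as the new default, `MomentumOptimizer` no longer forces `1e-3` into the generated configuration; the field is emitted only when the user passes a value, otherwise the (now also `None`) global default applies. A hedged usage sketch, with illustrative values:

```python
# Sketch only: leaving momentum unset keeps the momentum field out of
# the generated optimizer settings entirely.
from paddle.trainer_config_helpers import *

settings(batch_size=128,
         learning_rate=1e-3,
         learning_method=MomentumOptimizer())

# An explicit value is still honored:
# settings(..., learning_method=MomentumOptimizer(momentum=0.9))
```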