diff --git a/CMakeLists.txt b/CMakeLists.txt index 99c6c0d373052fa1be528ebb82c3d2f248e64bb0..92c866da8fc7c711fa0e983d4d31c9b0485ae760 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,7 +3,7 @@ cmake_minimum_required(VERSION 2.8) project(paddle CXX C) set(PADDLE_MAJOR_VERSION 0) set(PADDLE_MINOR_VERSION 8) -set(PADDLE_PATCH_VERSION 0b0) +set(PADDLE_PATCH_VERSION 0b1) set(PADDLE_VERSION ${PADDLE_MAJOR_VERSION}.${PADDLE_MINOR_VERSION}.${PADDLE_PATCH_VERSION}) set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake") @@ -15,7 +15,7 @@ find_package(Protobuf REQUIRED) find_package(PythonLibs 2.7 REQUIRED) find_package(PythonInterp 2.7 REQUIRED) find_package(ZLIB REQUIRED) -find_package(NumPy) +find_package(NumPy REQUIRED) find_package(Threads REQUIRED) find_package(Glog) find_package(Gflags QUIET) diff --git a/cmake/util.cmake b/cmake/util.cmake index 5f2f4a075cc579fac827fefbfc30f6743d2e4cc9..4e9efd3c187b0979dc042371b8ba5f256a484a9c 100644 --- a/cmake/util.cmake +++ b/cmake/util.cmake @@ -104,10 +104,9 @@ function(link_paddle_exe TARGET_NAME) ${PROTOBUF_LIBRARY} ${CMAKE_THREAD_LIBS_INIT} ${CBLAS_LIBS} - ${INTERAL_LIBS} ${ZLIB_LIBRARIES} - ${CMAKE_DL_LIBS} - ) + ${INTERAL_LIBS} + ${CMAKE_DL_LIBS}) if(WITH_PYTHON) target_link_libraries(${TARGET_NAME} diff --git a/demo/seqToseq/seqToseq_net.py b/demo/seqToseq/seqToseq_net.py index 479a64fa00d5fa5dd34220500251990b0baa1500..a9c0dd4af600c6a08b65f5f7f955380804deef3e 100644 --- a/demo/seqToseq/seqToseq_net.py +++ b/demo/seqToseq/seqToseq_net.py @@ -128,12 +128,16 @@ def gru_encoder_decoder(data_conf, return out decoder_group_name = "decoder_group" + group_inputs=[StaticInput(input=encoded_vector,is_seq=True), + StaticInput(input=encoded_proj,is_seq=True)] + if not is_generating: trg_embedding = embedding_layer( input=data_layer(name='target_language_word', size=target_dict_dim), size=word_vector_dim, param_attr=ParamAttr(name='_target_language_embedding')) + group_inputs.append(trg_embedding) # For decoder equipped with attention mechanism, in training, # target embedding (the ground truth) is the data input, @@ -142,22 +146,13 @@ def gru_encoder_decoder(data_conf, # for the recurrent_group. decoder = recurrent_group(name=decoder_group_name, step=gru_decoder_with_attention, - input=[ - StaticInput(input=encoded_vector, - is_seq=True), - StaticInput(input=encoded_proj, - is_seq=True), trg_embedding - ]) + input=group_inputs) lbl = data_layer(name='target_language_next_word', size=target_dict_dim) - cost = classification_cost(input=decoder, label=lbl, ) + cost = classification_cost(input=decoder, label=lbl) outputs(cost) else: - gen_inputs = [StaticInput(input=encoded_vector, - is_seq=True), - StaticInput(input=encoded_proj, - is_seq=True), ] # In generation, the decoder predicts a next target word based on # the encoded source sequence and the last generated target word.
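# (Added note, not part of the original config: both branches reuse the same
# group_inputs -- training appends the ground-truth trg_embedding, while the
# generation branch below appends a GeneratedInput that feeds back the last
# predicted word -- so recurrent_group and beam_search receive identical
# static encoder inputs.)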
@@ -171,10 +166,11 @@ def gru_encoder_decoder(data_conf, size=target_dict_dim, embedding_name='_target_language_embedding', embedding_size=word_vector_dim) - gen_inputs.append(trg_embedding) + group_inputs.append(trg_embedding) + beam_gen = beam_search(name=decoder_group_name, step=gru_decoder_with_attention, - input=gen_inputs, + input=group_inputs, id_input=data_layer(name="sent_id", size=1), dict_file=trg_dict_path, diff --git a/doc/build/contribute_to_paddle.md b/doc/build/contribute_to_paddle.md index 10d5d86311333c223d1024f520fccddcb4c5050d..06fcff61720755432c5618500ac509c5b3f867df 100644 --- a/doc/build/contribute_to_paddle.md +++ b/doc/build/contribute_to_paddle.md @@ -25,9 +25,12 @@ repo or just head straight to the command line: ```shell # Clone your fork to your local machine -git clone git@github.com:USERNAME/Paddle.git +git clone https://github.com/USERNAME/Paddle.git +``` +Then you can start to develop by making a local development branch: +```shell +git checkout -b MY_COOL_STUFF_BRANCH origin/master ``` -Then you can start to develop. ## Commit @@ -45,7 +48,7 @@ are the details if any. ## Keeping Fork Up to Date -Before pull your request, you shold sync you code from the latest PaddlePaddle. +Before issuing your pull request, you should sync your code with the latest PaddlePaddle. To do this, you'll need to add a remote at first: ```shell @@ -60,8 +63,7 @@ git remote -v Update your fork with the latest upstream changes: ```shell -git fetch upstream -git pull upstream master +git pull --rebase upstream HEAD ``` If there are no unique commits locally, git will simply perform a fast-forward. @@ -74,10 +76,26 @@ Now, your local master branch is up-to-date with everything modified upstream. ```shell # push to your repository in Github -git push origin master +git push origin HEAD ``` ## Pull Request Go to the page for your fork on GitHub, select your development branch, and click the **pull request button**. + +## Update your pull request with the latest version + +During the code review, your pull request may become stale because of new commits in +baidu/Paddle. GitHub allows an automatic update if there is no conflict. You can do this +by clicking the "Update Branch" button on your pull request page. However, in the case +of a conflict, you need to do the update manually on +your local repository: +```shell +git checkout MY_COOL_STUFF_BRANCH +git pull --rebase upstream HEAD +# You may need to resolve the conflict according to the git prompt. +# Make and test your code. +git push -f origin HEAD +``` +Now your Pull Request is updated with the latest version. diff --git a/doc/ui/api/trainer_config_helpers/activations.rst b/doc/ui/api/trainer_config_helpers/activations.rst index 294f6e4d3127e4324a30f5e7c2f2be27db639e8f..c4e14ed779efb6f6601d2c5fa41764f318c82848 100644 --- a/doc/ui/api/trainer_config_helpers/activations.rst +++ b/doc/ui/api/trainer_config_helpers/activations.rst @@ -12,6 +12,13 @@ AbsActivation :members: AbsActivation :noindex: +ExpActivation +=============== + +.. automodule:: paddle.trainer_config_helpers.activations + :members: ExpActivation + :noindex: +
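A minimal usage sketch (mirroring how sample_trainer_rnn_gen.conf later in this change uses the new activation; the layer and parameter names here are illustrative only):

.. code-block:: python

    with mixed_layer(size=num_words, act=ExpActivation()) as out:
        out += trans_full_matrix_projection(input=state,
                                            param_attr=ParamAttr(name="wordvec"))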
IdentityActivation ================== diff --git a/doc/ui/data_provider/pydataprovider2.rst b/doc/ui/data_provider/pydataprovider2.rst index 152f8a6df6634c6292b4f219f216881c7024f4e4..e105d3be308705d228c0b188e15742a0f7325ab6 100644 --- a/doc/ui/data_provider/pydataprovider2.rst +++ b/doc/ui/data_provider/pydataprovider2.rst @@ -24,7 +24,7 @@ A small part of the original data as an example is shown below: .. literalinclude:: ../../../doc_cn/ui/data_provider/mnist_train.txt -Each line of the data contains two parts, separated by ';'. The first part is +Each line of the data contains two parts, separated by :code:`;`. The first part is the label of an image. The second part contains 28x28 pixel float values. Just write the path of the above data into train.list. It looks like this: @@ -74,7 +74,20 @@ you can take this as an example. .. literalinclude:: ../../../doc_cn/ui/data_provider/mnist_config.py -Here we specify training data by 'train.list', and no testing data is specified. +Here we specify training data by :code:`train.list`, and no testing data is specified. +The method that actually provides the data is :code:`process`. + +Users can also use another style to provide data, which defines the +:code:`data_layer`'s name explicitly on `yield`. For example, +such a :code:`dataprovider` is shown below. + +.. literalinclude:: ../../../doc_cn/ui/data_provider/mnist_provider.dict.py + :linenos: + +If the user does not give the :code:`data_layer`'s name, PaddlePaddle will +roughly use the order of :code:`data_layer` definitions to determine which feature goes to +which :code:`data_layer`. This order may not be correct, so TO DEFINE THE +:code:`data_layer`'s NAMES EXPLICITLY IS THE RECOMMENDED WAY TO PROVIDE DATA. Now, this simple example of using PyDataProvider is finished. The only thing that the user should know is how to generate **one sample** from @@ -93,7 +106,7 @@ DataProvider for the sequential model ------------------------------------- A sequence model takes sequences as its input. A sequence is made up of several timesteps. The so-called timestep does not necessarily have anything to do -with 'time'. It can also be explained to that the order of data are taken into +with time. It can also be understood to mean that the order of the data is taken into consideration in model design and training. For example, the sentence can be interpreted as a kind of sequence data in NLP tasks. @@ -155,23 +168,7 @@ Reference @provider +++++++++ -'@provider' is a Python `Decorator`_, it can construct a PyDataProvider in -PaddlePaddle from a user defined function. Its parameters are: - -* `input_types`_ defines format of the data input. -* should_shuffle defines whether to shuffle data or not. By default, it is set - true during training, and false during testing. -* pool_size is the memory pool size (in sample number) in DataProvider. - -1 means no limit. -* can_over_batch_size defines whether PaddlePaddle can store little more - samples than pool_size. It is better to set True to avoid some deadlocks. -* calc_batch_size is a function define how to calculate batch size. This is - usefull in sequential model, that defines batch size is counted upon sequence - or token. By default, each sample or sequence counts to 1 when calculating - batch size. -* cache is a data cache strategy, see `cache`_. -* Init_hook function is invoked once the data provider is initialized, - see `init_hook`_. +.. autofunction:: paddle.trainer.PyDataProvider2.provider
input_types +++++++++++ diff --git a/doc_cn/ui/data_provider/mnist_config.py b/doc_cn/ui/data_provider/mnist_config.py index 0f9094cd2776fc36490b8314a760820251d4cc64..7ba344338c374a7f9e7e4faa804e2e124577c0be 100644 --- a/doc_cn/ui/data_provider/mnist_config.py +++ b/doc_cn/ui/data_provider/mnist_config.py @@ -4,3 +4,5 @@ define_py_data_sources2(train_list='train.list', test_list=None, module='mnist_provider', obj='process') +img = data_layer(name='pixel', size=784) +label = data_layer(name='label', size=10) diff --git a/doc_cn/ui/data_provider/mnist_provider.dict.py b/doc_cn/ui/data_provider/mnist_provider.dict.py new file mode 100644 index 0000000000000000000000000000000000000000..4eab5b1fd3b50a67a9cfee92883cce71ee1a2c87 --- /dev/null +++ b/doc_cn/ui/data_provider/mnist_provider.dict.py @@ -0,0 +1,25 @@ +from paddle.trainer.PyDataProvider2 import * + + +# Define a py data provider +@provider(input_types=[ + dense_vector(28 * 28), + integer_value(10) +]) +def process(settings, filename): # settings is not used currently. + f = open(filename, 'r') # open one of the training files + + for line in f: # read each line + label, pixel = line.split(';') + + # get features and label + pixels_str = pixel.split(' ') + + pixels_float = [] + for each_pixel_str in pixels_str: + pixels_float.append(float(each_pixel_str)) + + # give data to paddle. + yield { "pixel": pixels_float, 'label': int(label) } + + f.close() # close file diff --git a/doc_cn/ui/data_provider/pydataprovider2.rst b/doc_cn/ui/data_provider/pydataprovider2.rst index e743e4168821ff4713ddb015d03586ce82da4969..9e1d8c531f5ba2101d0f4d9506361e058b168181 100644 --- a/doc_cn/ui/data_provider/pydataprovider2.rst +++ b/doc_cn/ui/data_provider/pydataprovider2.rst @@ -56,6 +56,14 @@ just call :code:`yield` multiple times in the process function; :code:`yield` is one of Python's This states that the training data is 'train.list' and that there is no testing data. The referenced DataProvider is the 'process' function in the 'mnist_provider' module. +Meanwhile, based on the names of the :code:`data_layer` s in the model configuration file, users can also explicitly specify the correspondence of the returned data. For example: + +..
literalinclude:: mnist_provider.dict.py + :linenos: + +If the user does not specify the correspondence of the returned data, PaddlePaddle will roughly determine +the correspondence from the declaration order of the layers. This correspondence may be incorrect, so explicitly specifying the mapping between returned values and data is recommended. + At this point, the simple PyDataProvider example is fully explained. For the user, sending data to PaddlePaddle only requires knowing how to read **one** sample from **one file**, while the PaddlePaddle process helps the user with @@ -119,11 +127,13 @@ executed when the DataProvider is created. This initialization function has the following parameters: @provider +++++++++ -'@provider' is a Python `Decorator`_ that can mark a function as a PyDataProvider. Its parameters include: +:code:`@provider` is a Python `Decorator`_ that can mark a function as a PyDataProvider. Its parameters include: * `input_types`_ is the input data format. For the available formats, see `input_types`_. * should_shuffle controls whether this DataProvider shuffles. If not set, it shuffles by default during training, - and does not shuffle by default during testing + and does not shuffle by default during testing. +* min_pool_size sets the minimum number of samples the DataProvider buffers in memory. This is also the shuffle granularity PaddlePaddle can guarantee. + Set to -1, it reads all data into memory in advance. * pool_size sets the number of samples the DataProvider buffers in memory. Set to -1, it does not limit how many samples are buffered. * can_over_batch_size indicates whether Paddle may buffer slightly more than pool_size samples. Doing so avoids many deadlock problems. Setting it to True is generally recommended @@ -131,6 +141,11 @@ executed when the DataProvider is created. This initialization function has the following parameters: counts as one batch size, but sometimes, for balanced computation, one sample can be set to count as multiple batch sizes * cache is the data caching strategy; see `cache`_ * init_hook is the function called at initialization time; see `init_hook`_ +* use_dynamic_order: if true, a dict may be returned whose keys are data_layer names and whose values are the feature values; a list or tuple may + also be returned. If false, only a list or tuple can be returned +* check: if set to true, the validity of the data is checked against input_types. +* check_fail_continue: if set to true, a sample that fails the check is discarded and training continues. It has + no effect if check is false. input_types +++++++++++ @@ -190,3 +205,55 @@ DataProvider offers two simple cache strategies. They are * CacheType.NO_CACHE caches no data; data is read from the Python side every time * CacheType.CACHE_PASS_IN_MEM reads data from the Python side during the first pass; the remaining passes read the data directly from memory. + + +Notes +----- + +Potential memory leaks ++++++++++++++++++++++++ + +PaddlePaddle passes every line of train.list to the process function, thereby creating multiple generators. +That is, if train.list contains 100 training files, 100 generators are created. This by itself is not a very +serious problem. + +However, if at training time every single training sample is its own file, and there are very many samples, then +many generators are created. A generator that has not been called yet takes up almost no memory, but once it has +been called it stores its current context (Context), and this Context can be very +large. Moreover, a generator has to be called at least twice before it knows whether to stop. So even if process +contains only a single yield, the same generator has to be randomly selected twice before that memory is released. + +.. code-block:: python + + def func(): + yield 0 + + f = func() # create the generator + tmp = next(f) # first call, returns 0 + tmp = next(f) # only the second call raises StopIteration + +If the generators are called in order, however, this problem does not occur. + +So the recommended best practice is not to put every sample into train.list, but to put the sample addresses into another text +file and write that text file's address into train.list. Alternatively, keep +as few variable references as possible in the Python generator's context. For example + +.. code-block:: python + + def real_process(fn): + # ...
read from fn + return result # when the function returns, Python can release the references to the internal variables. + + def process(fn): + yield real_process(fn) + +This problem comes from the logic with which PyDataProvider reads data, and it basically cannot be fixed wholesale. + + +Running out of memory ++++++++++++++++++++++ + +PyDataProvider2 uses as much memory as it can. So on machines with relatively little memory, setting the +:code:`pool_size` variable is recommended. This variable should be larger than the training batch size, and, +as long as memory allows, the larger the better. + diff --git a/paddle/gserver/dataproviders/DataProvider.cpp b/paddle/gserver/dataproviders/DataProvider.cpp index ba05b70fe9a3de486c1b568bd24f1fcca8a67389..c3b4769f7612b76f5c467fee66826f0e84a6e787 100644 --- a/paddle/gserver/dataproviders/DataProvider.cpp +++ b/paddle/gserver/dataproviders/DataProvider.cpp @@ -149,9 +149,13 @@ void DoubleBuffer::startAsyncLoad() { taskReadySem_.post(); } -ClassRegistrar<DataProvider, DataConfig, bool> DataProvider::registrar_; -DataProvider* DataProvider::create(const DataConfig& config, bool useGpu) { - return registrar_.createByType(config.type(), config, useGpu); +ClassRegistrar<DataProvider, DataConfig, ModelConfig, bool> +DataProvider::registrar_; + +DataProvider* DataProvider::create(const DataConfig& config, + const ModelConfig& modelConfig, + bool useGpu) { + return registrar_.createByType(config.type(), config, modelConfig, useGpu); } REGISTER_DATA_PROVIDER(simple, SimpleDataProvider); diff --git a/paddle/gserver/dataproviders/DataProvider.h b/paddle/gserver/dataproviders/DataProvider.h index aab5d93fcaa1e7286db7c2aeb60c6d10695a5ced..534491d70d546734f2197de5b04a85a56d00d732 100644 --- a/paddle/gserver/dataproviders/DataProvider.h +++ b/paddle/gserver/dataproviders/DataProvider.h @@ -39,15 +39,30 @@ limitations under the License. */ #include "paddle/parameter/Argument.h" namespace paddle { - /** * @def REGISTER_DATA_PROVIDER - * @brief Macro for registering a data provider + * @brief Macro for registering a data provider. The class type should contain + * a constructor with parameter (DataConfig, bool). */ -#define REGISTER_DATA_PROVIDER(__type_name, __class_name) \ - static InitFunction __reg_type_##__type_name([]() { \ - DataProvider::registrar_.registerClass<__class_name>(#__type_name); \ - }) +#define REGISTER_DATA_PROVIDER(__type_name, __class_name)\ + static InitFunction __reg_type_##__type_name([]() {\ + DataProvider::registrar_.registerClass(\ + #__type_name, \ + [](DataConfig conf, ModelConfig, bool useGpu) -> DataProvider* { \ + DataProvider* dp = new __class_name (conf, useGpu);\ + return dp;\ + });\ +}) + +/** + * @def REGISTER_DATA_PROVIDER_EX + * @brief Macro for registering a data provider whose class contains a constructor + * with parameter (DataConfig, ModelConfig, bool). + */ +#define REGISTER_DATA_PROVIDER_EX(__type_name, __class_name) \ + static InitFunction __reg_type_##__type_name([] { \ + DataProvider::registrar_.registerClass<__class_name>(#__type_name); \ +}) class DataBatch; class BufferBatch; @@ -285,10 +300,18 @@ protected: */ class DataProvider { public: - static ClassRegistrar<DataProvider, DataConfig, bool> registrar_; + static ClassRegistrar<DataProvider, DataConfig, ModelConfig, bool> registrar_; static DataProvider* create(const DataConfig& config, + const ModelConfig& modelConfig, bool useGpu = FLAGS_use_gpu); + /** + * @brief This create overload is only used for unit tests. + */ + inline static DataProvider* create(const DataConfig &config, bool useGpu) { + return create(config, ModelConfig(), useGpu); + }
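// (Added note: DataProvider::create now threads the ModelConfig through to
// providers. REGISTER_DATA_PROVIDER adapts old (DataConfig, bool) constructors
// with a lambda that ignores the ModelConfig, while REGISTER_DATA_PROVIDER_EX
// registers classes whose constructor takes (DataConfig, ModelConfig, bool)
// directly.)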
DataProvider(const DataConfig& config, bool useGpu) : config_(config), skipShuffle_(false), @@ -336,13 +359,13 @@ public: * @note return -1 to indicate unlimited number of samples. */ virtual int64_t getSize() = 0; + /** * @brief Get next batch training samples internally * @param[in] size size of training samples to get * @param[out] batch a batch of training samples * @return actual size of obtained training samples */ - virtual int64_t getNextBatchInternal(int64_t size, DataBatch* batch) = 0; protected: diff --git a/paddle/gserver/dataproviders/MultiDataProvider.cpp b/paddle/gserver/dataproviders/MultiDataProvider.cpp index c3d14a7069bd3dc240e343ab9b11e17d35065269..8e4f53978a0451f3bb6cd5da30f017708448f9ac 100644 --- a/paddle/gserver/dataproviders/MultiDataProvider.cpp +++ b/paddle/gserver/dataproviders/MultiDataProvider.cpp @@ -22,7 +22,9 @@ namespace paddle { using namespace std; -MultiDataProvider::MultiDataProvider(const DataConfig& config, bool useGpu) +MultiDataProvider::MultiDataProvider(const DataConfig& config, + const ModelConfig& modelConfig, + bool useGpu) : DataProvider(config, useGpu) { bool atLeastOneMainDataFlag = false; totalDataRatio_ = 0; @@ -58,7 +60,9 @@ MultiDataProvider::MultiDataProvider(const DataConfig& config, bool useGpu) subConfig.set_async_load_data(false); } subDataProviders_[i] = - std::unique_ptr<DataProvider>(DataProvider::create(subConfig, useGpu_)); + std::unique_ptr<DataProvider>(DataProvider::create(subConfig, + modelConfig, + useGpu_)); } } @@ -116,6 +120,6 @@ int64_t MultiDataProvider::getNextBatchInternal(int64_t size, return batch->getSize(); } -REGISTER_DATA_PROVIDER(multi, MultiDataProvider); +REGISTER_DATA_PROVIDER_EX(multi, MultiDataProvider); } // namespace paddle diff --git a/paddle/gserver/dataproviders/MultiDataProvider.h b/paddle/gserver/dataproviders/MultiDataProvider.h index 714421286376b4dc1c1485e0264540cfe38d8f65..b498ba6516c4320566b1b3cc2bd557ae016d7c39 100644 --- a/paddle/gserver/dataproviders/MultiDataProvider.h +++ b/paddle/gserver/dataproviders/MultiDataProvider.h @@ -24,7 +24,9 @@ protected: std::vector<std::unique_ptr<DataProvider>> subDataProviders_; public: - MultiDataProvider(const DataConfig& config, bool useGpu); + MultiDataProvider(const DataConfig& config, + const ModelConfig& modelConfig, + bool useGpu); ~MultiDataProvider() {} virtual void reset(); virtual void shuffle(); diff --git a/paddle/gserver/dataproviders/PyDataProvider2.cpp b/paddle/gserver/dataproviders/PyDataProvider2.cpp index 8e51752dc29ee317dd268d6b7d444958e75ef88e..0b41f6a02aecc6fe8dd3d305db3f1108191c08a9 100644 --- a/paddle/gserver/dataproviders/PyDataProvider2.cpp +++ b/paddle/gserver/dataproviders/PyDataProvider2.cpp @@ -24,6 +24,27 @@ limitations under the License. */ namespace paddle { +namespace unittest { + +static std::unique_ptr<std::function<void(size_t)>> + OnPoolFilled; + +namespace pydp2 { + +void setOnPoolFilledHook(const std::function<void(size_t)>& callback) { + OnPoolFilled.reset(new std::function<void(size_t)>()); + *OnPoolFilled = callback; +} + +void clearOnPoolFilledHook() { + OnPoolFilled.reset(); +} + +} // namespace pydp2 +} // namespace unittest + + + /** * Slot type */ @@ -179,6 +200,7 @@ public: * Ctor */ PyDataProvider2(const DataConfig& config, + const ModelConfig& modelConfig, bool useGpu) :DataProvider(config, useGpu), callingContextCreated_(2) { auto& args = config.load_data_args(); @@ -192,6 +214,12 @@ public: py::DictHelper kwargsDict(kwargs); kwargsDict.setBool("is_train", !config.for_test()); + std::vector<std::string> inputs; + inputs.reserve(modelConfig.input_layer_names().size()); + std::copy(modelConfig.input_layer_names().begin(), + modelConfig.input_layer_names().end(), + std::back_inserter(inputs)); + kwargsDict.setStringList("input_order", inputs); // kwargs are the keyword arguments used to create the object.
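// (Added commentary, not part of the original change: input_order is consumed
// by InputOrderWrapper on the Python side; when the user's generator yields a
// dict, the wrapper reorders its values to match these input layer names
// before the data reaches this C++ provider.)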
this->createPyDataObj(config.load_data_module(), @@ -199,7 +227,7 @@ config.load_data_object(), config.files(), std::move(kwargs)); DBG << "Instance " << instance_.get() << " loaded."; - this->readPyFields(); + this->readPyFields(config.for_test()); DBG << "Py Field Done"; } @@ -253,14 +281,28 @@ private: CHECK_PY(instance_) << "Cannot Create instance"; } - void readPyFields() { + void readPyFields(bool testing) { py::ObjectHelper self(this->instance_); - this->skipShuffle_ = !self.getBoolAttr("should_shuffle"); bool ok; + + this->skipShuffle_ = !self.getBoolAttr("should_shuffle", + &ok /*isBoolType*/); + if (!ok) { + this->skipShuffle_ = testing; // shuffle when training, skip shuffling + // when testing. + } + DBG << "Provider Skip Shuffle " << this->skipShuffle_; + this->poolSize_ = self.getIntAttr("pool_size", &ok); if (!ok) { this->poolSize_ = -1UL; } + this->minPoolSize_ = self.getIntAttr("min_pool_size", &ok); + if (!ok) { + this->minPoolSize_ = -1UL; + } + this->minPoolSize_ = std::min(this->poolSize_, this->minPoolSize_); + this->canOverBatchSize_ = self.getBoolAttr("can_over_batch_size"); calcBatchSize_.reset(self.getAttr("calc_batch_size")); @@ -307,7 +349,6 @@ private: } void loadThread() { - callingContexts_.reserve(fileLists_.size()); DBG << "Creating context"; for (auto& filename : fileLists_) { PyGuard g; @@ -332,7 +373,14 @@ private: bool atEnd; data = py::iterNext(callingContexts_[cid], &atEnd); if (atEnd || data == nullptr) { - callingContexts_.erase(callingContexts_.begin() + cid); + if (cid != 0) { + std::swap(callingContexts_[cid], callingContexts_[0]); + cid = 0; + } + { + PyGuard g; + callingContexts_.pop_front(); + } this->pullCV_.notify_all(); continue; } @@ -354,11 +402,7 @@ private: if (this->loadThread_){ // wait poolActualSize < poolSize; std::unique_lock<std::mutex> l(mtx_); pushCV_.wait(l, [this, additionalBatchSize] { - if (this->canOverBatchSize_) { - return this->poolActualSize_ < poolSize_; - } else { - return this->poolActualSize_ + additionalBatchSize < poolSize_; - } + return this->poolActualSize_ < poolSize_; }); } @@ -402,7 +446,7 @@ private: private: std::unique_ptr<std::thread> loadThread_; std::atomic<bool> exit_; - std::vector<PyObjectPtr> callingContexts_; + std::deque<PyObjectPtr> callingContexts_; std::deque<PyObjectPtr> dataPool_; size_t poolActualSize_; std::condition_variable pushCV_; @@ -413,6 +457,7 @@ private: PyObjectPtr instance_; size_t poolSize_; + size_t minPoolSize_; bool canOverBatchSize_; PyObjectPtr calcBatchSize_; PyObjectPtr generator_; @@ -478,8 +523,13 @@ public: // data pool ready. std::unique_lock<std::mutex> l(mtx_); pullCV_.wait(l, [this, &size] { - return this->poolActualSize_ >= size || callingContexts_.empty(); + return this->poolActualSize_ >= std::max(size, this->minPoolSize_) + || callingContexts_.empty(); }); + + if (unittest::OnPoolFilled) { + (*unittest::OnPoolFilled)(this->poolActualSize_); + } } std::deque<PyObjectPtr> data; size_t bsize = 0; @@ -495,7 +545,8 @@ public: std::deque<PyObjectPtr>& pool = *poolPtr; while (bsize < size && !pool.empty()) { - { // move data from pool to data + { + // move data from pool to data std::lock_guard<std::mutex> guard(mtx_); if (skipShuffle_) { size_t i = 0; @@ -505,14 +556,13 @@ public: } else { // when shuffle, use swap to drop only last pool element.
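// (Added note, inferred from this diff: pool is now a std::deque, so the
// randomly picked element is swapped to the front and taken with pop_front;
// this pairs with the push_front below that returns a rejected sample to the
// pool when calc_batch_size would overflow the batch and can_over_batch_size
// is false.)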
size_t i = ThreadLocalRand::rand() % pool.size(); CHECK(pool[i] != nullptr); - if (i != pool.size() - 1) { - std::swap(pool[i], pool.back()); + if (i != 0) { + std::swap(pool[i], pool.front()); } - data.emplace_back(std::move(pool.back())); - pool.pop_back(); + data.emplace_back(std::move(pool.front())); + pool.pop_front(); } - } - { + if (calcBatchSize_) { // custom calc batch size. PyGuard guard; Py_INCREF(data.back().get()); py::CallableHelper calcBatchSize(this->calcBatchSize_); calcBatchSize.getArgs().set(0, data.back()); PyObjectPtr customBatchSize(calcBatchSize()); bool ok; - bsize += py::castInt(customBatchSize.get(), &ok); + size_t tmp = py::castInt(customBatchSize.get(), &ok); CHECK(ok) << "calc_batch_size must return int"; + + if (bsize + tmp > size && !canOverBatchSize_) { + // Put data back. + pool.push_front(std::move(data.back())); + data.pop_back(); + break; + } else { + bsize += tmp; + } } else { bsize += 1; } @@ -598,7 +657,6 @@ public: } else { *batch = cpuBatch; } - return bsize; } }; @@ -606,7 +664,8 @@ public: std::unordered_set<uintptr_t> PyDataProvider2::gModuleClsPtrs_; PyObjectPtr PyDataProvider2::zeroTuple_(PyTuple_New(0)); -REGISTER_DATA_PROVIDER(py2, PyDataProvider2); +REGISTER_DATA_PROVIDER_EX(py2, PyDataProvider2); + /** * Scanner for dense slot. diff --git a/paddle/gserver/layers/CRFLayer.cpp b/paddle/gserver/layers/CRFLayer.cpp index fb0a0ddb3d45bac5339b6eb4a11ba3c01d0bd97f..c1dcad2b5f2a840ba06e8ef9833eee7a6e5e20cb 100644 --- a/paddle/gserver/layers/CRFLayer.cpp +++ b/paddle/gserver/layers/CRFLayer.cpp @@ -31,7 +31,7 @@ bool CRFLayer::init(const LayerMap& layerMap, } // coeff only affects bp, keep consistent with CostLayer - coeff_ = config_.has_coeff() ? config_.coeff() : real(1.0); + coeff_ = config_.coeff(); if (inputLayers_.size() == 3) { weightLayer_ = inputLayers_[2]; } diff --git a/paddle/gserver/layers/CostLayer.cpp b/paddle/gserver/layers/CostLayer.cpp index 0f99aee03200c3834c7c27343f41f77edc5a558e..14ff8510f7b19dc24b7b1ba603485488ddd4979d 100644 --- a/paddle/gserver/layers/CostLayer.cpp +++ b/paddle/gserver/layers/CostLayer.cpp @@ -26,11 +26,7 @@ namespace paddle { bool CostLayer::init(const LayerMap& layerMap, const ParameterMap& parameterMap) { bool ret = Layer::init(layerMap, parameterMap); - if (config_.has_coeff()) { - coeff_ = config_.coeff(); // coeff only affact bp - } else { - coeff_ = real(1.0); - } + coeff_ = config_.coeff(); if (!ret) return ret; CHECK_GE(inputLayers_.size(), 2UL); CHECK_LE(inputLayers_.size(), 3UL); diff --git a/paddle/gserver/tests/rnn_data_provider.py b/paddle/gserver/tests/rnn_data_provider.py index 85a83554c5c3045d144ee0250d2808237eccc9e0..347d5891b906b4b4779764af4e838b5d099b360b 100644 --- a/paddle/gserver/tests/rnn_data_provider.py +++ b/paddle/gserver/tests/rnn_data_provider.py @@ -19,14 +19,18 @@ data = [ [[[0, 2], [2, 5], [0, 1, 2]], 1], ] + @provider(input_types=[integer_value_sub_sequence(10), - integer_value(2)]) + integer_value(2)], + should_shuffle=False) def process_subseq(settings, file_name): for d in data: yield d + @provider(input_types=[integer_value_sequence(10), - integer_value(2)]) + integer_value(2)], + should_shuffle=False) def process_seq(settings, file_name): for d in data: seq = [] diff --git a/paddle/gserver/tests/sequenceGen.py b/paddle/gserver/tests/sequenceGen.py index cb83d79d78cc677d5ffeb77f5693d08da2a51668..cbed1f15fc4157ea29bddf5ba410d5e05271e04c 100644 --- a/paddle/gserver/tests/sequenceGen.py +++ b/paddle/gserver/tests/sequenceGen.py @@ -17,22 +17,26 @@ import sys from paddle.trainer.PyDataProvider2 import * + def
hook(settings, dict_file, **kwargs): settings.word_dict = dict_file - settings.input_types = [integer_value_sequence(len(settings.word_dict)), + settings.input_types = [integer_value_sequence(len(settings.word_dict)), integer_value_sequence(3)] settings.logger.info('dict len : %d' % (len(settings.word_dict))) -@provider(init_hook=hook) + +@provider(init_hook=hook, should_shuffle=False) def process(settings, file_name): with open(file_name, 'r') as fdata: for line in fdata: label, comment = line.strip().split('\t') label = int(''.join(label.split())) words = comment.split() - word_slot = [settings.word_dict[w] for w in words if w in settings.word_dict] + word_slot = [settings.word_dict[w] for w in words if + w in settings.word_dict] yield word_slot, [label] + ## for hierarchical sequence network def hook2(settings, dict_file, **kwargs): settings.word_dict = dict_file @@ -40,17 +44,19 @@ def hook2(settings, dict_file, **kwargs): integer_value_sub_sequence(3)] settings.logger.info('dict len : %d' % (len(settings.word_dict))) -@provider(init_hook=hook2) + +@provider(init_hook=hook2, should_shuffle=False) def process2(settings, file_name): with open(file_name) as fdata: label_list = [] word_slot_list = [] for line in fdata: if (len(line)) > 1: - label,comment = line.strip().split('\t') + label, comment = line.strip().split('\t') label = int(''.join(label.split())) words = comment.split() - word_slot = [settings.word_dict[w] for w in words if w in settings.word_dict] + word_slot = [settings.word_dict[w] for w in words if + w in settings.word_dict] label_list.append([label]) word_slot_list.append(word_slot) else: diff --git a/paddle/gserver/tests/test_PyDataProvider2.cpp b/paddle/gserver/tests/test_PyDataProvider2.cpp index 824295eb6e9f2461497e4151d7b0f1d603d93a32..c5fe31b29187f4a5b429a928d1870a06848691fa 100644 --- a/paddle/gserver/tests/test_PyDataProvider2.cpp +++ b/paddle/gserver/tests/test_PyDataProvider2.cpp @@ -20,6 +20,18 @@ limitations under the License. 
*/ #include "paddle/gserver/dataproviders/DataProvider.h" P_DEFINE_string(train_list, "unittest.list", "file list for unittest"); + +namespace paddle { +namespace unittest { +namespace pydp2 { +extern void setOnPoolFilledHook(const std::function<void(size_t)>& func); +extern void clearOnPoolFilledHook(); + +} // namespace pydp2 +} // namespace unittest +} // namespace paddle + + const paddle::real epsilon = 1e-5; static inline int64_t readDataBatch( @@ -235,6 +247,112 @@ TEST(PyDataProvider2, index_sub_seq) { } } +TEST(PyDataProvider2, min_pool_size) { + paddle::DataConfig config; + config.set_type("py2"); + config.set_files(FLAGS_train_list.c_str()); + config.set_load_data_module("test_PyDataProvider2"); + config.set_load_data_object("test_min_pool_size"); + config.set_load_data_args(""); + size_t totalData = 1 << 14; + constexpr size_t batchSize = 100; + constexpr size_t minPoolSize = 1000; + paddle::DataBatch batch; + std::unique_ptr<paddle::DataProvider> provider( + paddle::DataProvider::create(config, false)); + provider->reset(); + + paddle::unittest::pydp2::setOnPoolFilledHook([&](size_t poolSize) { + if (totalData > batchSize) { + CHECK_GE(poolSize, std::min(totalData-batchSize, minPoolSize)); + } + }); + while (true) { + size_t realBatchSize = provider->getNextBatchInternal(batchSize, &batch); + if (realBatchSize) { + totalData -= realBatchSize; + } else { + break; + } + } + paddle::unittest::pydp2::clearOnPoolFilledHook(); +} + +TEST(PyDataProvider2, can_over_batch_size) { + paddle::DataConfig config; + config.set_type("py2"); + config.set_files(FLAGS_train_list.c_str()); + config.set_load_data_module("test_PyDataProvider2"); + config.set_load_data_object("test_can_over_batch_size"); + config.set_load_data_args(""); + paddle::DataBatch batch; + std::unique_ptr<paddle::DataProvider> provider( + paddle::DataProvider::create(config, false)); + provider->reset(); + constexpr size_t batchSize = 100; + while (true) { + size_t realBatchSize = provider->getNextBatchInternal(batchSize, &batch); + if (realBatchSize) { + CHECK_LE(realBatchSize, batchSize); + } else { + break; + } + } +} + +TEST(PyDataProvider2, input_order) { + paddle::DataConfig config; + config.set_type("py2"); + config.set_files(FLAGS_train_list.c_str()); + config.set_load_data_module("test_PyDataProvider2"); + config.set_load_data_object("test_input_order"); + config.set_load_data_args(""); + + paddle::ModelConfig modelConfig; + *modelConfig.add_input_layer_names() = "input1"; + *modelConfig.add_input_layer_names() = "input2"; + paddle::DataBatch batch; + std::unique_ptr<paddle::DataProvider> provider( + paddle::DataProvider::create(config, modelConfig, false)); + provider->reset(); + constexpr size_t batchSize = 100; + while (true) { + size_t realBatchSize = provider->getNextBatchInternal(batchSize, &batch); + if (!realBatchSize) { + break; + } + ASSERT_EQ(batch.getStreams().size(), 2); + for (size_t i = 0; i < realBatchSize; ++i) { + ASSERT_EQ(batch.getStream(0).ids->getData()[i], 0); + ASSERT_EQ(batch.getStream(1).ids->getData()[i], 1); + } + } +} + +TEST(PyDataProvider2, test_check) { + paddle::DataConfig config; + config.set_type("py2"); + config.set_files(FLAGS_train_list.c_str()); + config.set_load_data_module("test_PyDataProvider2"); + config.set_load_data_object("test_check"); + config.set_load_data_args(""); + paddle::DataBatch batch; + std::unique_ptr<paddle::DataProvider> provider( + paddle::DataProvider::create(config, false)); + provider->reset(); + while (true) { + size_t realBatchSize = provider->getNextBatchInternal(100, &batch); + if (!realBatchSize) { + break; + } else { + auto& ivec =
batch.getStream(0).ids; + for (size_t i=0; i < ivec->getSize(); ++i) { + CHECK_LT(ivec->getData()[i], 10); + } + } + } +} + int main(int argc, char** argv) { testing::InitGoogleTest(&argc, argv); paddle::initMain(argc, argv); diff --git a/paddle/gserver/tests/test_PyDataProvider2.py b/paddle/gserver/tests/test_PyDataProvider2.py index a88c48cb4e295d52e69e770a8906fa857c878c22..145fe85cff7d88e73233068f956489a0c2259abe 100644 --- a/paddle/gserver/tests/test_PyDataProvider2.py +++ b/paddle/gserver/tests/test_PyDataProvider2.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +import random + from paddle.trainer.PyDataProvider2 import * @@ -39,7 +41,8 @@ def test_init_hook(setting, filename): @provider( - input_types=[sparse_binary_vector(30000, seq_type=SequenceType.NO_SEQUENCE)]) + input_types=[ + sparse_binary_vector(30000, seq_type=SequenceType.NO_SEQUENCE)]) def test_sparse_non_value_no_seq(setting, filename): for i in xrange(200): yield [(i + 1) * (j + 1) for j in xrange(10)] @@ -66,3 +69,43 @@ def test_index_sub_seq(setting, filename): for i in xrange(200): yield list(gen_sub_seq(i)) + + +@provider(input_types=[index_slot(100)], min_pool_size=1000) +def test_min_pool_size(setting, filename): + for _ in xrange(1 << 14): + yield random.randint(0, 100 - 1) + + +@provider(input_types=[index_slot(100, seq_type=SequenceType.SEQUENCE)], + can_over_batch_size=False, + calc_batch_size=lambda x: len(x[0])) +def test_can_over_batch_size(setting, filename): + for _ in xrange(1 << 10): + seq_len = random.randint(0, 99) + yield [random.randint(0, 100 - 1) for _ in xrange(seq_len)] + + +@provider(input_types=[index_slot(10), index_slot(10)]) +def test_input_order(setting, filename): + for _ in xrange(1000): + yield { + 'input1': 0, + 'input2': 1 + } + + +@provider(input_types=[index_slot(10)], + check=True, + check_fail_continue=True, + should_shuffle="123") # also test should_shuffle parsing +def test_check(settings, filename): + yield_good_value = False + + while not yield_good_value: + for _ in xrange(10000): + i = random.randint(0, 100) + if i < 10: + yield_good_value = True + yield i + diff --git a/paddle/trainer/Trainer.cpp b/paddle/trainer/Trainer.cpp index 84d2ee1e73a54ab96b2dd5d9885df366656b915d..275150e12d12b57550ce45355cb3c533b57b4b86 100644 --- a/paddle/trainer/Trainer.cpp +++ b/paddle/trainer/Trainer.cpp @@ -194,7 +194,7 @@ void Trainer::init(const std::shared_ptr<TrainerConfigHelper> &config, dataProvider_ = dataProvider; if (!dataProvider_ && config_->hasDataConfig()) { - dataProvider_.reset(DataProvider::create(*config_, gpuData)); + dataProvider_.reset(DataProvider::create(*config_, *config_, gpuData)); } if (dataProvider_) { evaluator_.reset(trainerInternal_.getGradientMachine()->makeEvaluator()); @@ -212,7 +212,7 @@ void Trainer::init(const std::shared_ptr<TrainerConfigHelper> &config, testDataProvider_ = testDataProvider; if (!testDataProvider_ && config_->hasTestDataConfig()) { testDataProvider_.reset( - DataProvider::create(config_->getTestDataConfig(), gpuData)); + DataProvider::create(config_->getTestDataConfig(), *config_, gpuData)); } if (testDataProvider_) { tester_.reset(new Tester(config_, createTesterConfig(), diff --git a/paddle/trainer/tests/.gitignore b/paddle/trainer/tests/.gitignore index 79f701203671cda6a295db4594e10a7df4332d29..aedb0ef22e02344af27d18dc3f500fab23f6686f 100644 --- a/paddle/trainer/tests/.gitignore +++ b/paddle/trainer/tests/.gitignore @@ -1,2 +1,3 @@ dump_text.test test_pydata_provider_wrapper.json +*proto.bin diff --git
a/paddle/trainer/tests/sample_trainer_rnn_gen.conf b/paddle/trainer/tests/sample_trainer_rnn_gen.conf index 5b65310e7649cba90682fcb60f808b01653876ba..abb6e9b179326ba6beb1509b6af9bf0a4e2d6338 100644 --- a/paddle/trainer/tests/sample_trainer_rnn_gen.conf +++ b/paddle/trainer/tests/sample_trainer_rnn_gen.conf @@ -13,96 +13,53 @@ # See the License for the specific language governing permissions and # limitations under the License. -#Todo(luotao02) This config is only used for unitest. It is out of date now, and will be updated later. -import math +from paddle.trainer_config_helpers import * -beam_search = get_config_arg('beam_search', bool, False) - -model_type("recurrent_nn") - -Settings(learning_rate=0, batch_size=15, algorithm='sgd') - -Inputs("sent_id", "dummy_data_input") -Outputs("predict_word") +settings(batch_size=15, learning_rate=0) num_words = 5 +beam_flag = get_config_arg('beam_search', bool, False) -DataLayer(name="sent_id", size=1, ) +sent_id = data_layer(name="sent_id", size=1) # This layer has no actual use, but only to decide batch_size in generation. # When generating, at least one Memory in RecurrentLayer MUST have a boot layer. -DataLayer(name="dummy_data_input", size=2, ) - -if beam_search: - RecurrentLayerGroupBegin("decoding_layer_group", - in_links=[], - out_links=["predict_word"], - generator=Generator(max_num_frames=10, - beam_size=2, - num_results_per_sample=2, )) -else: - RecurrentLayerGroupBegin("decoding_layer_group", - in_links=[], - out_links=["predict_word"], - generator=Generator(max_num_frames=10, )) -dummy_memory = Memory(name="dummy_memory", - size=2, - boot_layer="dummy_data_input") -MixedLayer(name="dummy_memory", - size=2, - bias=False, - inputs=[IdentityProjection(dummy_memory)], ) -state_memory = Memory(name="state", - size=num_words, - #boot_bias=True, - #boot_bias_active_type = "tanh", - ) - -predict_word_memory = Memory(name="predict_word", - size=num_words, - boot_with_const_id=0, ) - -MixedLayer( - name = "word_embedding", - size = num_words, # word embedding dim is the same as num_words in this test. 
- bias = False, - inputs = TableProjection(predict_word_memory, - initial_std=1, - learning_rate=0, - parameter_name="wordvec")) - -Layer( # simplified RNN for testing - name="state", - type="mixed", - size=num_words, - bias=False, - inputs=[FullMatrixProjection("word_embedding", - parameter_name="transtable")]) - -Layer(name="output", - type="mixed", - size=num_words, - active_type="exponential", - bias=False, - inputs=TransposedFullMatrixProjection("state", - initial_std=1, - learning_rate=0, - parameter_name="wordvec"), ) - -Layer(name="predict_word", type="maxid", inputs=["output"], ) - -Layer(name="eos_check", - type="eos_id", - eos_id=num_words - 1, - inputs=["predict_word"], ) -RecurrentLayerGroupEnd("decoding_layer_group") - -Evaluator(name="answer_printer", - type="seq_text_printer", - dict_file="./trainer/tests/test_gen_dict.txt", - result_file="./trainer/tests/dump_text.test", - inputs=[ - "sent_id", - "predict_word", - ], ) +dummy_data = data_layer(name="dummy_data_input", size=2) + +gen_inputs = [StaticInput(input=dummy_data, size=2), + GeneratedInput(size=num_words, + embedding_name="wordvec", + embedding_size=num_words)] + +def step(dummy_memory, predict_word): + + # simplified RNN for testing + with mixed_layer(size=num_words) as layer: + layer += full_matrix_projection(input=predict_word, + param_attr=ParamAttr(name="transtable")) + + with mixed_layer(size=num_words, act=ExpActivation()) as out: + out += trans_full_matrix_projection(input=layer, + param_attr=ParamAttr(name="wordvec")) + + return out + +beam_gen = beam_search(name="rnn_gen", + step=step, + input=gen_inputs, + id_input=sent_id, + dict_file="./trainer/tests/test_gen_dict.txt", + result_file="./trainer/tests/dump_text.test", + bos_id=0, + eos_id=num_words-1, + beam_size=2 if beam_flag else 1, + num_results_per_sample=2 if beam_flag else 1, + max_length=10) + +#outputs(beam_gen) +# In this config, as dummy_data_input doesn't work on beam_gen (we can see that dummy_memory +# is read-only memory and isn't used by other layers of the step), we show the Inputs and Outputs +# as follows. Note that "__beam_search_predict__" is the default output name of beam_search. +Inputs("sent_id","dummy_data_input") +Outputs("__beam_search_predict__") diff --git a/paddle/utils/PythonUtil.h b/paddle/utils/PythonUtil.h index 397229d803df9d750a440498ad5a90b779597ee9..2808338fbdf596c99a122d68c1ead2fe6de6a3c5 100644 --- a/paddle/utils/PythonUtil.h +++ b/paddle/utils/PythonUtil.h @@ -183,10 +183,21 @@ public: /** * Get bool attribute. * @param field + * @param [out] isBoolType returns true if the attribute is of bool type. If the + * attribute is not of bool type, an implicit + * conversion happens, and the conversion + * result is returned. + * + * For example, if the attribute is 1, the return + * value of the function will be true, but isBoolType + * will return false. * @return */ - bool getBoolAttr(const std::string& field) const { + bool getBoolAttr(const std::string& field, bool* isBoolType = nullptr) const { PyObjectPtr tmp(getAttr(field)); + if (isBoolType) { + *isBoolType = PyBool_Check(tmp.get()); + } return PyObject_IsTrue(tmp.get()); }
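// (Added note: PyDataProvider2::readPyFields above uses this overload -- it
// passes &ok as isBoolType and falls back to a default skipShuffle_ when the
// Python attribute is absent or not a real bool.)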
@@ -266,6 +277,15 @@ public: this->set(key, PyBool_FromLong(b)); } + void setStringList(const std::string& key, + const std::vector<std::string>& items) { + auto * list = PyList_New(items.size()); + for (size_t i=0; i < items.size(); ++i) { + PyList_SetItem(list, i, PyString_FromString(items[i].c_str())); + } + this->set(key, list); + } + private: inline void checkDict() { CHECK(PyDict_Check(this->dict_)); diff --git a/proto/ModelConfig.proto.m4 b/proto/ModelConfig.proto.m4 index a2b243a7869eaff120b25ece35e95be4d4284d18..b32f8b1ee90723e7bfdd4cbd5d93a35ac22b6b6d 100644 --- a/proto/ModelConfig.proto.m4 +++ b/proto/ModelConfig.proto.m4 @@ -299,7 +299,7 @@ sinclude(`ModelConfigLayer.proto.m4') optional bool norm_by_times = 25; // for CostLayers - optional real coeff = 26; + optional real coeff = 26 [default = 1.0]; // for AverageLayer // can be set to: 'average', 'sum' or 'squarerootn' diff --git a/proto/ParameterConfig.proto.m4 b/proto/ParameterConfig.proto.m4 index 222e070089116e68b0b29034280f12767ce21cd6..e8d512445e5025f5663fbe3e20b4425cf1633a2b 100644 --- a/proto/ParameterConfig.proto.m4 +++ b/proto/ParameterConfig.proto.m4 @@ -31,8 +31,8 @@ message ParameterUpdaterHookConfig { message ParameterConfig { required string name = 1; required uint64 size = 2; - required real learning_rate = 3; - required real momentum = 4; + optional real learning_rate = 3 [default = 1.0]; + optional real momentum = 4 [default = 0.0]; optional real initial_mean = 5 [default = 0.0]; optional real initial_std = 6 [default = 0.01]; // use L2-regularization if decay_rate set and decay_rate_l1 not set @@ -54,8 +54,8 @@ message ParameterConfig { optional int32 num_batches_regularization = 13 [default = 1]; // if is_sparse is true, para is sparse, else para is dense optional bool is_sparse = 14[default = false]; - // if para is sparse, format should be "csc" or "csr" - optional string format = 15[default = "csr"]; + // if para is sparse, format should be "csc" or "csr"; an empty string means it is not sparse + optional string format = 15 [default = ""]; // sparse remote update or not optional bool sparse_remote_update = 16 [default = false]; // gradient clipping threshold, no clipping by default
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index fd9a003bb018c87fb8e8e2992390f27edfd72f4b..dce0b909524369926eda54763e571706b79daeaf 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -1,6 +1,14 @@ set(OUTPUT_DIR "${CMAKE_CURRENT_BINARY_DIR}/build") +file(GLOB TRAINER_PY_FILES . ./paddle/trainer/*.py) +file(GLOB HELPERS_PY_FILES . ./paddle/trainer_config_helpers/*.py) +file(GLOB UTILS_PY_FILES . ./paddle/utils/*.py) + +set(PY_FILES paddle/__init__.py + ${TRAINER_PY_FILES} + ${HELPERS_PY_FILES} + ${UTILS_PY_FILES}) set(PADDLE_INTERNAL_PACKAGE "") if (PADDLE_WITH_INTERNAL) @@ -13,7 +21,7 @@ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in add_custom_command(OUTPUT ${OUTPUT_DIR}/.timestamp COMMAND ${PYTHON_EXECUTABLE} setup.py bdist_wheel COMMAND ${CMAKE_COMMAND} -E touch ${OUTPUT_DIR}/.timestamp - DEPENDS gen_proto_py) + DEPENDS gen_proto_py ${PY_FILES}) add_custom_target(paddle_python ALL DEPENDS ${OUTPUT_DIR}/.timestamp) diff --git a/python/paddle/trainer/PyDataProvider2.py b/python/paddle/trainer/PyDataProvider2.py index c4f61473933d04126541d0f95451c06601ba4c50..34f5dd41b7e683bbfa71e8a3e23ff3f542b39591 100644 --- a/python/paddle/trainer/PyDataProvider2.py +++ b/python/paddle/trainer/PyDataProvider2.py @@ -14,6 +14,13 @@ import cPickle import logging +import collections +import functools +import itertools + +logging.basicConfig( + format="[%(levelname)s %(asctime)s %(filename)s:%(lineno)s]" + " %(message)s") class SequenceType(object): @@ -68,30 +75,39 @@ sparse_binary_vector = sparse_non_value_slot sparse_vector = sparse_value_slot integer_value = index_slot + def dense_vector_sequence(dim): return dense_vector(dim, seq_type=SequenceType.SEQUENCE) + def dense_vector_sub_sequence(dim): return dense_vector(dim, seq_type=SequenceType.SUB_SEQUENCE) + def sparse_binary_vector_sequence(dim): return sparse_binary_vector(dim, seq_type=SequenceType.SEQUENCE) + def sparse_binary_vector_sub_sequence(dim): return sparse_binary_vector(dim, seq_type=SequenceType.SUB_SEQUENCE) + def sparse_vector_sequence(dim): return sparse_vector(dim, seq_type=SequenceType.SEQUENCE) + def sparse_vector_sub_sequence(dim): return sparse_vector(dim, seq_type=SequenceType.SUB_SEQUENCE) + def integer_value_sequence(dim): return integer_value(dim, seq_type=SequenceType.SEQUENCE) + def integer_value_sub_sequence(dim): return integer_value(dim, seq_type=SequenceType.SUB_SEQUENCE) + def integer_sequence(dim): return index_slot(dim, seq_type=SequenceType.SEQUENCE) @@ -102,13 +118,97 @@ class SingleSlotWrapper(object): def __call__(self, obj, filename): for item in self.generator(obj, filename): - yield [item] + if isinstance(item, dict): + yield item + else: + yield [item] -def provider(input_types=None, should_shuffle=True, pool_size=-1, +class InputOrderWrapper(object): + def __init__(self, generator, input_order): + self.generator = generator + self.input_order = input_order + + def __call__(self, obj, filename): + for item in self.generator(obj, filename): + if isinstance(item, dict): + yield [item.get(input_name, None) for input_name in + self.input_order] + else: + yield item + + +class CheckWrapper(object): + def __init__(self, generator, input_types, check_fail_continue, logger): + self.generator = generator + self.input_types = input_types + self.check_fail_continue = check_fail_continue + self.logger = logger + + def __call__(self, obj, filename): + for items in self.generator(obj, filename): + try: + assert len(items) == len(self.input_types) + assert len(filter(lambda x: x is None, items)) == 0 + for item, input_type in itertools.izip(items, self.input_types): + callback = functools.partial(CheckWrapper.loop_callback, + input_type) + + for _ in xrange(input_type.seq_type): + callback = functools.partial(CheckWrapper.loop_check, + callback) + callback(item) + + yield items + except AssertionError as e: + self.logger.warning( + "Item (%s) does not fit the input type, with error %s" + % (repr(item), repr(e))) + + if self.check_fail_continue: + continue + else: + raise
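# (Added note: loop_callback validates one value against its InputType, while
# loop_check recurses once per seq_type level, so sequences and sub-sequences
# are unwrapped before the per-value checks below run.)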
+ @staticmethod + def loop_callback(input_type, each): + assert isinstance(input_type, InputType) + if input_type.type == DataType.Dense: + assert isinstance(each, collections.Sequence) + for d in each: + assert isinstance(d, float) + assert len(each) == input_type.dim + elif input_type.type == DataType.Index: + assert isinstance(each, int) + assert each < input_type.dim + elif input_type.type == DataType.SparseNonValue \ + or input_type.type == DataType.SparseValue: + assert isinstance(each, collections.Sequence) + sparse_id = set() + for k in each: + if input_type.type == DataType.SparseValue: + k, v = k + assert isinstance(v, float) + assert isinstance(k, int) + assert k < input_type.dim + sparse_id.add(k) + assert len(sparse_id) == len(each) + else: + raise RuntimeError("Unsupported input type") + + @staticmethod + def loop_check(callback, item): + for each in item: + callback(each) + + +def provider(input_types=None, should_shuffle=None, pool_size=-1, + min_pool_size=-1, can_over_batch_size=True, calc_batch_size=None, cache=CacheType.NO_CACHE, + check=False, check_fail_continue=False, + use_dynamic_order=True, init_hook=None, **kwargs): """ Provider decorator. Use it to make a function into a PyDataProvider2 object. @@ -130,30 +230,63 @@ def provider(input_types=None, should_shuffle=True, pool_size=-1, :param input_types: Specify the input types, can also be set in init_hook. It is a list of InputType objects. For example, input_types= \ [dense_vector(9), integer_value(2)]. - :param should_shuffle: True if data should shuffle. + :type input_types: list|tuple + + :param should_shuffle: True if data should shuffle. Pass None to shuffle + when training and not to shuffle when testing. :type should_shuffle: bool + :param pool_size: Max number of samples in the data pool. :type pool_size: int + + :param min_pool_size: Set the minimal number of samples in the data pool. PaddlePaddle + randomly picks samples from the pool, so min_pool_size + affects how well the data are randomized. + :type min_pool_size: int + :param can_over_batch_size: True if paddle can return a mini-batch larger than the batch size in settings. It is useful when custom calculating one sample's batch_size. It is very dangerous to set it to false and use calc_batch_size together. Default is false. + :type can_over_batch_size: bool + :param calc_batch_size: a method to calculate each sample's batch size. By default each sample's batch size is 1. But you can customize each sample's batch size. + :type calc_batch_size: callable + :param cache: Cache strategy of Data Provider. Default is CacheType.NO_CACHE + :type cache: int :param init_hook: Initialize hook. Useful when the data provider needs to load some external data like a dictionary. The parameter is (settings, file_list, \*\*kwargs). - - settings\: Is the global settings. User can set - settings.input_types here. - - file_list\: All file names for passed to data provider. - - kwargs: Other keyword arguments passed from + - settings. It is the global settings object. User can set + settings.input_types here. + - file_list. All file names passed to the data provider. + - is_train. Is this data provider used for training or not. + - kwargs. Other keyword arguments passed from trainer_config's args parameter. + :type init_hook: callable + + :param check: Check that the yielded data format is the same as input_types. Enabling + this will make the data providing process slow, but it is very useful + for debugging. Default is disabled. + :type check: bool + + :param check_fail_continue: Whether to continue training when a check fails. If + True, data in the wrong format is simply dropped. Has + no effect when check is set to False. + :type check_fail_continue: bool + + :param use_dynamic_order: Allow the provider to yield a dictionary object, whose + key is an input data layer name, and whose value is the + feature value. Tuples are still allowed when + use_dynamic_order is True. + :type use_dynamic_order: bool """
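# (Added usage sketch, hypothetical file format and sizes -- it combines the
# dict-yield style from mnist_provider.dict.py above with the new checking
# options:
#
#     @provider(input_types=[dense_vector(784), integer_value(10)],
#               min_pool_size=1000, check=True, check_fail_continue=True)
#     def process(settings, filename):
#         with open(filename) as f:
#             for line in f:
#                 label, pixels = line.split(';')
#                 yield {'pixel': [float(x) for x in pixels.split()],
#                        'label': int(label)}
# )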
def __wrapper__(generator): @@ -168,12 +301,38 @@ def provider(input_types=None, should_shuffle=True, pool_size=-1, self.slots = kwargs['slots'] self.slots = input_types self.should_shuffle = should_shuffle + + true_table = [1, 't', 'true', 'on'] + false_table = [0, 'f', 'false', 'off'] + if not isinstance(self.should_shuffle, bool) and \ + self.should_shuffle is not None: + + if isinstance(self.should_shuffle, basestring): + self.should_shuffle = self.should_shuffle.lower() + + if self.should_shuffle in true_table: + self.should_shuffle = True + elif self.should_shuffle in false_table: + self.should_shuffle = False + else: + self.logger.warning( + "Could not recognize should_shuffle (%s); " + "using the default value of should_shuffle instead." + " Please set should_shuffle to a bool value or " + "something in %s" % ( + repr(self.should_shuffle), + repr(true_table + false_table))) + self.should_shuffle = None + self.pool_size = pool_size self.can_over_batch_size = can_over_batch_size self.calc_batch_size = calc_batch_size self.file_list = file_list self.generator = generator self.cache = cache + self.min_pool_size = min_pool_size + self.input_order = kwargs['input_order'] + self.check = check if init_hook is not None: init_hook(self, file_list=file_list, **kwargs) if self.input_types is not None: @@ -184,6 +343,15 @@ if len(self.slots) == 1: self.generator = SingleSlotWrapper(self.generator) + if use_dynamic_order: + self.generator = InputOrderWrapper(self.generator, + self.input_order) + if self.check: + self.generator = CheckWrapper(self.generator, + self.slots, + check_fail_continue, + self.logger) + return DataProvider return __wrapper__ @@ -196,3 +364,4 @@ def deserialize_args(args): :return: """ return cPickle.loads(args) + diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py index f2f67f9bd66a4ebab9b5ace7fb13a194959d6c10..4ce01e005ae3ca549bb39c149e4ebf3cb04f8c1c 100644 --- a/python/paddle/trainer/config_parser.py +++ b/python/paddle/trainer/config_parser.py @@ -114,15 +114,15 @@ g_layer_type_map = {} # Initialize global variables.
We use this function so that we can # call parse_config() multiple times def init_config_environment( - g_default_momentum = 0., - g_default_decay_rate = 0., + g_default_momentum = None, + g_default_decay_rate = None, g_default_initial_mean = 0., g_default_initial_std = 0.01, - g_default_num_batches_regularization = 1, + g_default_num_batches_regularization = None, g_default_initial_strategy = 0, g_default_initial_smart = False, - g_default_gradient_clipping_threshold = 0., - g_default_device = -1, + g_default_gradient_clipping_threshold = None, + g_default_device = None, g_default_update_hooks = None, g_default_compact_func = None, @@ -1099,12 +1099,12 @@ def Evaluator( inputs, chunk_scheme = None, num_chunk_types = None, - classification_threshold = 0.5, - positive_label = -1, - dict_file = "", - result_file = "", - num_results = 1, - delimited = True, + classification_threshold = None, + positive_label = None, + dict_file = None, + result_file = None, + num_results = None, + delimited = None, ): evaluator = g_config.model_config.evaluators.add() evaluator.type = type @@ -1120,12 +1120,19 @@ def Evaluator( evaluator.num_chunk_types = num_chunk_types g_current_submodel.evaluator_names.append(evaluator.name) - evaluator.classification_threshold = classification_threshold - evaluator.positive_label = positive_label - evaluator.dict_file = dict_file - evaluator.result_file = result_file - evaluator.num_results = num_results - evaluator.delimited = delimited + if classification_threshold is not None: + evaluator.classification_threshold = classification_threshold + if positive_label is not None: + evaluator.positive_label = positive_label + if dict_file is not None: + evaluator.dict_file = dict_file + + if result_file is not None: + evaluator.result_file = result_file + if num_results is not None: + evaluator.num_results = num_results + if delimited is not None: + evaluator.delimited = delimited class LayerBase(object): def __init__( @@ -1137,7 +1144,7 @@ class LayerBase(object): device=None, active_type="", drop_rate=0., - coeff=1.): + coeff=None): config_assert('@' not in name, "layer name: %s contain special character @" % name) global g_current_submodel @@ -1155,10 +1162,12 @@ class LayerBase(object): self.inputs = [self.inputs] self.config = g_config.model_config.layers.add() + assert isinstance(self.config, LayerConfig) self.config.name = name self.config.type = type self.config.active_type = active_type - self.config.coeff = coeff + if coeff is not None: + self.config.coeff = float(coeff) if size != 0: self.config.size = size if drop_rate != 0: @@ -1166,7 +1175,7 @@ class LayerBase(object): if device is not None: self.config.device = device - else: + elif g_default_device is not None: self.config.device = g_default_device for input_index in xrange(len(self.inputs)): @@ -1236,10 +1245,12 @@ class LayerBase(object): if bias.parameter_name is None: bias.parameter_name = gen_bias_parameter_name(self.config.name) if bias.parameter_name not in g_parameter_map: + assert isinstance(self.config, LayerConfig) + Parameter( bias.parameter_name, size, - self.config.device, + self.config.device if self.config.HasField('device') else None, dims, bias.learning_rate, bias.momentum, @@ -1265,7 +1276,7 @@ class LayerBase(object): input_index, size, dims=None, - sparse = False, + sparse = None, format = "csr"): if dims is None: # TODO(yuyang18): print warning and callstack here! 
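# (Added note: the config_parser hunks around here all apply one pattern --
# leave a protobuf field unset unless a value was given explicitly, e.g.
# "if coeff is not None: self.config.coeff = float(coeff)" -- so the
# [default = ...] values declared in the .proto files take effect.)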
@@ -1293,7 +1304,7 @@ class LayerBase(object): Parameter( input_config.parameter_name, size, - self.config.device, + self.config.device if self.config.HasField("device") else None, dims, input_config.learning_rate, input_config.momentum, @@ -1353,6 +1364,8 @@ class FCLayer(LayerBase): if sparse: psize = self.inputs[input_index].nnz + else: + sparse = None self.create_input_parameter(input_index, psize, dims, sparse, format) self.create_bias_parameter(bias, self.config.size) @@ -2430,7 +2443,6 @@ class MixedLayer(LayerBase): config_assert(inputs, 'inputs cannot be empty') super(MixedLayer, self).__init__( name, 'mixed', size, inputs=inputs, **xargs) - operator_input_index = [] for operator in self.operators: operator_conf = operator.operator_conf @@ -2445,21 +2457,31 @@ class MixedLayer(LayerBase): input_layer = self.get_input_layer(input_index) operator_conf.input_sizes.append(input_layer.size) operator_input_index.append(input_index) - if self.config.size == 0: + if self.config.size == 0: size = operator.calc_output_size(operator_conf.input_sizes) if size != 0: self.set_layer_size(size) - + else: + size = operator.calc_output_size(operator_conf.input_sizes) + if size != 0: + config_assert(size == self.config.size, + "different inputs have different size: %s vs. %s" % + (size, self.config.size)) for input_index in xrange(len(self.inputs)): input_layer = self.get_input_layer(input_index) input = self.inputs[input_index] if input_index not in operator_input_index: config_assert(isinstance(input, Projection), "input should be projection or operation") - if self.config.size == 0 and isinstance(input, Projection): + if self.config.size == 0 and isinstance(input, Projection): size = input.calc_output_size(input_layer) if size != 0: self.set_layer_size(size) - + elif isinstance(input, Projection): + sz = input.calc_output_size(input_layer) + if sz != 0: + config_assert(sz == self.config.size, + "different inputs have different size: %s vs. %s" % + (sz, self.config.size)) config_assert(size != 0, "size is not set") for input_index in xrange(len(self.inputs)): @@ -2827,27 +2849,44 @@ def Parameter( para = g_config.model_config.parameters.add() para.name = name para.size = size - para.device = device - para.dims.extend(dims); - para.learning_rate = default(learning_rate, 1.) 
- para.momentum = default(momentum, g_default_momentum) + if device is not None: + para.device = int(device) + para.dims.extend(dims) + + if learning_rate is not None: + para.learning_rate = float(learning_rate) + + momentum = default(momentum, g_default_momentum) + if momentum is not None: + para.momentum = float(momentum) + config_assert(not momentum or not decay_rate_l1, "momentum and decay_rate_l1 cannot both be non-zero") - para.decay_rate = default(decay_rate, g_default_decay_rate) + + decay_rate = default(decay_rate, g_default_decay_rate) + if decay_rate is not None: + para.decay_rate = decay_rate + if decay_rate_l1 is not None: para.decay_rate_l1 = decay_rate_l1 para.initial_std = default(initial_std, g_default_initial_std) para.initial_mean = default(initial_mean, g_default_initial_mean) - para.num_batches_regularization = default( + + num_batches_regularization = default( num_batches_regularization, g_default_num_batches_regularization) + if num_batches_regularization is not None: + para.num_batches_regularization = int(num_batches_regularization) + if sparse_remote_update is not None: para.sparse_remote_update = sparse_remote_update if sparse_remote_update: g_config.opt_config.use_sparse_remote_updater = True if sparse_update is not None: para.sparse_update = sparse_update - para.gradient_clipping_threshold = default( - gradient_clipping_threshold, g_default_gradient_clipping_threshold); + gradient_clipping_threshold = default( + gradient_clipping_threshold, g_default_gradient_clipping_threshold) + if gradient_clipping_threshold is not None: + para.gradient_clipping_threshold = gradient_clipping_threshold para.initial_strategy = default(initial_strategy, g_default_initial_strategy) para.initial_smart = default(initial_smart, g_default_initial_smart) if para.initial_smart: @@ -2860,15 +2899,19 @@ def Parameter( para.initial_std = 1. / math.sqrt(para.size) if g_default_compact_func is not None: sparse, format, need_compact = g_default_compact_func(para.name) - para.is_sparse = default(sparse, False) - para.format = default(format, "") - para.need_compact = default(need_compact, False) + + if sparse is not None: + para.is_sparse = sparse + if format is not None: + para.format = format + if need_compact is not None: + para.need_compact = need_compact if is_static is not None: para.is_static = is_static config_assert(not para.sparse_remote_update or not para.is_static, "sparse_remote_update and is_static cannot both be true") - - para.is_shared = default(is_shared, False) + if is_shared is not None: + para.is_shared = is_shared update_hooks = default(update_hooks, g_default_update_hooks) diff --git a/python/paddle/trainer_config_helpers/activations.py b/python/paddle/trainer_config_helpers/activations.py index 24defb06a6d66692e266c3102a5b3334d7493d38..85534675199e7627f9753e5d233f5208b14decfd 100644 --- a/python/paddle/trainer_config_helpers/activations.py +++ b/python/paddle/trainer_config_helpers/activations.py @@ -14,7 +14,7 @@ __all__ = ["TanhActivation", "SigmoidActivation", "SoftmaxActivation", "IdentityActivation", "LinearActivation", - 'SequenceSoftmaxActivation', + 'SequenceSoftmaxActivation', 'ExpActivation', "ReluActivation", "BReluActivation", "SoftReluActivation", "STanhActivation", "AbsActivation", "SquareActivation", "BaseActivation"] @@ -185,3 +185,12 @@ class SquareActivation(BaseActivation): """ def __init__(self): BaseActivation.__init__(self, 'square', False) + +class ExpActivation(BaseActivation): + """ + Exponential Activation. + + .. math:: + f(z) = e^z. 
+ """ + def __init__(self): BaseActivation.__init__(self, 'exponential', False) diff --git a/python/paddle/trainer_config_helpers/evaluators.py b/python/paddle/trainer_config_helpers/evaluators.py index 985fae9f955c950d861d4f1f2f98845562fb6fc9..7a00d0b7ec57af4eca90e76c3e5f955499205698 100644 --- a/python/paddle/trainer_config_helpers/evaluators.py +++ b/python/paddle/trainer_config_helpers/evaluators.py @@ -65,12 +65,12 @@ def evaluator_base( name=None, chunk_scheme=None, num_chunk_types=None, - classification_threshold=0.5, - positive_label=-1, - dict_file="", - result_file="", - num_results=1, - delimited=True): + classification_threshold=None, + positive_label=None, + dict_file=None, + result_file=None, + num_results=None, + delimited=None): """ Evaluator will evaluate the network status while training/testing. @@ -105,9 +105,10 @@ def evaluator_base( :type weight: LayerOutput. """ # inputs type assertions. - assert isinstance(classification_threshold, float) - assert isinstance(positive_label, int) - assert isinstance(num_results, int) + assert classification_threshold is None or isinstance( + classification_threshold, float) + assert positive_label is None or isinstance(positive_label, int) + assert num_results is None or isinstance(num_results, int) if not isinstance(input, list): input = [input] @@ -136,7 +137,7 @@ def classification_error_evaluator( label, name=None, weight=None, - threshold=0.5): + threshold=None): """ Classification Error Evaluator. It will print error rate for classification. @@ -253,7 +254,7 @@ def pnpair_evaluator( def precision_recall_evaluator( input, label, - positive_label=-1, + positive_label=None, weight=None, name=None, ): @@ -494,7 +495,7 @@ def gradient_printer_evaluator( @wrap_name_default() def maxid_printer_evaluator( input, - num_results=1, + num_results=None, name=None, ): """ @@ -518,13 +519,14 @@ def maxid_printer_evaluator( """ evaluator_base(name=name, type="max_id_printer", - input=input) + input=input, + num_results=num_results) @evaluator(EvaluatorAttribute.FOR_PRINT) @wrap_name_default() def maxframe_printer_evaluator( input, - num_results=1, + num_results=None, name=None, ): """ @@ -556,9 +558,9 @@ def maxframe_printer_evaluator( @wrap_name_default() def seqtext_printer_evaluator( input, - dict_file="", - result_file="", - delimited=True, + result_file, + dict_file=None, + delimited=None, name=None, ): """ @@ -616,6 +618,7 @@ def seqtext_printer_evaluator( :param name: Evaluator name. :type name: None|basestring """ + assert isinstance(result_file, basestring) evaluator_base(name=name, type="seq_text_printer", input=input, diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index bda0b4f5d60e82c1d577b0063fd5e164bf6117c3..fab7e6e091863fdad6d81f4c63f12132c2be5161 100644 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -28,7 +28,7 @@ except ImportError: import copy __all__ = ["full_matrix_projection", "AggregateLevel", "ExpandLevel", - "identity_projection", "dotmul_projection", + "identity_projection", "dotmul_projection", "dotmul_operator", "table_projection", "mixed_layer", "data_layer", "embedding_layer", "fc_layer", "grumemory", "pooling_layer", "lstmemory", "last_seq", "first_seq", @@ -389,7 +389,7 @@ def identity_projection(input, offset=None): @wrap_param_attr_default() def dotmul_projection(input, param_attr=None, scale=1): """ - 1. DotMulProjection if input is a layer. + DotMulProjection with a layer as input. 
     It performs element-wise multiplication with weight.
 
     .. math::
@@ -403,48 +403,55 @@ def dotmul_projection(input, param_attr=None, scale=1):
 
        proj = dotmul_projection(input=layer)
 
-    2. DotMulOperator if input is a list or tuple.
-       It takes two inputs, performs element-wise multiplication:
-
-    .. math::
-       out.row[i] += scale * (in1.row[i] .* in2.row[i])
-
-    where :math:`.*` means element-wise multiplication, and
-    scale is a config scalar, its default value is one.
-
-    The example usage is:
-
-    .. code-block:: python
-
-       op = dotmul_projection(input=[layer1, layer2],
-                              scale=2.0)
-
     :param input: Input layer.
-    :type input: LayerOutput|list|tuple
+    :type input: LayerOutput
     :param param_attr: Parameter config, None if use default.
     :type param_attr: ParameterAttribute
     :param scale: config scalar, default value is one.
     :type scale: float
-    :return: A DotMulProjection or DotMulOperator Object.
-    :rtype: DotMulProjection or DotMulOperator
+    :return: A DotMulProjection Object.
+    :rtype: DotMulProjection
     """
-    if isinstance(input, LayerOutput):
-        proj = DotMulProjection(input_layer_name=input.name,
+    proj = DotMulProjection(input_layer_name=input.name,
                             size=input.size,
                             **param_attr.attr)
-        proj.origin = input
-        proj.origin.projection = "dot_mul"
-        return proj
-    else:
-        assert isinstance(input, list) or isinstance(input, tuple)
-        assert len(input) == 2
-        assert param_attr is None
-        op = DotMulOperator(input_layer_name=[x.name for x in input],
-                            scale=scale)
-        op.origin = input
-        op.origin.operator = "dot_mul"
-        return op
+    proj.origin = input
+    return proj
+
+
+def dotmul_operator(x, y, scale=1):
+    """
+    DotMulOperator takes two inputs and performs element-wise multiplication:
+
+    .. math::
+       out.row[i] += scale * (x.row[i] .* y.row[i])
+
+    where :math:`.*` means element-wise multiplication, and
+    scale is a config scalar whose default value is one.
+
+    The example usage is:
+
+    .. code-block:: python
+
+       op = dotmul_operator(x, y,
+                            scale=1)
+
+    :param x: The first input layer.
+    :type x: LayerOutput
+    :param y: The second input layer.
+    :type y: LayerOutput
+    :param scale: config scalar, default value is one.
+    :type scale: float
+    :return: A DotMulOperator Object.
+    :rtype: DotMulOperator
+    """
+    assert isinstance(x, LayerOutput)
+    assert isinstance(y, LayerOutput)
+    op = DotMulOperator(input_layer_names=[x.name, y.name],
+                        scale=scale)
+    op.origin = [x, y]
+    return op
 
 @wrap_bias_attr_default(['padding_attr'])
 def context_projection(input, context_len, context_start=None,
@@ -539,7 +546,10 @@ class MixedLayerType(LayerOutput):
         if not self.finalized:
             assert isinstance(other, Projection) or isinstance(other, Operator)
             self.inputs.append(other)
-            self.parents.append(other.origin)
+            if isinstance(other, Projection):
+                self.parents.append(other.origin)
+            else:
+                self.parents.extend(other.origin)
             return self
         else:
             raise MixedLayerType.AddToSealedMixedLayerException()
@@ -565,7 +575,7 @@ class MixedLayerType(LayerOutput):
 @wrap_act_default(act=LinearActivation())
 @wrap_bias_attr_default(has_bias=False)
 @layer_support(ERROR_CLIPPING, DROPOUT)
-def mixed_layer(size, input=None, name=None, act=None, bias_attr=False,
+def mixed_layer(size=0, input=None, name=None, act=None, bias_attr=False,
                 layer_attr=None):
     """
     Mixed Layer. A mixed layer will add all inputs together, then activate.
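Since `dotmul_operator` is now a standalone two-input function and `mixed_layer` accepts `size=0`, a configuration can mix operators and projections and let the layer infer its output size. A hedged sketch of how the new API composes (layer names and sizes are illustrative), mirroring the updated test config further below:

```python
# Sketch only: assumes the usual trainer_config_helpers environment.
from paddle.trainer_config_helpers import *

a = data_layer(name='a', size=128)
b = data_layer(name='b', size=128)

# Element-wise a .* b via the new two-input operator, plus a learned
# element-wise scaling of b via dotmul_projection; mixed_layer sums both,
# and with the new size=0 default it infers its size from these inputs.
m = mixed_layer(input=[dotmul_operator(x=a, y=b),
                       dotmul_projection(input=b)])
```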
diff --git a/python/paddle/trainer_config_helpers/optimizers.py b/python/paddle/trainer_config_helpers/optimizers.py index ed676ac2152a6c08b90c18480ef0c69d5c0779f8..af85f745f63e59769e900e0fd64a10928affd654 100644 --- a/python/paddle/trainer_config_helpers/optimizers.py +++ b/python/paddle/trainer_config_helpers/optimizers.py @@ -79,7 +79,7 @@ class MomentumOptimizer(BaseSGDOptimizer): 'learning_method': 'momentum' } - def __init__(self, momentum=1e-3): + def __init__(self, momentum=None): self.momentum = momentum diff --git a/python/paddle/trainer_config_helpers/tests/layers_test_config.py b/python/paddle/trainer_config_helpers/tests/layers_test_config.py index 39c85c788eecad5c6bba6dbd2f2734725fa4fff6..27b22ecb701c52ab2a0a1f5f95d7b07186fbbb58 100644 --- a/python/paddle/trainer_config_helpers/tests/layers_test_config.py +++ b/python/paddle/trainer_config_helpers/tests/layers_test_config.py @@ -38,8 +38,11 @@ print_layer(input=[out]) outputs(classification_cost(out, data_layer(name="label", size=num_classes))) +dotmul = mixed_layer(input=[dotmul_operator(x=x1, y=y1), + dotmul_projection(input=y1)]) + # for ctc -tmp = fc_layer(input=x1, +tmp = fc_layer(input=[x1, dotmul], size=num_classes + 1, act=SoftmaxActivation()) ctc = ctc_layer(input=tmp,
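With `momentum=None` as the new default, `MomentumOptimizer` no longer forces `1e-3` into the generated configuration; the field is emitted only when the user passes a value, otherwise the (now also `None`) global default applies. A hedged usage sketch, with illustrative values:

```python
# Sketch only: leaving momentum unset keeps the momentum field out of
# the generated optimizer settings entirely.
from paddle.trainer_config_helpers import *

settings(batch_size=128,
         learning_rate=1e-3,
         learning_method=MomentumOptimizer())

# An explicit value is still honored:
# settings(..., learning_method=MomentumOptimizer(momentum=0.9))
```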