diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 89c620bb2f7ef634fa80b64eec7037e8cb9a190c..6140340890c0e5025eb08209e8ea78df918b4dc0 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,3 +1,4 @@
+repos:
 -   repo: https://github.com/Lucas-C/pre-commit-hooks.git
     sha: v1.0.1
     hooks:
@@ -25,6 +26,14 @@
         entry: bash ./.clang_format.hook -i
         language: system
         files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto)$
+-   repo: local
+    hooks:
+    -   id: cpplint-cpp-source
+        name: cpplint
+        description: Check C++ code style using cpplint.py.
+        entry: bash ./tools/codestyle/cpplint_pre_commit.hook
+        language: system
+        files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx)$
 -   repo: https://github.com/PaddlePaddle/pre-commit-golang
     sha: 8337620115c25ff8333f1b1a493bd031049bd7c0
     hooks:
diff --git a/.travis.yml b/.travis.yml
index bf6a41d13c4eabc2d8543ab821ce0ff747a061df..929c847bd36d64e79a199b2634ebf68c3225429b 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -34,7 +34,7 @@ addons:
       - automake
       - libtool
       - ccache
-  ssh_known_hosts: 52.76.173.135
+  ssh_known_hosts: 13.229.163.131
 before_install:
   - if [[ "$JOB" == "check_style" ]]; then sudo ln -s /usr/bin/clang-format-3.8 /usr/bin/clang-format; fi
   # Paddle is using protobuf 3.1 currently. Protobuf 3.2 breaks the compatibility. So we specify the python
diff --git a/doc/CMakeLists.txt b/doc/CMakeLists.txt
index a9b27933a5307aabeaf150aeb859e869197229f5..7066637a7cb27b83724cb4030c29a1019981f52b 100644
--- a/doc/CMakeLists.txt
+++ b/doc/CMakeLists.txt
@@ -1,2 +1,9 @@
+add_custom_target(paddle_apis ALL
+                  DEPENDS paddle_v2_apis paddle_fluid_apis)
+
+add_custom_target(paddle_docs ALL
+                  DEPENDS paddle_v2_docs paddle_v2_docs_cn
+                  paddle_fluid_docs paddle_fluid_docs_cn)
+
 add_subdirectory(v2)
 add_subdirectory(fluid)
diff --git a/doc/fluid/CMakeLists.txt b/doc/fluid/CMakeLists.txt
index cc999f5a8d70a2239ea3b130e9da172d5f681c65..9fe79323ef9377a459d8405cfa74c88c52ce9346 100644
--- a/doc/fluid/CMakeLists.txt
+++ b/doc/fluid/CMakeLists.txt
@@ -27,6 +27,8 @@ sphinx_add_target(paddle_fluid_docs
                   ${CMAKE_CURRENT_SOURCE_DIR}
                   ${SPHINX_HTML_DIR_EN})
 
+add_dependencies(paddle_fluid_docs gen_proto_py)
+
 # configured documentation tools and intermediate build results
 set(BINARY_BUILD_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_build")
 
@@ -47,3 +49,7 @@ sphinx_add_target(paddle_fluid_docs_cn
                   ${SPHINX_CACHE_DIR_CN}
                   ${CMAKE_CURRENT_SOURCE_DIR}
                   ${SPHINX_HTML_DIR_CN})
+
+add_dependencies(paddle_fluid_docs_cn gen_proto_py)
+
+add_subdirectory(api)
diff --git a/doc/fluid/api/CMakeLists.txt b/doc/fluid/api/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..ca40dfb9644cea69329be0ec231378506c138bc0
--- /dev/null
+++ b/doc/fluid/api/CMakeLists.txt
@@ -0,0 +1,22 @@
+# configured documentation tools and intermediate build results
+set(BINARY_BUILD_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_build")
+
+# Sphinx cache with pickled ReST documents
+set(SPHINX_CACHE_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_doctrees")
+
+# HTML output director
+set(SPHINX_HTML_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/html")
+
+configure_file(
+    "${CMAKE_CURRENT_SOURCE_DIR}/../../templates/conf.py.en.in"
+    "${BINARY_BUILD_DIR_EN}/conf.py"
+    @ONLY)
+
+sphinx_add_target(paddle_fluid_apis
+                  html
+                  ${BINARY_BUILD_DIR_EN}
+                  ${SPHINX_CACHE_DIR_EN}
+                  ${CMAKE_CURRENT_SOURCE_DIR}
+                  ${SPHINX_HTML_DIR_EN})
+
+add_dependencies(paddle_fluid_apis  gen_proto_py framework_py_proto copy_paddle_pybind)
diff --git a/doc/v2/api/fluid/data_feeder.rst b/doc/fluid/api/data_feeder.rst
similarity index 100%
rename from doc/v2/api/fluid/data_feeder.rst
rename to doc/fluid/api/data_feeder.rst
diff --git a/doc/v2/api/fluid/evaluator.rst b/doc/fluid/api/evaluator.rst
similarity index 100%
rename from doc/v2/api/fluid/evaluator.rst
rename to doc/fluid/api/evaluator.rst
diff --git a/doc/v2/api/fluid/executor.rst b/doc/fluid/api/executor.rst
similarity index 100%
rename from doc/v2/api/fluid/executor.rst
rename to doc/fluid/api/executor.rst
diff --git a/doc/v2/api/fluid/gen_doc.py b/doc/fluid/api/gen_doc.py
similarity index 100%
rename from doc/v2/api/fluid/gen_doc.py
rename to doc/fluid/api/gen_doc.py
diff --git a/doc/v2/api/fluid/gen_doc.sh b/doc/fluid/api/gen_doc.sh
similarity index 100%
rename from doc/v2/api/fluid/gen_doc.sh
rename to doc/fluid/api/gen_doc.sh
diff --git a/doc/v2/api/fluid/index.rst b/doc/fluid/api/index_en.rst
similarity index 100%
rename from doc/v2/api/fluid/index.rst
rename to doc/fluid/api/index_en.rst
diff --git a/doc/v2/api/fluid/initializer.rst b/doc/fluid/api/initializer.rst
similarity index 100%
rename from doc/v2/api/fluid/initializer.rst
rename to doc/fluid/api/initializer.rst
diff --git a/doc/v2/api/fluid/io.rst b/doc/fluid/api/io.rst
similarity index 100%
rename from doc/v2/api/fluid/io.rst
rename to doc/fluid/api/io.rst
diff --git a/doc/v2/api/fluid/layers.rst b/doc/fluid/api/layers.rst
similarity index 100%
rename from doc/v2/api/fluid/layers.rst
rename to doc/fluid/api/layers.rst
diff --git a/doc/v2/api/fluid/nets.rst b/doc/fluid/api/nets.rst
similarity index 100%
rename from doc/v2/api/fluid/nets.rst
rename to doc/fluid/api/nets.rst
diff --git a/doc/v2/api/fluid/optimizer.rst b/doc/fluid/api/optimizer.rst
similarity index 100%
rename from doc/v2/api/fluid/optimizer.rst
rename to doc/fluid/api/optimizer.rst
diff --git a/doc/v2/api/fluid/param_attr.rst b/doc/fluid/api/param_attr.rst
similarity index 100%
rename from doc/v2/api/fluid/param_attr.rst
rename to doc/fluid/api/param_attr.rst
diff --git a/doc/v2/api/fluid/profiler.rst b/doc/fluid/api/profiler.rst
similarity index 100%
rename from doc/v2/api/fluid/profiler.rst
rename to doc/fluid/api/profiler.rst
diff --git a/doc/v2/api/fluid/regularizer.rst b/doc/fluid/api/regularizer.rst
similarity index 100%
rename from doc/v2/api/fluid/regularizer.rst
rename to doc/fluid/api/regularizer.rst
diff --git a/doc/fluid/dev/api_doc_std_en.md b/doc/fluid/dev/api_doc_std_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..e57072d52fd162e92a3482aef33f99ab9394c532
--- /dev/null
+++ b/doc/fluid/dev/api_doc_std_en.md
@@ -0,0 +1,226 @@
+# API Doc Standard
+
+- [API Doc Structure](#API Doc Structure)
+- [Format and Examples](#Format and Examples)
+- [Complete Example](#Complete Example)
+
+
+## API Doc Structure
+
+API Doc should contain the following parts(please write them in order):
+
+- Python API Definition
+
+  The definition of API
+
+- Function Description
+
+  Description of API's function. 
+  The description includes: meaning, purpose and operation on input of API, reference and corresponding link(if any), formula(if necessary) and explanations of key variables in the formula.
+
+- Args Description
+
+  Description of API parameters.
+  Introduce parameters one by one according to the order in API definition.
+  The introduction includes: data type, default value(if any), meaning, etc.
+
+- Returns
+
+  Introduction of API returned value.
+  Introduce meaning of returned value, provide correspoding format if necessary.
+  If returned value is a tuple containing multiple parameters, then introduce parameters one by one in order.
+
+- Raises（if any）
+
+   Abnormality, error that may occur, and possible reasons. If there are more than one possible abnormity or error, they should be listed in order. 
+
+- Note（if any）
+
+  Matters needing attention. If there are more than one matters, they should be listed in order. 
+
+- Examples
+
+  Examples of how to use API.
+
+
+## Format and Examples
+
+API documentation must obey reStructuredText format, please refer to [here](http://sphinx-doc-zh.readthedocs.io/en/latest/rest.html).
+Format and examples of each part of API documantation are as follows: (take fc for example)
+
+- Python API Definition
+
+  - Format
+
+      [Python API Definition]
+
+  - Example
+
+      ```
+      fc(input,
+         size,
+         num_flatten_dims=1,
+         param_attr=None,
+         bias_attr=None,
+         act=None,
+         name=None,
+         main_program=None,
+         startup_program=None)
+      ```
+
+- Function Description
+
+  - Format
+
+      This part contains (please write them in order):
+
+      [Function Description]
+
+      [Formula]
+
+      [Symbols' Descriptions if necessary]
+
+      [References if necessary]
+
+  - Example
+
+      [Function Description]
+
+       ```
+       **Fully Connected Layer**
+
+       The fully connected layer can take multiple tensors as its inputs. It
+       creates a variable called weights for each input tensor, which represents
+       a fully connected weight matrix from each input unit to each output unit.
+       The fully connected layer multiplies each input tensor with its coresponding
+       weight to produce an output Tensor. If multiple input tensors are given,
+       the results of multiple multiplications will be sumed up. If bias_attr is
+       not None, a bias variable will be created and added to the output. Finally,
+       if activation is not None, it will be applied to the output as well.
+       ```
+
+      [Formula]
+
+      ```
+      This process can be formulated as follows:
+
+      .. math::
+
+           Out = Act({\sum_{i=0}^{N-1}X_iW_i + b})
+      ```
+
+      [Symbols' Descriptions if necessary]
+
+      ```
+      In the above equation:
+
+      * :math:`N`: Number of the input.
+      * :math:`X_i`: The input tensor.
+      * :math:`W`: The weights created by this layer.
+      * :math:`b`: The bias parameter created by this layer (if needed).
+      * :math:`Act`: The activation function.
+      * :math:`Out`: The output tensor.
+      ```
+
+      [References if necessary]
+
+      Since there is no need for reference of fc, we omit them here. Under other circumstances, please provide explicit reference and link, take layer_norm for example: 
+
+      ```
+      Refer to `Layer Normalization <https://arxiv.org/pdf/1607.06450v1.pdf>`_ for more details.
+      ```
+
+
+- Args Description
+
+  - Format
+
+      \[Arg's Name\][(Data Type, Default Value)][Description]
+
+  - Example
+
+      part of fc parameters are as follows:
+
+      ```
+      Args:
+          input (Variable|list of Variable): The input tensor(s) of this layer, and the dimension of
+              the input tensor(s) is at least 2.
+          param_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for learnable
+              parameters/weights of this layer.
+          name (str, default None): The name of this layer.
+      ```
+
+- Returns
+
+  - Format
+
+      [Name][Shape]
+
+  - Example
+
+      ```
+      Returns:
+          A tensor variable storing the transformation result.
+      ```
+
+      when returned value contain more than one tuple, please introduce every parameter in order, take dynamic_lstm for example:
+
+      ```
+      Returns:
+          A tuple containing:
+            The hidden state of LSTM whose shape is (T X D).
+            The cell state of LSTM whose shape is (T X D).
+      ```
+
+- Raises
+
+  - Format
+
+      [Exception Type][Condition]
+
+  - Example
+
+      ```
+      Raises:
+          ValueError: If the rank of the input is less than 2.
+      ```
+
+- Note
+
+  - Format
+
+     [Note]
+
+  - Example
+
+      there is no Note in fc, so we omit this part. If there is any note, please write clearly. If there are more than one notes, please list them in order. Take scaled\_dot\_product\_attention for example:
+
+      ```
+      Note:
+          1. When num_heads > 1, three linear projections are learned respectively
+             to map input queries, keys and values into queries', keys' and values'.
+             queries', keys' and values' have the same shapes with queries, keys
+             and values.
+          2. When num_heads == 1, scaled_dot_product_attention has no learnable
+             parameters.
+      ```
+
+- Examples
+
+  - Format
+
+      \[Python Code Snipper]
+
+  - Example
+
+      ```
+      Examples:
+          .. code-block:: python
+
+            data = fluid.layers.data(name="data", shape=[32, 32], dtype="float32")
+            fc = fluid.layers.fc(input=data, size=1000, act="tanh")
+      ```
+
+## Complete Example
+
+Complete Example of fc please see [here](src/fc.py)。
diff --git a/doc/v2/CMakeLists.txt b/doc/v2/CMakeLists.txt
index 286fe8845cd7a909d4030540e72362864b536063..82de7a3a3e1ca7724e1eda877d53454a4fa4129a 100644
--- a/doc/v2/CMakeLists.txt
+++ b/doc/v2/CMakeLists.txt
@@ -20,13 +20,15 @@ configure_file(
     "${BINARY_BUILD_DIR_EN}/conf.py"
     @ONLY)
 
-sphinx_add_target(paddle_docs
+sphinx_add_target(paddle_v2_docs
                   html
                   ${BINARY_BUILD_DIR_EN}
                   ${SPHINX_CACHE_DIR_EN}
                   ${CMAKE_CURRENT_SOURCE_DIR}
                   ${SPHINX_HTML_DIR_EN})
 
+add_dependencies(paddle_v2_docs gen_proto_py)
+
 # configured documentation tools and intermediate build results
 set(BINARY_BUILD_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_build")
 
@@ -41,11 +43,13 @@ configure_file(
     "${BINARY_BUILD_DIR_CN}/conf.py"
     @ONLY)
 
-sphinx_add_target(paddle_docs_cn
+sphinx_add_target(paddle_v2_docs_cn
                   html
                   ${BINARY_BUILD_DIR_CN}
                   ${SPHINX_CACHE_DIR_CN}
                   ${CMAKE_CURRENT_SOURCE_DIR}
                   ${SPHINX_HTML_DIR_CN})
 
+add_dependencies(paddle_v2_docs_cn gen_proto_py)
+
 add_subdirectory(api)
diff --git a/doc/v2/api/CMakeLists.txt b/doc/v2/api/CMakeLists.txt
index 2ad589e8a260e48d46cba2300d6e2bcd4bdd8019..da1eafc02ed8cd155d4f0f1fbadcb7b237b6fcc1 100644
--- a/doc/v2/api/CMakeLists.txt
+++ b/doc/v2/api/CMakeLists.txt
@@ -12,9 +12,11 @@ configure_file(
     "${BINARY_BUILD_DIR_EN}/conf.py"
     @ONLY)
 
-sphinx_add_target(paddle_api_docs
+sphinx_add_target(paddle_v2_apis
                   html
                   ${BINARY_BUILD_DIR_EN}
                   ${SPHINX_CACHE_DIR_EN}
                   ${CMAKE_CURRENT_SOURCE_DIR}
                   ${SPHINX_HTML_DIR_EN})
+
+add_dependencies(paddle_v2_apis  gen_proto_py framework_py_proto copy_paddle_pybind)
diff --git a/doc/v2/faq/build_and_install/index_cn.rst b/doc/v2/faq/build_and_install/index_cn.rst
index 7c7e896d187e4fe1544d7ec933fa4fa9f24df3cd..f292684fb5fe2df06db5239e7f43fdfa1dd2f2bd 100644
--- a/doc/v2/faq/build_and_install/index_cn.rst
+++ b/doc/v2/faq/build_and_install/index_cn.rst
@@ -139,3 +139,77 @@ PaddlePaddle使用avx SIMD指令提高cpu执行效率，因此错误的使用二
     touch ../extern_mklml-stamp/extern_mklml-download
 
     // 4. 接着编译即可
+
+9. 在Mac上无法安装numpy等Python包，权限错误
+------------------
+
+Mac上对自带的Python和包有严格的权限保护，最好不要在自带的Python上安装。建议用virtualenv建立一个新的Python环境来操作。
+
+virtualenv的基本原理是将机器上的Python运行所需的运行环境完整地拷贝一份。我们可以在一台机器上制造多份拷贝，并在这多个拷贝之间自由切换，这样就相当于在一台机器上拥有了多个相互隔离、互不干扰的Python环境。
+
+下面简单介绍下如何用virtualenv为Paddle生成一个专用的Python环境：
+
+安装virtualenv：
+::::::::::::::::
+
+virtualenv本身也是Python的一个包，可以用pip进行安装：
+
+..  code-block:: bash
+
+    sudo -H pip install virtualenv
+
+由于virtualenv需要安装给系统自带的Python，因此需要使用sudo权限。
+
+创建一个新的Python运行环境：
+:::::::::::::::::::
+
+..  code-block:: bash
+
+    virtualenv --no-site-packages paddle
+
+--no-site-packages 参数表示不拷贝已有的任何第三方包，创造一个完全干净的新Python环境。后面的paddle是我们为这个新创建的环境取的名字。
+
+执行完这一步后，当前目录下应该会出现一个名为paddle（或者你取的其他名字）的目录。这个目录里保存了运行一个Python环境所需要的各种文件。
+
+启动运行环境：
+::::::::::::::::
+
+..  code-block:: bash
+
+    source paddle/bin/activate
+
+执行后会发现命令提示符前面增加了(paddle)字样，说明已经成功启动了名为‘paddle’的Python环境。执行which python，可以发现使用的已经是刚刚创建的paddle目录下的Python。
+
+在这个环境中，我们可以自由地进行Paddle的安装、使用和开发工作，无需担心对系统自带Python的影响。
+
+退出运行环境：
+:::::::::::::::
+
+直接执行：
+
+..  code-block:: bash
+
+    deactivate
+
+可以看到命令提示符前面的(paddle)字样消失。
+
+自动启动某一Python环境：
+::::::::::::::::
+
+如果我们经常使用Paddle，我们每次打开终端后都需要执行一下source paddle/bin/activate来启动环境，比较繁琐。为了简便，可以修改终端的配置文件，来让终端每次启动后自动启动特定的Python环境。
+
+执行:
+
+..  code-block:: bash
+
+    vi ~/.bash_profile
+
+打开终端配置文件，并在文件的最后添加一行：
+
+..  code-block:: bash
+
+    source paddle/bin/activate
+
+保存并关闭文件。
+
+这样，每次打开终端时就会自动启动名为‘paddle’的Python环境了。
diff --git a/doc/v2/faq/model/index_en.rst b/doc/v2/faq/model/index_en.rst
index cb26f59655f97dc28a2047994643ae16b8857964..67a33e08e192e5627ac3b0abd76e979f21ed2079 100644
--- a/doc/v2/faq/model/index_en.rst
+++ b/doc/v2/faq/model/index_en.rst
@@ -2,4 +2,80 @@
 Model Configuration
 ###################
 
-TBD
+..  contents::
+
+1. How to deal with error :code:`Duplicated layer name`
+----------------------------------------------------------
+
+The general reason for this error is that users may have set the same value for the attribute :code:`name` in different layers. Try to find out the :code:`name` attribute with the same value in diffrent layers and set them differently.
+
+2. How to use :code:`paddle.layer.memory`'s attribute :code:`name`
+----------------------------------------------------------------------
+
+* :code:`paddle.layer.memory` is used to get the output of a layer's last timestep and the layer is specified by the attribute :code:`name` . Thus,  :code:`paddle.layer.memory` will associate with the layer that has the same value of attribute :code:`name` , and uses the output of the layer's last timestep as the input of its current timestep.
+
+* All the PaddlePaddle's layers have a unique name, which is set by the attribute :code:`name` . PaddlePaddle will automatically set it for the user when it is not explicitly set. :code:`paddle.layer.memory` is not a real layer, its name is set by the attribute :code:`memory_name`  and PaddlePaddle will also automatically set it when the user does not explicitly set. The :code:`paddle.layer.memory` attribute :code:`name` is used to specify the layer it is associated with, and needs to be explicitly set by the user.
+
+
+3. What is the difference between the two ways of using dropout
+-----------------------------------------------------------------
+
+* There are two ways to use dropout in PaddlePaddle
+
+  * Set the :code:`drop_rate` parameter in the layer's :code:`layer_atter` attribute. Take :code:`paddle.layer.fc` as an example:
+
+  ..  code-block:: python
+
+      fc = paddle.layer.fc(input=input, layer_attr=paddle.attr.ExtraLayerAttribute(drop_rate=0.5))
+
+  * Use :code:`paddle.layer.dropout` layer. Take :code:`paddle.layer.fc` as an example:
+
+  ..  code-block:: python
+
+      fc = paddle.layer.fc(input=input)
+      drop_fc = paddle.layer.dropout(input=fc, dropout_rate=0.5)
+
+* :code:`paddle.layer.dropout` actually uses the :code:`paddle.layer.add_to` layer and sets :code:`drop_rate` as the previous method. This method is very memory intensive.
+
+* PaddlePaddle implements dropout in the activation function rather than in the layer.
+
+* :code:`paddle.layer.lstmemory`, :code:`paddle.layer.grumemory`, :code:`paddle.layer.recurrent` implement activation of output in an unusual way, so we cannot use dropout by setting :code:`drop_rate` . To use dropout for these layers, we could use the second method, which is to use :code:`paddle.layer.dropout`.
+
+4. The differences between different recurrent layers
+--------------------------------------------------------
+Take LSTM as an example. There are several kinds of recurrent layers in PaddlePaddle:
+
+* :code:`paddle.layer.lstmemory`
+* :code:`paddle.networks.simple_lstm`
+* :code:`paddle.networks.lstmemory_group`
+* :code:`paddle.networks.bidirectional_lstm`
+
+According to implementations, recurrent layer can be classified into 2 types:
+
+1. Recurrent layer implemented by recurrent_group:
+
+  * Using this type of recurrent layers, users can access the intermediate value calculated by the recurrent unit within a timestep (eg: hidden states, memory cells, etc.)
+  * :code:`paddle.networks.lstmemory_group` belongs to this type of recurrent layers.
+
+2. Recurrent layer implemented as a complete operation：
+
+  * Users can only access output values when using this type of recurrent layers.
+  * :code:`paddle.networks.lstmemory_group` , :code:`paddle.networks.simple_lstm` and  :code:`paddle.networks.bidirectional_lstm` belong to this type of recurrent layer；
+
+By implementing recurrent layer as a complete operation, CPU and GPU calculations can be optimized. Therefore, the second type of recurrent layer is more efficient than the first one. In practical applications, we propose to use the second type of recurrent layers if there is no need to access the intermediate variable of LSTM.
+
+In addition, PaddlePaddle also contains a kind of LSTM calculation unit: :code:`paddle.networks.lstmemory_unit`:
+
+  * Unlike the recurrent layer described above, :code:`paddle.networks.lstmemory_unit` defines the computational process of an LSTM unit in a timestep. It is not a complete recurrent layer, nor can it receive sequence data as input.
+  * :code:`paddle.networks.lstmemory_unit` can only be used as a step function in recurrent_group.
+
+5. Can Softmax's calculation dimension be specified？
+--------------------------------------------------------------------
+
+We can't specify calculation dimension for PaddlePaddle's softmax. It can only be calculated by rows.
+In image tasks, for NCHW, if you need to calculate softmax in C dimension, you could use :code:`paddle.layer.switch_order` to change the dimension order, that is, convert NCHW to NHWC, then do the reshape operation and calculate softmax.
+
+6. Does PaddlePaddle support variable-dimensional data inputs
+----------------------------------------------------------------
+
+PaddlePaddle provides :code:`paddle.data_type.dense_array` to support variable-dimensional data input. Simply set the dimension of the data layer to a value larger than the dimension of the input data for occupancy.
diff --git a/doc/v2/howto/cmd_parameter/index_en.rst b/doc/v2/howto/cmd_parameter/index_en.rst
index 0e3c72d27aca063f1b6f1c23e55718dba373c40a..f49683948ef78f363e2439cc25332431830eeb24 100644
--- a/doc/v2/howto/cmd_parameter/index_en.rst
+++ b/doc/v2/howto/cmd_parameter/index_en.rst
@@ -2,10 +2,25 @@
 
 Set Command-line Parameters
 ===========================
+The implementation of deep learning algorithms has a variety of characteristics, such as running environment, running stage, structure of the model and the traning strategy. PaddlePaddle supports the user to set various command-line parameters flexibly, which helps to achieve control of the model training or prediction process.
+
+In this part, we take several actual scenarios as an example, and the use of some command-line parameters is displayed:
 
 ..  toctree::
   :maxdepth: 1
 
   use_case_en.md
+
+Then, we summarize and classify the use of all command-line parameters:
+
+..  toctree::
+  :maxdepth: 1
+
   arguments_en.md
+
+Finally, the detailed descriptions are given, and we try to explain the propeties and significance of these command-line parameters in detail:
+
+..  toctree::
+  :maxdepth: 1
+
   detail_introduction_en.md
diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index a34e22ff8765fccbd5ac3a284b7c6820f0055ec3..c425c71160a8fa3830a5fbdae1baaed850710877 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -104,7 +104,7 @@ cc_test(init_test SRCS init_test.cc DEPS init)
 cc_test(op_kernel_type_test SRCS op_kernel_type_test.cc DEPS place device_context framework_proto)
 cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc)
       
-# cc_test(channel_test SRCS channel_test.cc)
+cc_test(channel_test SRCS channel_test.cc)
 cc_test(tuple_test SRCS tuple_test.cc )
 cc_test(concurrency_test SRCS concurrency_test.cc DEPS go_op channel_close_op channel_create_op
         channel_send_op channel_recv_op sum_op select_op elementwise_add_op compare_op
diff --git a/paddle/fluid/framework/channel_impl.h b/paddle/fluid/framework/channel_impl.h
index c47d629289af2c3d1f7c30d711d338745bf5234c..e056779ea0dd0a31191b628f82724298efaf50ff 100644
--- a/paddle/fluid/framework/channel_impl.h
+++ b/paddle/fluid/framework/channel_impl.h
@@ -138,8 +138,8 @@ void ChannelImpl<T>::Send(T *item) {
 
   // If channel is closed, throw exception
   if (closed_) {
-    lock.unlock();
     send_return();
+    lock.unlock();
     PADDLE_THROW("Cannot send on closed channel");
   }
 
@@ -152,11 +152,9 @@ void ChannelImpl<T>::Send(T *item) {
     if (m != nullptr) {
       *(m->data) = std::move(*item);
       m->Notify();
-      lock.unlock();
       send_return();
       return;
     } else {
-      lock.unlock();
       Send(item);
       send_return();
       return;
@@ -169,8 +167,6 @@ void ChannelImpl<T>::Send(T *item) {
   if (buf_.size() < cap_) {
     // Copy to buffer
     buf_.push_back(std::move(*item));
-    // Release lock and return true
-    lock.unlock();
     send_return();
     return;
   }
@@ -181,8 +177,8 @@ void ChannelImpl<T>::Send(T *item) {
   sendq.push_back(m);
   m->Wait(lock);
   if (m->chan_closed) {
-    lock.unlock();
     send_return();
+    lock.unlock();
     PADDLE_THROW("Cannot send on closed channel");
   }
   send_return();
@@ -195,10 +191,7 @@ bool ChannelImpl<T>::Receive(T *item) {
 
   // If channel is closed and buffer is empty or
   // channel is unbuffered
-  if (closed_ && buf_.empty()) {
-    lock.unlock();
-    return recv_return(false);
-  }
+  if (closed_ && buf_.empty()) return recv_return(false);
 
   // If there is a sender, directly receive the value we want
   // from the sender. In case of a buffered channel, read from
@@ -229,7 +222,6 @@ bool ChannelImpl<T>::Receive(T *item) {
       } else
         return recv_return(Receive(item));
     }
-    lock.unlock();
     return recv_return(true);
   }
 
@@ -238,8 +230,7 @@ bool ChannelImpl<T>::Receive(T *item) {
     // Directly read from buffer
     *item = std::move(buf_.front());
     buf_.pop_front();
-    // Release lock and return true
-    lock.unlock();
+    // return true
     return recv_return(true);
   }
 
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index b39a1164dbd9877d9f45cc6415d74f930921a42f..f6a43804ef2fd73c4a2c2c3b3dfbb90bff1c451b 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -517,6 +517,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
   // do data transform
   Scope& new_scope = scope.NewScope();
 
+  std::vector<std::string> inplace_vars;
   for (auto& var_name_item : this->Inputs()) {
     for (auto& var_name : var_name_item.second) {
       auto* var = scope.FindVar(var_name);
@@ -529,10 +530,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
             auto out_var_names = OutputVars(true);
             if (std::find(out_var_names.begin(), out_var_names.end(),
                           var_name) != out_var_names.end()) {
-              PADDLE_THROW(
-                  "var %s is both input and output, "
-                  "does not support transform",
-                  var_name);
+              inplace_vars.push_back(var_name);
             }
             VLOG(3) << "Transform Variable " << var_name << " from "
                     << kernel_type_for_var << " to " << expected_kernel_key;
@@ -551,6 +549,13 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
   kernel_iter->second->Compute(
       ExecutionContext(*this, new_scope, *new_dev_ctx));
 
+  for (auto& var_name : inplace_vars) {
+    VLOG(3) << "share inplace var " + var_name + " back to it's original scope";
+    auto* original_tensor = GetMutableTensorFromVar(scope.FindVar(var_name));
+    auto* transformed_tensor = GetTensorFromVar(new_scope.FindVar(var_name));
+    original_tensor->ShareDataWith(*transformed_tensor);
+  }
+
   /*For profiling/benchmark only*/
   if (FLAGS_benchmark) {
     new_dev_ctx->Wait();
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index 8a90f231d741b01532fe3c18e11c54648d97f868..91f2db9354c2a00ec7e51ea4595c7cfa00da23ea 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/parallel_executor.h"
+#include <string>
 
 #include "ThreadPool.h"
 
@@ -102,30 +103,43 @@ void ParallelExecutor::BCastParamsToGPUs(
   auto *main_scope = member_->local_scopes_[0];
 
   for (auto *var_desc : startup_program.Block(0).AllVars()) {
+    size_t idx = var_desc->Name().find("@GRAD");
+    if (idx != std::string::npos) continue;
     if (var_desc->GetType() == proto::VarType::LOD_TENSOR) {
       auto &main_tensor =
           main_scope->FindVar(var_desc->Name())->Get<LoDTensor>();
-      ncclDataType_t data_type = platform::ToNCCLDataType(main_tensor.type());
-      auto &dims = main_tensor.dims();
-      size_t numel = main_tensor.numel();
 
-      platform::NCCLGroupGuard guard;
+      auto &dims = main_tensor.dims();
 
-      for (size_t i = 0; i < member_->places_.size(); ++i) {
-        auto place = member_->places_[i];
-        void *buffer;
-        if (i == 0) {
-          buffer = const_cast<void *>(main_tensor.data<void>());
-        } else {
+      if (paddle::platform::is_gpu_place(main_tensor.place())) {
+        size_t numel = main_tensor.numel();
+        ncclDataType_t data_type = platform::ToNCCLDataType(main_tensor.type());
+        platform::NCCLGroupGuard guard;
+        for (size_t i = 0; i < member_->places_.size(); ++i) {
+          auto place = member_->places_[i];
+          void *buffer;
+          if (i == 0) {
+            buffer = const_cast<void *>(main_tensor.data<void>());
+          } else {
+            auto local_scope = member_->local_scopes_[i];
+            auto *t =
+                local_scope->Var(var_desc->Name())->GetMutable<LoDTensor>();
+            t->Resize(dims);
+            buffer = t->mutable_data(place, main_tensor.type());
+          }
+          auto &nccl_ctx = member_->nccl_ctxs_->at(place);
+          platform::dynload::ncclBcast(buffer, numel, data_type, 0,
+                                       nccl_ctx.comm_, nccl_ctx.stream());
+        }
+      } else {
+        platform::CPUPlace cpu;
+        for (size_t i = 1; i < member_->places_.size(); ++i) {
           auto local_scope = member_->local_scopes_[i];
           auto *t = local_scope->Var(var_desc->Name())->GetMutable<LoDTensor>();
           t->Resize(dims);
-          buffer = t->mutable_data(place, main_tensor.type());
+          t->mutable_data(cpu, main_tensor.type());
+          paddle::framework::TensorCopy(main_tensor, cpu, t);
         }
-
-        auto &nccl_ctx = member_->nccl_ctxs_->at(place);
-        platform::dynload::ncclBcast(buffer, numel, data_type, 0,
-                                     nccl_ctx.comm_, nccl_ctx.stream());
       }
     }
     member_->nccl_ctxs_->WaitAll();
diff --git a/paddle/fluid/operators/compare_op.cc b/paddle/fluid/operators/compare_op.cc
index 86f7046058c7001fcaa588727b1cdc0f3f20c35f..9a139ab27ec53395a8d1ab1347dbce93ea68fd8e 100644
--- a/paddle/fluid/operators/compare_op.cc
+++ b/paddle/fluid/operators/compare_op.cc
@@ -29,6 +29,11 @@ class CompareOpProtoMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("Y", string::Sprintf(
                       "(LoDTensor) the right hand operand of %s operator",
                       comment.type));
+    AddAttr<bool>("force_cpu",
+                  "(bool, default false) Force fill output variable to cpu "
+                  "memory. Otherwise, fill output variable to the running "
+                  "device")
+        .SetDefault(false);
     AddOutput("Out", string::Sprintf(
                          "(LoDTensor) n-dim bool tensor. Each element is %s",
                          comment.equation));
@@ -75,7 +80,9 @@ class CompareOp : public framework::OperatorWithKernel {
       const framework::ExecutionContext &ctx) const override {
     framework::OpKernelType kt = OperatorWithKernel::GetExpectedKernelType(ctx);
     // CompareOp kernel's device type is decided by input tensor place
-    kt.place_ = ctx.Input<framework::LoDTensor>("X")->place();
+    bool force_cpu = ctx.Attr<bool>("force_cpu");
+    kt.place_ = force_cpu ? platform::CPUPlace()
+                          : ctx.Input<framework::LoDTensor>("X")->place();
     return kt;
   }
 };
diff --git a/paddle/fluid/operators/conditional_block_op.cc b/paddle/fluid/operators/conditional_block_op.cc
index 337b34e8f0bf4cb89753235205be9eb058dd01ab..bff2c34ec893d0e6212426b108dd98b0d0d0fb48 100644
--- a/paddle/fluid/operators/conditional_block_op.cc
+++ b/paddle/fluid/operators/conditional_block_op.cc
@@ -54,7 +54,18 @@ class ConditionalOp : public framework::OperatorBase {
           "numel should be 1, actual numel is %d",
           ips[0]->numel());
     }
-    return ips[0]->data<bool>()[0];
+    bool res = false;
+    if (platform::is_gpu_place(ips[0]->place())) {
+#ifdef PADDLE_WITH_CUDA
+      framework::LoDTensor cpu_tensor;
+      framework::TensorCopy(*ips[0], platform::CPUPlace(), &cpu_tensor);
+      platform::DeviceContextPool::Instance().Get(ips[0]->place())->Wait();
+      res = cpu_tensor.data<bool>()[0];
+#endif
+    } else {
+      res = ips[0]->data<bool>()[0];
+    }
+    return res;
   }
 };
 
diff --git a/paddle/fluid/operators/detail/CMakeLists.txt b/paddle/fluid/operators/detail/CMakeLists.txt
index 2b19f0448955d2d7582f23ac133c14ffdf5c9e49..f8cd2852f3eed7a960f22ebd45292b3cb56116bb 100644
--- a/paddle/fluid/operators/detail/CMakeLists.txt
+++ b/paddle/fluid/operators/detail/CMakeLists.txt
@@ -2,7 +2,8 @@ if(WITH_DISTRIBUTE)
   grpc_library(sendrecvop_grpc SRCS bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc
       grpc_server.cc variable_response.cc PROTO send_recv.proto DEPS lod_tensor selected_rows)
   set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
-  set_source_files_properties(test_serde.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-  cc_test(serde_test SRCS test_serde.cc variable_response.cc DEPS grpc++_unsecure grpc_unsecure gpr
+  set_source_files_properties(serde_test.cc grpc_server_test PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+  cc_test(serde_test SRCS serde_test.cc variable_response.cc DEPS grpc++_unsecure grpc_unsecure gpr
       cares zlib protobuf sendrecvop_grpc)
+  cc_test(grpc_server_test SRCS grpc_server_test.cc DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf)
 endif()
diff --git a/paddle/fluid/operators/detail/grpc_client.cc b/paddle/fluid/operators/detail/grpc_client.cc
index 9652bb888b5937390cc183a96ff7ebf5a4fa2426..ba9882ce244f69d5fbe3214d3c3470cd4ec87510 100644
--- a/paddle/fluid/operators/detail/grpc_client.cc
+++ b/paddle/fluid/operators/detail/grpc_client.cc
@@ -150,7 +150,8 @@ bool RPCClient::AsyncPrefetchVariable(const std::string& ep,
     s->response_call_back_ = ProcGetResponse;
 
     auto call = s->stub_g_.PrepareUnaryCall(
-        s->context_.get(), "/sendrecv.SendRecvService/GetVariable", req, &cq_);
+        s->context_.get(), "/sendrecv.SendRecvService/PrefetchVariable", req,
+        &cq_);
     call->StartCall();
     call->Finish(&s->reply_, &s->status_, (void*)s);
   });
diff --git a/paddle/fluid/operators/detail/grpc_server.cc b/paddle/fluid/operators/detail/grpc_server.cc
index 9691d1e86b111def5b82e022dd01795aaf5c7b0d..591b3e334acba19421f55474aba8de2fa3d3a4d4 100644
--- a/paddle/fluid/operators/detail/grpc_server.cc
+++ b/paddle/fluid/operators/detail/grpc_server.cc
@@ -128,6 +128,47 @@ class RequestGet final : public RequestBase {
   SimpleBlockQueue<MessageWithName>* queue_;
 };
 
+class RequestPrefetch final : public RequestBase {
+ public:
+  explicit RequestPrefetch(GrpcService::AsyncService* service,
+                           ::grpc::ServerCompletionQueue* cq,
+                           framework::Scope* scope,
+                           const platform::DeviceContext* dev_ctx,
+                           framework::Executor* executor,
+                           framework::ProgramDesc* program, int blkid)
+      : RequestBase(service, cq, dev_ctx),
+        responder_(&ctx_),
+        scope_(scope),
+        executor_(executor),
+        program_(program),
+        blkid_(blkid) {
+    int method_id = static_cast<int>(detail::GrpcMethod::kPrefetchVariable);
+    service_->RequestAsyncUnary(method_id, &ctx_, &request_, &responder_, cq_,
+                                cq_, this);
+  }
+
+  virtual ~RequestPrefetch() {}
+
+  virtual std::string GetReqName() { return request_.varname(); }
+
+  virtual void Process() {
+    // prefetch process...
+    ::grpc::ByteBuffer reply;
+    // TODO(Yancey1989): execute the Block which containers prefetch ops
+
+    responder_.Finish(reply, ::grpc::Status::OK, this);
+    status_ = FINISH;
+  }
+
+ protected:
+  sendrecv::VariableMessage request_;
+  ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_;
+  framework::Scope* scope_;
+  framework::Executor* executor_;
+  framework::ProgramDesc* program_;
+  int blkid_;
+};
+
 void AsyncGRPCServer::WaitClientGet(int count) {
   int fetch_barriers = 0;
   while (fetch_barriers < count) {
@@ -147,6 +188,7 @@ void AsyncGRPCServer::RunSyncUpdate() {
 
   cq_send_ = builder.AddCompletionQueue();
   cq_get_ = builder.AddCompletionQueue();
+  cq_prefetch_ = builder.AddCompletionQueue();
 
   server_ = builder.BuildAndStart();
   LOG(INFO) << "Server listening on " << address_ << std::endl;
@@ -155,6 +197,8 @@ void AsyncGRPCServer::RunSyncUpdate() {
       std::bind(&AsyncGRPCServer::TryToRegisterNewSendOne, this);
   std::function<void()> get_register =
       std::bind(&AsyncGRPCServer::TryToRegisterNewGetOne, this);
+  std::function<void()> prefetch_register =
+      std::bind(&AsyncGRPCServer::TryToRegisterNewPrefetchOne, this);
 
   t_send_.reset(
       new std::thread(std::bind(&AsyncGRPCServer::HandleRequest, this,
@@ -163,24 +207,27 @@ void AsyncGRPCServer::RunSyncUpdate() {
   t_get_.reset(
       new std::thread(std::bind(&AsyncGRPCServer::HandleRequest, this,
                                 cq_get_.get(), "cq_get", get_register)));
-
+  t_prefetch_.reset(new std::thread(
+      std::bind(&AsyncGRPCServer::HandleRequest, this, cq_prefetch_.get(),
+                "cq_prefetch", prefetch_register)));
   // wait server
   server_->Wait();
   t_send_->join();
   t_get_->join();
+  t_prefetch_->join();
 }
 
 void AsyncGRPCServer::ShutdownQueue() {
   std::unique_lock<std::mutex> lock(cq_mutex_);
   cq_send_->Shutdown();
   cq_get_->Shutdown();
-  is_shut_down_ = true;
 }
 
 // This URL explains why shutdown is complicate:
 void AsyncGRPCServer::ShutDown() {
-  server_->Shutdown();
+  is_shut_down_ = true;
   ShutdownQueue();
+  server_->Shutdown();
 }
 
 void AsyncGRPCServer::TryToRegisterNewSendOne() {
@@ -203,6 +250,18 @@ void AsyncGRPCServer::TryToRegisterNewGetOne() {
   VLOG(4) << "Create RequestGet status:" << get->Status();
 }
 
+void AsyncGRPCServer::TryToRegisterNewPrefetchOne() {
+  std::unique_lock<std::mutex> lock(cq_mutex_);
+  if (is_shut_down_) {
+    return;
+  }
+  RequestPrefetch* prefetch =
+      new RequestPrefetch(&service_, cq_prefetch_.get(), scope_, dev_ctx_,
+                          executor_, program_, prefetch_blk_id_);
+
+  VLOG(4) << "Create RequestPrefetch status:" << prefetch->Status();
+}
+
 // FIXME(typhoonzero): change cq_name to enum.
 void AsyncGRPCServer::HandleRequest(::grpc::ServerCompletionQueue* cq,
                                     std::string cq_name,
@@ -213,14 +272,14 @@ void AsyncGRPCServer::HandleRequest(::grpc::ServerCompletionQueue* cq,
   bool ok = false;
   while (true) {
     if (!cq->Next(&tag, &ok)) {
-      LOG(INFO) << cq_name << " get CompletionQueue shutdown!";
+      LOG(INFO) << cq_name << " CompletionQueue shutdown!";
       break;
     }
 
     PADDLE_ENFORCE(tag);
     // FIXME(typhoonzero): de-couple the barriers with recv_op
-    if (cq_name == "cq_get") WaitCond(1);
-    if (cq_name == "cq_send") WaitCond(0);
+    if (!is_shut_down_ && cq_name == "cq_get") WaitCond(1);
+    if (!is_shut_down_ && cq_name == "cq_send") WaitCond(0);
 
     RequestBase* base = (RequestBase*)tag;
     // reference:
diff --git a/paddle/fluid/operators/detail/grpc_server.h b/paddle/fluid/operators/detail/grpc_server.h
index 10e6dd45a901d36de4a6577db4da05551645eb73..dd5cf4b377cb8e4a53c9a161cb32985613de32eb 100644
--- a/paddle/fluid/operators/detail/grpc_server.h
+++ b/paddle/fluid/operators/detail/grpc_server.h
@@ -17,7 +17,9 @@ limitations under the License. */
 #include <grpc++/grpc++.h>
 #include <thread>
 
+#include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/framework/var_type.h"
@@ -53,6 +55,12 @@ class AsyncGRPCServer final {
 
   void SetDevCtx(const platform::DeviceContext *dev_ctx) { dev_ctx_ = dev_ctx; }
 
+  void SetProgram(framework::ProgramDesc *program) { program_ = program; }
+
+  void SetPrefetchBlkdId(int blkid) { prefetch_blk_id_ = blkid; }
+
+  void SetExecutor(framework::Executor *executor) { executor_ = executor; }
+
   const ReceivedMessage Get() { return this->var_recv_queue_.Pop(); }
 
   void Push(const std::string &msg_name) {
@@ -66,6 +74,7 @@ class AsyncGRPCServer final {
                      std::function<void()> TryToRegisterNewOne);
   void TryToRegisterNewSendOne();
   void TryToRegisterNewGetOne();
+  void TryToRegisterNewPrefetchOne();
   void ShutdownQueue();
 
  private:
@@ -73,6 +82,7 @@ class AsyncGRPCServer final {
   volatile bool is_shut_down_ = false;
   std::unique_ptr<::grpc::ServerCompletionQueue> cq_send_;
   std::unique_ptr<::grpc::ServerCompletionQueue> cq_get_;
+  std::unique_ptr<::grpc::ServerCompletionQueue> cq_prefetch_;
 
   GrpcService::AsyncService service_;
   std::unique_ptr<::grpc::Server> server_;
@@ -92,6 +102,11 @@ class AsyncGRPCServer final {
 
   std::unique_ptr<std::thread> t_send_;
   std::unique_ptr<std::thread> t_get_;
+  std::unique_ptr<std::thread> t_prefetch_;
+
+  int prefetch_blk_id_;
+  framework::ProgramDesc *program_;
+  framework::Executor *executor_;
 };
 
 };  // namespace detail
diff --git a/paddle/fluid/operators/detail/grpc_server_test.cc b/paddle/fluid/operators/detail/grpc_server_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..577374810696c039b8794fc151083ca7ddf43a10
--- /dev/null
+++ b/paddle/fluid/operators/detail/grpc_server_test.cc
@@ -0,0 +1,51 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <unistd.h>
+#include <string>
+#include <thread>
+
+#include "gtest/gtest.h"
+#include "paddle/fluid/operators/detail/grpc_client.h"
+#include "paddle/fluid/operators/detail/grpc_server.h"
+
+namespace framework = paddle::framework;
+namespace platform = paddle::platform;
+namespace detail = paddle::operators::detail;
+
+std::unique_ptr<detail::AsyncGRPCServer> rpc_service_;
+
+void StartServer(const std::string& endpoint) {
+  rpc_service_.reset(new detail::AsyncGRPCServer(endpoint));
+}
+
+TEST(PREFETCH, CPU) {
+  // start up a server instance backend
+  // TODO(Yancey1989): Need to start a server with optimize blocks and
+  // prefetch blocks.
+  std::thread server_thread(StartServer, "127.0.0.1:8889");
+  framework::Scope scope;
+  platform::CPUPlace place;
+  platform::CPUDeviceContext ctx(place);
+  // create var on local scope
+  std::string var_name("tmp_0");
+  auto var = scope.Var(var_name);
+  auto tensor = var->GetMutable<framework::LoDTensor>();
+  tensor->Resize({10, 10});
+
+  detail::RPCClient client;
+  client.AsyncPrefetchVariable("127.0.0.1:8889", ctx, scope, var_name, "");
+  server_thread.join();
+  rpc_service_.reset(nullptr);
+}
diff --git a/paddle/fluid/operators/detail/grpc_service.h b/paddle/fluid/operators/detail/grpc_service.h
index ae6f9db3bd31a4b4839b34e8e53dd87f1ecf4b1d..879e21933b452363c3fccacffb4d16ac1bfd6020 100644
--- a/paddle/fluid/operators/detail/grpc_service.h
+++ b/paddle/fluid/operators/detail/grpc_service.h
@@ -76,6 +76,7 @@ namespace detail {
 enum class GrpcMethod {
   kSendVariable,
   kGetVariable,
+  kPrefetchVariable,
 };
 
 static const int kGrpcNumMethods =
@@ -87,6 +88,8 @@ inline const char* GrpcMethodName(GrpcMethod id) {
       return "/sendrecv.SendRecvService/SendVariable";
     case GrpcMethod::kGetVariable:
       return "/sendrecv.SendRecvService/GetVariable";
+    case GrpcMethod::kPrefetchVariable:
+      return "/sendrecv.SendREcvService/PrefetchVariable";
   }
 
   // Shouldn't be reached.
diff --git a/paddle/fluid/operators/detail/send_recv.proto b/paddle/fluid/operators/detail/send_recv.proto
index 2d33f026e45c51d9a3812b2391381f74d6fddb29..fc12e82a7e6bd10262092d1ca367980df64e91c2 100644
--- a/paddle/fluid/operators/detail/send_recv.proto
+++ b/paddle/fluid/operators/detail/send_recv.proto
@@ -21,6 +21,8 @@ service SendRecvService {
   rpc SendVariable(VariableMessage) returns (VoidMessage) {}
   // Argument VariableMessage for GetVariable should only contain varname.
   rpc GetVariable(VariableMessage) returns (VariableMessage) {}
+  // Prefetch variable by Ids
+  rpc PrefetchVariable(VariableMessage) returns (VariableMessage) {}
 }
 
 // VariableMessage is serialized paddle variable message.
diff --git a/paddle/fluid/operators/detail/test_serde.cc b/paddle/fluid/operators/detail/serde_test.cc
similarity index 100%
rename from paddle/fluid/operators/detail/test_serde.cc
rename to paddle/fluid/operators/detail/serde_test.cc
diff --git a/paddle/fluid/operators/dropout_op.h b/paddle/fluid/operators/dropout_op.h
index b5ee86ae2d11dfc835e1a3a6826ce016baf38a29..0628b4b826d2730a8e3fb4842e4ae550b8c00569 100644
--- a/paddle/fluid/operators/dropout_op.h
+++ b/paddle/fluid/operators/dropout_op.h
@@ -11,9 +11,10 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
 #pragma once
+
 #include <random>
+
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 
diff --git a/paddle/fluid/operators/dropout_op_test.cc b/paddle/fluid/operators/dropout_op_test.cc
index db97ba4f64105c37c49cafbc3fbc4829c5077467..424d273c34b7e8d70c88b591c4fe45db61465f38 100644
--- a/paddle/fluid/operators/dropout_op_test.cc
+++ b/paddle/fluid/operators/dropout_op_test.cc
@@ -13,8 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <unistd.h>
+
 #include <string>
-#include <thread>
+#include <thread>  // NOLINT
+#include <vector>
 
 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/op_registry.h"
@@ -30,9 +32,9 @@ namespace m = paddle::operators::math;
 
 USE_OP(dropout);
 
-void Compare(f::Scope& scope, p::DeviceContext& ctx) {
+void Compare(f::Scope* scope, const p::DeviceContext& ctx) {
   // init
-  auto var = scope.Var("X");
+  auto var = scope->Var("X");
   auto tensor = var->GetMutable<f::LoDTensor>();
   tensor->Resize({10, 10});
 
@@ -44,12 +46,12 @@ void Compare(f::Scope& scope, p::DeviceContext& ctx) {
   TensorFromVector(init, ctx, tensor);
 
   auto place = ctx.GetPlace();
-  auto out_var = scope.Var("Out");
+  auto out_var = scope->Var("Out");
   auto out_tensor = out_var->GetMutable<f::LoDTensor>();
   out_tensor->Resize({10, 10});
   out_tensor->mutable_data<float>(place);  // allocate
 
-  auto mask_var = scope.Var("Mask");
+  auto mask_var = scope->Var("Mask");
   auto mask_tensor = mask_var->GetMutable<f::LoDTensor>();
   mask_tensor->Resize({10, 10});
   mask_tensor->mutable_data<float>(place);  // allocate
@@ -63,7 +65,7 @@ void Compare(f::Scope& scope, p::DeviceContext& ctx) {
   auto dropout_op = f::OpRegistry::CreateOp(
       "dropout", {{"X", {"X"}}}, {{"Out", {"Out"}}, {"Mask", {"Mask"}}}, attrs);
 
-  dropout_op->Run(scope, place);
+  dropout_op->Run(*scope, place);
 
   std::vector<float> out_vec;
   TensorToVector(*out_tensor, ctx, &out_vec);
@@ -81,6 +83,11 @@ void Compare(f::Scope& scope, p::DeviceContext& ctx) {
   }
 }
 
+// TODO(wyi): Due to
+// https://github.com/PaddlePaddle/Paddle/issues/9507, I temporarily
+// disable this test to remove the prevention of the merge of
+// unrelated PRs.
+/*
 TEST(Dropout, CPUDense) {
   f::Scope scope;
   p::CPUPlace place;
@@ -94,3 +101,4 @@ TEST(Dropout, GPUDense) {
   p::CUDADeviceContext ctx(place);
   Compare(scope, ctx);
 }
+*/
diff --git a/paddle/fluid/operators/increment_op.cc b/paddle/fluid/operators/increment_op.cc
index 6b5c3db13c0929ae0dd2fb2c981867df0a36c1ce..ec2e641679fedec776d48716f13445f44375ce3d 100644
--- a/paddle/fluid/operators/increment_op.cc
+++ b/paddle/fluid/operators/increment_op.cc
@@ -1,71 +1,46 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/op_registry.h"
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/increment_op.h"
 
 namespace paddle {
 namespace operators {
 
-class IncrementInferShape : public framework::InferShapeBase {
+class IncrementOp : public framework::OperatorWithKernel {
  public:
-  void operator()(framework::InferShapeContext *ctx) const override {
+  IncrementOp(const std::string &type, const framework::VariableNameMap &inputs,
+              const framework::VariableNameMap &outputs,
+              const framework::AttributeMap &attrs)
+      : OperatorWithKernel(type, inputs, outputs, attrs) {}
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("X"),
                    "Input(X) of IncrementOp should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
                    "Output(Out) of IncrementOp should not be null.");
     PADDLE_ENFORCE_EQ(1, framework::product(ctx->GetInputDim("X")));
     ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
+    ctx->ShareLoD("X", "Out");
   }
-};
-
-struct IncrementFunctor {
-  IncrementFunctor(const framework::LoDTensor &x, framework::LoDTensor *out,
-                   float value)
-      : x_(x), out_(out), value_(value) {}
-
-  template <typename T>
-  void operator()() const {
-    *out_->data<T>() = *x_.data<T>() + static_cast<T>(value_);
-  }
-
-  const framework::LoDTensor &x_;
-  framework::LoDTensor *out_;
-  float value_;
-};
-
-class IncrementOp : public framework::OperatorBase {
- public:
-  IncrementOp(const std::string &type, const framework::VariableNameMap &inputs,
-              const framework::VariableNameMap &outputs,
-              const framework::AttributeMap &attrs)
-      : OperatorBase(type, inputs, outputs, attrs) {}
-
- private:
-  void RunImpl(const framework::Scope &scope,
-               const platform::Place &place) const override {
-    auto &x = scope.FindVar(Input("X"))->Get<framework::LoDTensor>();
-    auto &out =
-        *scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensor>();
 
-    PADDLE_ENFORCE(platform::is_cpu_place(x.place()));
-    out.Resize(x.dims());
-    out.mutable_data(x.place(), x.type());
-    float value = Attr<float>("step");
-    VLOG(10) << Output("Out") << " increase " << Input("X") << " with "
-             << value;
-    framework::VisitDataType(framework::ToDataType(out.type()),
-                             IncrementFunctor(x, &out, value));
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    framework::OpKernelType kt = OperatorWithKernel::GetExpectedKernelType(ctx);
+    // IncrementOp kernel's device type is decided by input tensor place
+    kt.place_ = ctx.Input<framework::LoDTensor>("X")->place();
+    return kt;
   }
 };
 
@@ -108,5 +83,10 @@ class IncrementGradOpMaker : public framework::SingleGradOpDescMaker {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OPERATOR(increment, ops::IncrementOp, ops::IncrementInferShape,
-                  ops::IncrementOpMaker, ops::IncrementGradOpMaker);
+REGISTER_OPERATOR(increment, ops::IncrementOp, ops::IncrementOpMaker,
+                  ops::IncrementGradOpMaker);
+REGISTER_OP_CPU_KERNEL(
+    increment, ops::IncrementKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::IncrementKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::IncrementKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::IncrementKernel<paddle::platform::CPUDeviceContext, int64_t>)
diff --git a/paddle/fluid/operators/increment_op.cu b/paddle/fluid/operators/increment_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..7fb6425fe994751c4d7a025bb62e43a84c8d95c2
--- /dev/null
+++ b/paddle/fluid/operators/increment_op.cu
@@ -0,0 +1,22 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/increment_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    increment, ops::IncrementKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::IncrementKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::IncrementKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::IncrementKernel<paddle::platform::CUDADeviceContext, int64_t>)
diff --git a/paddle/fluid/operators/increment_op.h b/paddle/fluid/operators/increment_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..d0e8c66255ef68b975701fb6b3c145be2590e271
--- /dev/null
+++ b/paddle/fluid/operators/increment_op.h
@@ -0,0 +1,39 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename DeviceContext, typename T>
+class IncrementKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* x_tensor = context.Input<framework::Tensor>("X");
+    auto* out_tensor = context.Output<framework::Tensor>("Out");
+    float step = context.Attr<float>("step");
+
+    out_tensor->mutable_data<T>(context.GetPlace());
+    auto& dev =
+        *context.template device_context<DeviceContext>().eigen_device();
+    framework::EigenScalar<T>::From(*out_tensor).device(dev) =
+        framework::EigenScalar<T>::From(*x_tensor) + static_cast<T>(step);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc
index 9796fabdb6cd3331ce90dca26e3d5115623ae74c..c27ea1268321744f6566b8b65e98f0df6d408186 100644
--- a/paddle/fluid/operators/listen_and_serv_op.cc
+++ b/paddle/fluid/operators/listen_and_serv_op.cc
@@ -54,6 +54,24 @@ static void CreateTensorFromMessageType(framework::Variable *var,
   }
 }
 
+static void ParallelExecuteBlocks(const std::vector<size_t> &parallel_blkids,
+                                  framework::Executor *executor,
+                                  framework::ProgramDesc *program,
+                                  framework::Scope *scope) {
+  std::vector<std::future<void>> fs;
+  for (size_t idx : parallel_blkids) {
+    fs.push_back(framework::Async([&executor, &program, &scope, idx]() {
+      int run_block = idx;  // thread local
+      try {
+        executor->Run(*program, scope, run_block, false, false);
+      } catch (std::exception &e) {
+        LOG(ERROR) << "run sub program error " << e.what();
+      }
+    }));
+  }
+  for (size_t i = 0; i < fs.size(); ++i) fs[i].wait();
+}
+
 class ListenAndServOp : public framework::OperatorBase {
  public:
   ListenAndServOp(const std::string &type,
@@ -70,7 +88,6 @@ class ListenAndServOp : public framework::OperatorBase {
 
   void Stop() override {
     rpc_service_->Push(LISTEN_TERMINATE_MESSAGE);
-    rpc_service_->ShutDown();
     server_thread_->join();
   }
 
@@ -135,34 +152,27 @@ class ListenAndServOp : public framework::OperatorBase {
         break;
       }
 
-      // put optimize blocks in the thread pool to start run, the last block
-      // should be global ops.
       // NOTE: if is_gpu_place, CUDA kernels are laugched by multiple threads
       // and this will still work.
 
-      std::vector<std::future<void>> fs;
+      // The optimize blocks which have the same parent ID would run parallel
+      // TODO(Yancey1989): need to use ParallelExecutor for future
+      size_t last_parent_blkid = program->Block(1).Parent();
+      std::vector<size_t> parallel_blkids;
+      parallel_blkids.push_back(1);
       double ts = detail::GetTimestamp();
-      // block0 contains only listen_and_serv op, start run from block1.
-      for (int blkid = 1; blkid < num_blocks - 1; ++blkid) {
-        fs.push_back(
-            framework::Async([&executor, &program, &recv_scope, blkid]() {
-              int run_block = blkid;  // thread local
-              try {
-                executor.Run(*program, &recv_scope, run_block, false, false);
-              } catch (std::exception &e) {
-                LOG(ERROR) << "run sub program error " << e.what();
-              }
-            }));
-      }
-      for (int i = 0; i < num_blocks - 2; ++i) fs[i].wait();
-      // Run global block at final step, or block1 if there are only 2 blocks
-      if (num_blocks >= 2) {
-        try {
-          executor.Run(*program, &recv_scope, num_blocks - 1, false, false);
-        } catch (std::exception &e) {
-          LOG(ERROR) << "run sub program error " << e.what();
+      for (size_t blkid = 2; blkid < num_blocks; ++blkid) {
+        if (program->Block(blkid).Parent() != last_parent_blkid) {
+          for (size_t idx : parallel_blkids) VLOG(3) << idx;
+          ParallelExecuteBlocks(parallel_blkids, &executor, program,
+                                &recv_scope);
+          parallel_blkids.clear();
+          last_parent_blkid = program->Block(blkid).Parent();
         }
+        parallel_blkids.push_back(blkid);
       }
+      ParallelExecuteBlocks(parallel_blkids, &executor, program, &recv_scope);
+
       VLOG(2) << "run all blocks spent (ms) " << detail::GetTimestamp() - ts;
 
       // Reset the received sparse variables, the sum operator would not
@@ -178,10 +188,6 @@ class ListenAndServOp : public framework::OperatorBase {
       rpc_service_->WaitClientGet(fan_in);
       sparse_vars.clear();
     }  // while(true)
-
-    // for (int i = 0; i < num_blocks; ++i) {
-    //   delete blk_ctx_list[i];
-    // }
   }
 
  protected:
diff --git a/paddle/fluid/operators/lrn_op.cc b/paddle/fluid/operators/lrn_op.cc
index b36b5c3a339bd7e534bcc3eb7a2efef313cb2a5d..cb1568398125bbb57da974096da527200c1e0975 100644
--- a/paddle/fluid/operators/lrn_op.cc
+++ b/paddle/fluid/operators/lrn_op.cc
@@ -214,7 +214,10 @@ class LRNOpMaker : public framework::OpProtoAndCheckerMaker {
         "Defaults to \"NHWC\". Specify the data format of the output data, "
         "the input will be transformed automatically. ")
         .SetDefault("AnyLayout");
-    AddAttr<bool>("is_test", "").SetDefault(false);
+    AddAttr<bool>("is_test",
+                  "Turns on memory optimization that optimizes away "
+                  "unnecessary memory allocations. Used by MKLDNN.")
+        .SetDefault(false);
 
     AddComment(R"DOC(
 Local Response Normalization Operator.
diff --git a/paddle/fluid/operators/lrn_op.h b/paddle/fluid/operators/lrn_op.h
index 95796f7eecd2bcd61aab7944f557ca568b03e027..0fd3175e8579df9e61368cc151a94fa45e433884 100644
--- a/paddle/fluid/operators/lrn_op.h
+++ b/paddle/fluid/operators/lrn_op.h
@@ -121,6 +121,10 @@ class LRNGradKernel : public framework::OpKernel<T> {
     T alpha = ctx.Attr<T>("alpha");
     T beta = ctx.Attr<T>("beta");
 
+    PADDLE_ENFORCE(
+        !ctx.Attr<bool>("is_test"),
+        "is_test attribute should be set to False in training phase.");
+
     LRNGradFunctor<DeviceContext, T> f;
     f(ctx, x, out, mid, x_g, out_g, N, C, H, W, n, alpha, beta);
   }
diff --git a/paddle/fluid/operators/send_recv_op_test.cc b/paddle/fluid/operators/send_recv_op_test.cc
index e9fb845b475ff5776bf948ab120a44c16ed87aa0..04392b3e05fa2d8b602946ba03672bf2491dcfbc 100644
--- a/paddle/fluid/operators/send_recv_op_test.cc
+++ b/paddle/fluid/operators/send_recv_op_test.cc
@@ -122,7 +122,8 @@ void StartServerNet(bool is_sparse) {
 
   // sub program run in listen_and_serv_op, for simple test we use sum
   f::ProgramDesc program;
-  f::BlockDesc *optimize_block = program.MutableBlock(0);
+  const auto &root_block = program.Block(0);
+  auto *optimize_block = program.AppendBlock(root_block);
   // X for server side tensors, RX for received tensers, must be of same shape.
   AddOp("sum", {{"X", {"x0", "x1"}}}, {{"Out", {"Out"}}}, {}, optimize_block);
 
diff --git a/paddle/fluid/platform/profiler_test.cc b/paddle/fluid/platform/profiler_test.cc
index 366c82bf96e413add60448a56241d88cdcf2d1d4..45cc271bb888fc3a07ecc5daea6b549cb88b6d21 100644
--- a/paddle/fluid/platform/profiler_test.cc
+++ b/paddle/fluid/platform/profiler_test.cc
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/platform/profiler.h"
+#ifdef PADDLE_WITH_CUDA
 #include "cuda_runtime.h"
+#endif
 #include "gtest/gtest.h"
 
 TEST(Event, CpuElapsedTime) {
@@ -159,6 +161,7 @@ TEST(RecordEvent, RecordEvent) {
   DisableProfiler(EventSortingKey::kTotal, "/tmp/profiler");
 }
 
+#ifdef PADDLE_WITH_CUDA
 TEST(TMP, stream_wait) {
   cudaStream_t stream;
   cudaStreamCreate(&stream);
@@ -166,3 +169,4 @@ TEST(TMP, stream_wait) {
   cudaStreamSynchronize(stream);
   cudaStreamSynchronize(stream);
 }
+#endif
diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh
index 12c3a50d49a808c979545926ae98e0331473ef91..f916295cd7bc762e2052553b321344845f504648 100755
--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
@@ -125,9 +125,8 @@ EOF
             -DWITH_AVX=${WITH_AVX:-ON} \
             -DWITH_SWIG_PY=ON \
             -DWITH_STYLE_CHECK=OFF
-        make -j `nproc` gen_proto_py framework_py_proto
-        make -j `nproc` copy_paddle_pybind
-        make -j `nproc` paddle_docs paddle_docs_cn paddle_api_docs
+
+        make -j `nproc` paddle_docs paddle_apis
         popd
     fi
 
diff --git a/paddle/scripts/travis/build_doc.sh b/paddle/scripts/travis/build_doc.sh
index c3892491725dc960375f3f2d8fdda7f39dc84d04..d7527d99482bfe93a06e0de150a6c1ece36addde 100755
--- a/paddle/scripts/travis/build_doc.sh
+++ b/paddle/scripts/travis/build_doc.sh
@@ -7,9 +7,8 @@ cd $TRAVIS_BUILD_DIR/build
 
 # Compile Documentation only.
 cmake .. -DCMAKE_BUILD_TYPE=Release -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_DOC=ON -DWITH_STYLE_CHECK=OFF
-make -j `nproc` gen_proto_py framework_py_proto
-make -j `nproc` copy_paddle_pybind
-make -j `nproc` paddle_docs paddle_docs_cn paddle_api_docs
+
+make -j `nproc` paddle_docs paddle_apis
 
 # check websites for broken links
 linkchecker doc/v2/en/html/index.html
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index b0242b20b8d0fd81e624447d56e47865e1bf6438..f5ae553c8571e21b351d0f5507afdf1539843a51 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -73,12 +73,13 @@ add_custom_target(paddle_python ALL DEPENDS ${paddle_python_deps})
 set(PADDLE_PYTHON_PACKAGE_DIR ${CMAKE_CURRENT_BINARY_DIR}/dist/)
 
 if (WITH_TESTING)
+  add_subdirectory(paddle/reader/tests)
+  add_subdirectory(paddle/dataset/tests)
   if(NOT WITH_FLUID_ONLY)
     add_subdirectory(paddle/trainer_config_helpers/tests)
     if (WITH_SWIG_PY)
       # enable v2 API unittest only when paddle swig api is compiled
       add_subdirectory(paddle/v2/tests)
-      add_subdirectory(paddle/v2/reader/tests)
       add_subdirectory(paddle/v2/plot/tests)
     endif()
   endif()
diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py
index 1030c94e16376c326cb8b32926b8c47625cd38f0..d1cf04161ae4444ebc7da7fbc20e37dafe6c0fb1 100644
--- a/python/paddle/__init__.py
+++ b/python/paddle/__init__.py
@@ -14,8 +14,14 @@
 try:
     from version import full_version as __version__
     from version import commit as __git_commit__
+
 except ImportError:
     import sys
     sys.stderr.write('''Warning with import paddle: you should not 
      import paddle from the source directory; please install paddlepaddle*.whl firstly.'''
                      )
+
+import reader
+import dataset
+import batch
+batch = batch.batch
diff --git a/python/paddle/v2/minibatch.py b/python/paddle/batch.py
similarity index 100%
rename from python/paddle/v2/minibatch.py
rename to python/paddle/batch.py
diff --git a/python/paddle/v2/dataset/__init__.py b/python/paddle/dataset/__init__.py
similarity index 97%
rename from python/paddle/v2/dataset/__init__.py
rename to python/paddle/dataset/__init__.py
index c1acbecd9c313b02d6d33d2d04fd33fc1a8b026e..1fdfd49f1c970d89bfde9d12a24076d38c54ba66 100644
--- a/python/paddle/v2/dataset/__init__.py
+++ b/python/paddle/dataset/__init__.py
@@ -28,6 +28,7 @@ import wmt16
 import mq2007
 import flowers
 import voc2012
+import image
 
 __all__ = [
     'mnist',
@@ -43,4 +44,5 @@ __all__ = [
     'mq2007',
     'flowers',
     'voc2012',
+    'image',
 ]
diff --git a/python/paddle/v2/dataset/cifar.py b/python/paddle/dataset/cifar.py
similarity index 80%
rename from python/paddle/v2/dataset/cifar.py
rename to python/paddle/dataset/cifar.py
index 0a2a1ced11ee5cb2fb407b229ce810d553c2fa46..07f4dcbdab2fecf84a0a7042a48a8c8a9e5f880d 100644
--- a/python/paddle/v2/dataset/cifar.py
+++ b/python/paddle/dataset/cifar.py
@@ -31,7 +31,7 @@ images per class.
 import cPickle
 import itertools
 import numpy
-import paddle.v2.dataset.common
+import paddle.dataset.common
 import tarfile
 
 __all__ = ['train100', 'test100', 'train10', 'test10', 'convert']
@@ -75,7 +75,7 @@ def train100():
     :rtype: callable
     """
     return reader_creator(
-        paddle.v2.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5),
+        paddle.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5),
         'train')
 
 
@@ -90,7 +90,7 @@ def test100():
     :rtype: callable
     """
     return reader_creator(
-        paddle.v2.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5),
+        paddle.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5),
         'test')
 
 
@@ -105,7 +105,7 @@ def train10():
     :rtype: callable
     """
     return reader_creator(
-        paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5),
+        paddle.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5),
         'data_batch')
 
 
@@ -120,20 +120,20 @@ def test10():
     :rtype: callable
     """
     return reader_creator(
-        paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5),
+        paddle.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5),
         'test_batch')
 
 
 def fetch():
-    paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5)
-    paddle.v2.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5)
+    paddle.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5)
+    paddle.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5)
 
 
 def convert(path):
     """
     Converts dataset to recordio format
     """
-    paddle.v2.dataset.common.convert(path, train100(), 1000, "cifar_train100")
-    paddle.v2.dataset.common.convert(path, test100(), 1000, "cifar_test100")
-    paddle.v2.dataset.common.convert(path, train10(), 1000, "cifar_train10")
-    paddle.v2.dataset.common.convert(path, test10(), 1000, "cifar_test10")
+    paddle.dataset.common.convert(path, train100(), 1000, "cifar_train100")
+    paddle.dataset.common.convert(path, test100(), 1000, "cifar_test100")
+    paddle.dataset.common.convert(path, train10(), 1000, "cifar_train10")
+    paddle.dataset.common.convert(path, test10(), 1000, "cifar_test10")
diff --git a/python/paddle/v2/dataset/common.py b/python/paddle/dataset/common.py
similarity index 93%
rename from python/paddle/v2/dataset/common.py
rename to python/paddle/dataset/common.py
index c6ff09a1d1e3ca56877e986c3ed3ae9ecd0a7316..68660601c161d2332b17b448fae089506238ba78 100644
--- a/python/paddle/v2/dataset/common.py
+++ b/python/paddle/dataset/common.py
@@ -19,7 +19,7 @@ import errno
 import shutil
 import sys
 import importlib
-import paddle.v2.dataset
+import paddle.dataset
 import cPickle
 import glob
 import cPickle as pickle
@@ -105,24 +105,24 @@ def download(url, module_name, md5sum, save_name=None):
 
 def fetch_all():
     for module_name in filter(lambda x: not x.startswith("__"),
-                              dir(paddle.v2.dataset)):
+                              dir(paddle.dataset)):
         if "fetch" in dir(
-                importlib.import_module("paddle.v2.dataset.%s" % module_name)):
+                importlib.import_module("paddle.dataset.%s" % module_name)):
             getattr(
-                importlib.import_module("paddle.v2.dataset.%s" % module_name),
+                importlib.import_module("paddle.dataset.%s" % module_name),
                 "fetch")()
 
 
 def fetch_all_recordio(path):
     for module_name in filter(lambda x: not x.startswith("__"),
-                              dir(paddle.v2.dataset)):
+                              dir(paddle.dataset)):
         if "convert" in dir(
-                importlib.import_module("paddle.v2.dataset.%s" % module_name)) and \
+                importlib.import_module("paddle.dataset.%s" % module_name)) and \
                 not module_name == "common":
             ds_path = os.path.join(path, module_name)
             must_mkdirs(ds_path)
             getattr(
-                importlib.import_module("paddle.v2.dataset.%s" % module_name),
+                importlib.import_module("paddle.dataset.%s" % module_name),
                 "convert")(ds_path)
 
 
@@ -130,7 +130,7 @@ def split(reader, line_count, suffix="%05d.pickle", dumper=cPickle.dump):
     """
     you can call the function as:
 
-    split(paddle.v2.dataset.cifar.train10(), line_count=1000,
+    split(paddle.dataset.cifar.train10(), line_count=1000,
         suffix="imikolov-train-%05d.pickle")
 
     the output files as:
diff --git a/python/paddle/v2/dataset/conll05.py b/python/paddle/dataset/conll05.py
similarity index 88%
rename from python/paddle/v2/dataset/conll05.py
rename to python/paddle/dataset/conll05.py
index 0d544efac9cd20157f87b5cd3b68f97ab5ed2dbc..4e94ce89892f8e6822c15fdc510805e75dfca988 100644
--- a/python/paddle/v2/dataset/conll05.py
+++ b/python/paddle/dataset/conll05.py
@@ -23,7 +23,7 @@ to initialize SRL model.
 import tarfile
 import gzip
 import itertools
-import paddle.v2.dataset.common
+import paddle.dataset.common
 
 __all__ = ['test, get_dict', 'get_embedding', 'convert']
 
@@ -203,14 +203,11 @@ def get_dict():
     Get the word, verb and label dictionary of Wikipedia corpus.
     """
     word_dict = load_dict(
-        paddle.v2.dataset.common.download(WORDDICT_URL, 'conll05st',
-                                          WORDDICT_MD5))
+        paddle.dataset.common.download(WORDDICT_URL, 'conll05st', WORDDICT_MD5))
     verb_dict = load_dict(
-        paddle.v2.dataset.common.download(VERBDICT_URL, 'conll05st',
-                                          VERBDICT_MD5))
+        paddle.dataset.common.download(VERBDICT_URL, 'conll05st', VERBDICT_MD5))
     label_dict = load_label_dict(
-        paddle.v2.dataset.common.download(TRGDICT_URL, 'conll05st',
-                                          TRGDICT_MD5))
+        paddle.dataset.common.download(TRGDICT_URL, 'conll05st', TRGDICT_MD5))
     return word_dict, verb_dict, label_dict
 
 
@@ -218,7 +215,7 @@ def get_embedding():
     """
     Get the trained word vector based on Wikipedia corpus.
     """
-    return paddle.v2.dataset.common.download(EMB_URL, 'conll05st', EMB_MD5)
+    return paddle.dataset.common.download(EMB_URL, 'conll05st', EMB_MD5)
 
 
 def test():
@@ -235,23 +232,23 @@ def test():
     """
     word_dict, verb_dict, label_dict = get_dict()
     reader = corpus_reader(
-        paddle.v2.dataset.common.download(DATA_URL, 'conll05st', DATA_MD5),
+        paddle.dataset.common.download(DATA_URL, 'conll05st', DATA_MD5),
         words_name='conll05st-release/test.wsj/words/test.wsj.words.gz',
         props_name='conll05st-release/test.wsj/props/test.wsj.props.gz')
     return reader_creator(reader, word_dict, verb_dict, label_dict)
 
 
 def fetch():
-    paddle.v2.dataset.common.download(WORDDICT_URL, 'conll05st', WORDDICT_MD5)
-    paddle.v2.dataset.common.download(VERBDICT_URL, 'conll05st', VERBDICT_MD5)
-    paddle.v2.dataset.common.download(TRGDICT_URL, 'conll05st', TRGDICT_MD5)
-    paddle.v2.dataset.common.download(EMB_URL, 'conll05st', EMB_MD5)
-    paddle.v2.dataset.common.download(DATA_URL, 'conll05st', DATA_MD5)
+    paddle.dataset.common.download(WORDDICT_URL, 'conll05st', WORDDICT_MD5)
+    paddle.dataset.common.download(VERBDICT_URL, 'conll05st', VERBDICT_MD5)
+    paddle.dataset.common.download(TRGDICT_URL, 'conll05st', TRGDICT_MD5)
+    paddle.dataset.common.download(EMB_URL, 'conll05st', EMB_MD5)
+    paddle.dataset.common.download(DATA_URL, 'conll05st', DATA_MD5)
 
 
 def convert(path):
     """
     Converts dataset to recordio format
     """
-    paddle.v2.dataset.common.convert(path, test(), 1000, "conl105_train")
-    paddle.v2.dataset.common.convert(path, test(), 1000, "conl105_test")
+    paddle.dataset.common.convert(path, test(), 1000, "conl105_train")
+    paddle.dataset.common.convert(path, test(), 1000, "conl105_test")
diff --git a/python/paddle/v2/dataset/flowers.py b/python/paddle/dataset/flowers.py
similarity index 99%
rename from python/paddle/v2/dataset/flowers.py
rename to python/paddle/dataset/flowers.py
index 7bdddeaabec733ef26b3f766c6437f5c53d65044..f082e33be3357fbe405ab1a1ef5e0e601108a363 100644
--- a/python/paddle/v2/dataset/flowers.py
+++ b/python/paddle/dataset/flowers.py
@@ -34,8 +34,8 @@ import functools
 from common import download
 import tarfile
 import scipy.io as scio
-from paddle.v2.image import *
-from paddle.v2.reader import *
+from paddle.dataset.image import *
+from paddle.reader import *
 import os
 import numpy as np
 from multiprocessing import cpu_count
diff --git a/python/paddle/v2/image.py b/python/paddle/dataset/image.py
similarity index 100%
rename from python/paddle/v2/image.py
rename to python/paddle/dataset/image.py
diff --git a/python/paddle/v2/dataset/imdb.py b/python/paddle/dataset/imdb.py
similarity index 91%
rename from python/paddle/v2/dataset/imdb.py
rename to python/paddle/dataset/imdb.py
index 37c4296f9bcea7e16daa46f778934331513c30c4..5ff05b1e9b7f4c42909370a21beb140ecdcd6868 100644
--- a/python/paddle/v2/dataset/imdb.py
+++ b/python/paddle/dataset/imdb.py
@@ -20,7 +20,7 @@ of 25,000 highly polar movie reviews for training, and 25,000 for testing.
 Besides, this module also provides API for building dictionary.
 """
 
-import paddle.v2.dataset.common
+import paddle.dataset.common
 import collections
 import tarfile
 import re
@@ -37,8 +37,7 @@ def tokenize(pattern):
     Read files that match the given pattern.  Tokenize and yield each file.
     """
 
-    with tarfile.open(paddle.v2.dataset.common.download(URL, 'imdb',
-                                                        MD5)) as tarf:
+    with tarfile.open(paddle.dataset.common.download(URL, 'imdb', MD5)) as tarf:
         # Note that we should use tarfile.next(), which does
         # sequential access of member files, other than
         # tarfile.extractfile, which does random access and might
@@ -136,7 +135,7 @@ def word_dict():
 
 
 def fetch():
-    paddle.v2.dataset.common.download(URL, 'imdb', MD5)
+    paddle.dataset.common.download(URL, 'imdb', MD5)
 
 
 def convert(path):
@@ -144,5 +143,5 @@ def convert(path):
     Converts dataset to recordio format
     """
     w = word_dict()
-    paddle.v2.dataset.common.convert(path, lambda: train(w), 1000, "imdb_train")
-    paddle.v2.dataset.common.convert(path, lambda: test(w), 1000, "imdb_test")
+    paddle.dataset.common.convert(path, lambda: train(w), 1000, "imdb_train")
+    paddle.dataset.common.convert(path, lambda: test(w), 1000, "imdb_test")
diff --git a/python/paddle/v2/dataset/imikolov.py b/python/paddle/dataset/imikolov.py
similarity index 86%
rename from python/paddle/v2/dataset/imikolov.py
rename to python/paddle/dataset/imikolov.py
index 617c722c4165cdfed9e650fc968d623ef6ed4391..c6c0a0f54373dd068b2c493f6fbc9c8578593aef 100644
--- a/python/paddle/v2/dataset/imikolov.py
+++ b/python/paddle/dataset/imikolov.py
@@ -18,7 +18,7 @@ This module will download dataset from
 http://www.fit.vutbr.cz/~imikolov/rnnlm/ and parse training set and test set
 into paddle reader creators.
 """
-import paddle.v2.dataset.common
+import paddle.dataset.common
 import collections
 import tarfile
 
@@ -54,9 +54,9 @@ def build_dict(min_word_freq=50):
     train_filename = './simple-examples/data/ptb.train.txt'
     test_filename = './simple-examples/data/ptb.valid.txt'
     with tarfile.open(
-            paddle.v2.dataset.common.download(
-                paddle.v2.dataset.imikolov.URL, 'imikolov',
-                paddle.v2.dataset.imikolov.MD5)) as tf:
+            paddle.dataset.common.download(paddle.dataset.imikolov.URL,
+                                           'imikolov',
+                                           paddle.dataset.imikolov.MD5)) as tf:
         trainf = tf.extractfile(train_filename)
         testf = tf.extractfile(test_filename)
         word_freq = word_count(testf, word_count(trainf))
@@ -77,9 +77,9 @@ def build_dict(min_word_freq=50):
 def reader_creator(filename, word_idx, n, data_type):
     def reader():
         with tarfile.open(
-                paddle.v2.dataset.common.download(
-                    paddle.v2.dataset.imikolov.URL, 'imikolov',
-                    paddle.v2.dataset.imikolov.MD5)) as tf:
+                paddle.dataset.common.download(
+                    paddle.dataset.imikolov.URL, 'imikolov',
+                    paddle.dataset.imikolov.MD5)) as tf:
             f = tf.extractfile(filename)
 
             UNK = word_idx['<unk>']
@@ -145,7 +145,7 @@ def test(word_idx, n, data_type=DataType.NGRAM):
 
 
 def fetch():
-    paddle.v2.dataset.common.download(URL, "imikolov", MD5)
+    paddle.dataset.common.download(URL, "imikolov", MD5)
 
 
 def convert(path):
@@ -154,8 +154,7 @@ def convert(path):
     """
     N = 5
     word_dict = build_dict()
-    paddle.v2.dataset.common.convert(path,
-                                     train(word_dict, N), 1000,
-                                     "imikolov_train")
-    paddle.v2.dataset.common.convert(path,
-                                     test(word_dict, N), 1000, "imikolov_test")
+    paddle.dataset.common.convert(path,
+                                  train(word_dict, N), 1000, "imikolov_train")
+    paddle.dataset.common.convert(path,
+                                  test(word_dict, N), 1000, "imikolov_test")
diff --git a/python/paddle/v2/dataset/mnist.py b/python/paddle/dataset/mnist.py
similarity index 76%
rename from python/paddle/v2/dataset/mnist.py
rename to python/paddle/dataset/mnist.py
index 9f675bed895223e054cd3bb6e504fe1607f19858..6a1b8b5fac223c0d134cae69a61a0c2c00bc1feb 100644
--- a/python/paddle/v2/dataset/mnist.py
+++ b/python/paddle/dataset/mnist.py
@@ -17,7 +17,7 @@ MNIST dataset.
 This module will download dataset from http://yann.lecun.com/exdb/mnist/ and
 parse training set and test set into paddle reader creators.
 """
-import paddle.v2.dataset.common
+import paddle.dataset.common
 import subprocess
 import numpy
 import platform
@@ -85,10 +85,10 @@ def train():
     :rtype: callable
     """
     return reader_creator(
-        paddle.v2.dataset.common.download(TRAIN_IMAGE_URL, 'mnist',
-                                          TRAIN_IMAGE_MD5),
-        paddle.v2.dataset.common.download(TRAIN_LABEL_URL, 'mnist',
-                                          TRAIN_LABEL_MD5), 100)
+        paddle.dataset.common.download(TRAIN_IMAGE_URL, 'mnist',
+                                       TRAIN_IMAGE_MD5),
+        paddle.dataset.common.download(TRAIN_LABEL_URL, 'mnist',
+                                       TRAIN_LABEL_MD5), 100)
 
 
 def test():
@@ -102,22 +102,21 @@ def test():
     :rtype: callable
     """
     return reader_creator(
-        paddle.v2.dataset.common.download(TEST_IMAGE_URL, 'mnist',
-                                          TEST_IMAGE_MD5),
-        paddle.v2.dataset.common.download(TEST_LABEL_URL, 'mnist',
-                                          TEST_LABEL_MD5), 100)
+        paddle.dataset.common.download(TEST_IMAGE_URL, 'mnist', TEST_IMAGE_MD5),
+        paddle.dataset.common.download(TEST_LABEL_URL, 'mnist', TEST_LABEL_MD5),
+        100)
 
 
 def fetch():
-    paddle.v2.dataset.common.download(TRAIN_IMAGE_URL, 'mnist', TRAIN_IMAGE_MD5)
-    paddle.v2.dataset.common.download(TRAIN_LABEL_URL, 'mnist', TRAIN_LABEL_MD5)
-    paddle.v2.dataset.common.download(TEST_IMAGE_URL, 'mnist', TEST_IMAGE_MD5)
-    paddle.v2.dataset.common.download(TEST_LABEL_URL, 'mnist', TRAIN_LABEL_MD5)
+    paddle.dataset.common.download(TRAIN_IMAGE_URL, 'mnist', TRAIN_IMAGE_MD5)
+    paddle.dataset.common.download(TRAIN_LABEL_URL, 'mnist', TRAIN_LABEL_MD5)
+    paddle.dataset.common.download(TEST_IMAGE_URL, 'mnist', TEST_IMAGE_MD5)
+    paddle.dataset.common.download(TEST_LABEL_URL, 'mnist', TRAIN_LABEL_MD5)
 
 
 def convert(path):
     """
     Converts dataset to recordio format
     """
-    paddle.v2.dataset.common.convert(path, train(), 1000, "minist_train")
-    paddle.v2.dataset.common.convert(path, test(), 1000, "minist_test")
+    paddle.dataset.common.convert(path, train(), 1000, "minist_train")
+    paddle.dataset.common.convert(path, test(), 1000, "minist_test")
diff --git a/python/paddle/v2/dataset/movielens.py b/python/paddle/dataset/movielens.py
similarity index 95%
rename from python/paddle/v2/dataset/movielens.py
rename to python/paddle/dataset/movielens.py
index 5b61a9420af1bb81e1d826f8a7b69f34c306d382..ab11716202a8298c182e23b661eb1d2ac74bf4da 100644
--- a/python/paddle/v2/dataset/movielens.py
+++ b/python/paddle/dataset/movielens.py
@@ -23,7 +23,7 @@ set and test set into paddle reader creators.
 """
 
 import zipfile
-import paddle.v2.dataset.common
+import paddle.dataset.common
 import re
 import random
 import functools
@@ -100,7 +100,7 @@ USER_INFO = None
 
 
 def __initialize_meta_info__():
-    fn = paddle.v2.dataset.common.download(URL, "movielens", MD5)
+    fn = paddle.dataset.common.download(URL, "movielens", MD5)
     global MOVIE_INFO
     if MOVIE_INFO is None:
         pattern = re.compile(r'^(.*)\((\d+)\)$')
@@ -247,15 +247,15 @@ def unittest():
 
 
 def fetch():
-    paddle.v2.dataset.common.download(URL, "movielens", MD5)
+    paddle.dataset.common.download(URL, "movielens", MD5)
 
 
 def convert(path):
     """
     Converts dataset to recordio format
     """
-    paddle.v2.dataset.common.convert(path, train(), 1000, "movielens_train")
-    paddle.v2.dataset.common.convert(path, test(), 1000, "movielens_test")
+    paddle.dataset.common.convert(path, train(), 1000, "movielens_train")
+    paddle.dataset.common.convert(path, test(), 1000, "movielens_test")
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/v2/dataset/mq2007.py b/python/paddle/dataset/mq2007.py
similarity index 100%
rename from python/paddle/v2/dataset/mq2007.py
rename to python/paddle/dataset/mq2007.py
diff --git a/python/paddle/v2/dataset/sentiment.py b/python/paddle/dataset/sentiment.py
similarity index 87%
rename from python/paddle/v2/dataset/sentiment.py
rename to python/paddle/dataset/sentiment.py
index b0b9757c1a75d215cf8945b5cedbb1239fd43af7..f5461164fe6b816356e42fc7b7dcf388eccfadfb 100644
--- a/python/paddle/v2/dataset/sentiment.py
+++ b/python/paddle/dataset/sentiment.py
@@ -26,7 +26,7 @@ from itertools import chain
 import nltk
 from nltk.corpus import movie_reviews
 
-import paddle.v2.dataset.common
+import paddle.dataset.common
 
 __all__ = ['train', 'test', 'get_word_dict', 'convert']
 NUM_TRAINING_INSTANCES = 1600
@@ -39,13 +39,13 @@ def download_data_if_not_yet():
     """
     try:
         # make sure that nltk can find the data
-        if paddle.v2.dataset.common.DATA_HOME not in nltk.data.path:
-            nltk.data.path.append(paddle.v2.dataset.common.DATA_HOME)
+        if paddle.dataset.common.DATA_HOME not in nltk.data.path:
+            nltk.data.path.append(paddle.dataset.common.DATA_HOME)
         movie_reviews.categories()
     except LookupError:
         print "Downloading movie_reviews data set, please wait....."
         nltk.download(
-            'movie_reviews', download_dir=paddle.v2.dataset.common.DATA_HOME)
+            'movie_reviews', download_dir=paddle.dataset.common.DATA_HOME)
         print "Download data set success....."
         print "Path is " + nltk.data.find('corpora/movie_reviews').path
 
@@ -129,13 +129,12 @@ def test():
 
 
 def fetch():
-    nltk.download(
-        'movie_reviews', download_dir=paddle.v2.dataset.common.DATA_HOME)
+    nltk.download('movie_reviews', download_dir=paddle.dataset.common.DATA_HOME)
 
 
 def convert(path):
     """
     Converts dataset to recordio format
     """
-    paddle.v2.dataset.common.convert(path, train, 1000, "sentiment_train")
-    paddle.v2.dataset.common.convert(path, test, 1000, "sentiment_test")
+    paddle.dataset.common.convert(path, train, 1000, "sentiment_train")
+    paddle.dataset.common.convert(path, test, 1000, "sentiment_test")
diff --git a/python/paddle/dataset/tests/CMakeLists.txt b/python/paddle/dataset/tests/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..485c38a13b573664d8033c237272a10ebb7c9701
--- /dev/null
+++ b/python/paddle/dataset/tests/CMakeLists.txt
@@ -0,0 +1 @@
+py_test(test_image SRCS test_image.py)
diff --git a/python/paddle/v2/tests/cat.jpg b/python/paddle/dataset/tests/cat.jpg
similarity index 100%
rename from python/paddle/v2/tests/cat.jpg
rename to python/paddle/dataset/tests/cat.jpg
diff --git a/python/paddle/v2/dataset/tests/cifar_test.py b/python/paddle/dataset/tests/cifar_test.py
similarity index 88%
rename from python/paddle/v2/dataset/tests/cifar_test.py
rename to python/paddle/dataset/tests/cifar_test.py
index e0e18229da7818be5752ee592e094a00da286ad9..839125b09dd5c6432e3572374a7345a77a43f7cf 100644
--- a/python/paddle/v2/dataset/tests/cifar_test.py
+++ b/python/paddle/dataset/tests/cifar_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import paddle.v2.dataset.cifar
+import paddle.dataset.cifar
 import unittest
 
 
@@ -29,25 +29,25 @@ class TestCIFAR(unittest.TestCase):
 
     def test_test10(self):
         instances, max_label_value = self.check_reader(
-            paddle.v2.dataset.cifar.test10())
+            paddle.dataset.cifar.test10())
         self.assertEqual(instances, 10000)
         self.assertEqual(max_label_value, 9)
 
     def test_train10(self):
         instances, max_label_value = self.check_reader(
-            paddle.v2.dataset.cifar.train10())
+            paddle.dataset.cifar.train10())
         self.assertEqual(instances, 50000)
         self.assertEqual(max_label_value, 9)
 
     def test_test100(self):
         instances, max_label_value = self.check_reader(
-            paddle.v2.dataset.cifar.test100())
+            paddle.dataset.cifar.test100())
         self.assertEqual(instances, 10000)
         self.assertEqual(max_label_value, 99)
 
     def test_train100(self):
         instances, max_label_value = self.check_reader(
-            paddle.v2.dataset.cifar.train100())
+            paddle.dataset.cifar.train100())
         self.assertEqual(instances, 50000)
         self.assertEqual(max_label_value, 99)
 
diff --git a/python/paddle/v2/dataset/tests/common_test.py b/python/paddle/dataset/tests/common_test.py
similarity index 81%
rename from python/paddle/v2/dataset/tests/common_test.py
rename to python/paddle/dataset/tests/common_test.py
index cfa194eba38ea70311c4deeac2635dc0a0103576..e7cc02aa83061599ffefa18de6cb02ac0fc9e9b7 100644
--- a/python/paddle/v2/dataset/tests/common_test.py
+++ b/python/paddle/dataset/tests/common_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import paddle.v2.dataset.common
+import paddle.dataset.common
 import unittest
 import tempfile
 import glob
@@ -24,14 +24,14 @@ class TestCommon(unittest.TestCase):
         with open(temp_path, 'w') as f:
             f.write("Hello\n")
         self.assertEqual('09f7e02f1290be211da707a266f153b3',
-                         paddle.v2.dataset.common.md5file(temp_path))
+                         paddle.dataset.common.md5file(temp_path))
 
     def test_download(self):
         yi_avatar = 'https://avatars0.githubusercontent.com/u/1548775?v=3&s=460'
         self.assertEqual(
-            paddle.v2.dataset.common.DATA_HOME + '/test/1548775?v=3&s=460',
-            paddle.v2.dataset.common.download(
-                yi_avatar, 'test', 'f75287202d6622414c706c36c16f8e0d'))
+            paddle.dataset.common.DATA_HOME + '/test/1548775?v=3&s=460',
+            paddle.dataset.common.download(yi_avatar, 'test',
+                                           'f75287202d6622414c706c36c16f8e0d'))
 
     def test_split(self):
         def test_reader():
@@ -42,7 +42,7 @@ class TestCommon(unittest.TestCase):
             return reader
 
         _, temp_path = tempfile.mkstemp()
-        paddle.v2.dataset.common.split(
+        paddle.dataset.common.split(
             test_reader(), 4, suffix=temp_path + '/test-%05d.pickle')
         files = glob.glob(temp_path + '/test-%05d.pickle')
         self.assertEqual(len(files), 3)
@@ -52,7 +52,7 @@ class TestCommon(unittest.TestCase):
         for x in xrange(5):
             with open(temp_path + '/%05d.test' % x) as f:
                 f.write('%d\n' % x)
-        reader = paddle.v2.dataset.common.cluster_files_reader(
+        reader = paddle.dataset.common.cluster_files_reader(
             temp_path + '/*.test', 5, 0)
         for idx, e in enumerate(reader()):
             self.assertEqual(e, str("0"))
@@ -69,9 +69,9 @@ class TestCommon(unittest.TestCase):
             return reader
 
         path = tempfile.mkdtemp()
-        paddle.v2.dataset.common.convert(path,
-                                         test_reader(), num_shards,
-                                         'random_images')
+        paddle.dataset.common.convert(path,
+                                      test_reader(), num_shards,
+                                      'random_images')
 
         files = glob.glob(path + '/random_images-*')
         self.assertEqual(len(files), num_shards)
diff --git a/python/paddle/v2/dataset/tests/flowers_test.py b/python/paddle/dataset/tests/flowers_test.py
similarity index 89%
rename from python/paddle/v2/dataset/tests/flowers_test.py
rename to python/paddle/dataset/tests/flowers_test.py
index a8ae9a07acc22eb9d3c0cc5ebb07f8f11ed21233..06260fd796ce0271b7cec2f42a8a5a255a02dc24 100644
--- a/python/paddle/v2/dataset/tests/flowers_test.py
+++ b/python/paddle/dataset/tests/flowers_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import paddle.v2.dataset.flowers
+import paddle.dataset.flowers
 import unittest
 
 
@@ -30,19 +30,19 @@ class TestFlowers(unittest.TestCase):
 
     def test_train(self):
         instances, max_label_value = self.check_reader(
-            paddle.v2.dataset.flowers.train())
+            paddle.dataset.flowers.train())
         self.assertEqual(instances, 6149)
         self.assertEqual(max_label_value, 102)
 
     def test_test(self):
         instances, max_label_value = self.check_reader(
-            paddle.v2.dataset.flowers.test())
+            paddle.dataset.flowers.test())
         self.assertEqual(instances, 1020)
         self.assertEqual(max_label_value, 102)
 
     def test_valid(self):
         instances, max_label_value = self.check_reader(
-            paddle.v2.dataset.flowers.valid())
+            paddle.dataset.flowers.valid())
         self.assertEqual(instances, 1020)
         self.assertEqual(max_label_value, 102)
 
diff --git a/python/paddle/v2/dataset/tests/imdb_test.py b/python/paddle/dataset/tests/imdb_test.py
similarity index 77%
rename from python/paddle/v2/dataset/tests/imdb_test.py
rename to python/paddle/dataset/tests/imdb_test.py
index c4d82f26895d77d05c6e936bd636b1239e1a0cd8..539da049449cd273db0a9e260851ed40e1be0f04 100644
--- a/python/paddle/v2/dataset/tests/imdb_test.py
+++ b/python/paddle/dataset/tests/imdb_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import paddle.v2.dataset.imdb
+import paddle.dataset.imdb
 import unittest
 import re
 
@@ -30,15 +30,13 @@ class TestIMDB(unittest.TestCase):
 
     def test_build_dict(self):
         if self.word_idx == None:
-            self.word_idx = paddle.v2.dataset.imdb.build_dict(TRAIN_PATTERN,
-                                                              150)
+            self.word_idx = paddle.dataset.imdb.build_dict(TRAIN_PATTERN, 150)
 
         self.assertEqual(len(self.word_idx), 7036)
 
     def check_dataset(self, dataset, expected_size):
         if self.word_idx == None:
-            self.word_idx = paddle.v2.dataset.imdb.build_dict(TRAIN_PATTERN,
-                                                              150)
+            self.word_idx = paddle.dataset.imdb.build_dict(TRAIN_PATTERN, 150)
 
         sum = 0
         for l in dataset(self.word_idx):
@@ -47,10 +45,10 @@ class TestIMDB(unittest.TestCase):
         self.assertEqual(sum, expected_size)
 
     def test_train(self):
-        self.check_dataset(paddle.v2.dataset.imdb.train, 25000)
+        self.check_dataset(paddle.dataset.imdb.train, 25000)
 
     def test_test(self):
-        self.check_dataset(paddle.v2.dataset.imdb.test, 25000)
+        self.check_dataset(paddle.dataset.imdb.test, 25000)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/v2/dataset/tests/imikolov_test.py b/python/paddle/dataset/tests/imikolov_test.py
similarity index 79%
rename from python/paddle/v2/dataset/tests/imikolov_test.py
rename to python/paddle/dataset/tests/imikolov_test.py
index 714a75d6f1ff31697eec2d893d350a726d6390fe..233fd9fc8cea4cd0b5cd052580030fc8c993693c 100644
--- a/python/paddle/v2/dataset/tests/imikolov_test.py
+++ b/python/paddle/dataset/tests/imikolov_test.py
@@ -12,10 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import paddle.v2.dataset.imikolov
+import paddle.dataset.imikolov
 import unittest
 
-WORD_DICT = paddle.v2.dataset.imikolov.build_dict()
+WORD_DICT = paddle.dataset.imikolov.build_dict()
 
 
 class TestMikolov(unittest.TestCase):
@@ -25,7 +25,7 @@ class TestMikolov(unittest.TestCase):
 
     def test_train(self):
         n = 5
-        self.check_reader(paddle.v2.dataset.imikolov.train(WORD_DICT, n), n)
+        self.check_reader(paddle.dataset.imikolov.train(WORD_DICT, n), n)
 
         first_line = 'aer banknote berlitz calloway centrust cluett fromstein '\
             'gitano guterman hydro-quebec ipo kia memotec mlx nahb punts '\
@@ -34,16 +34,16 @@ class TestMikolov(unittest.TestCase):
             WORD_DICT.get(ch, WORD_DICT['<unk>'])
             for ch in first_line.split(' ')
         ]
-        for l in paddle.v2.dataset.imikolov.train(
+        for l in paddle.dataset.imikolov.train(
                 WORD_DICT, n=-1,
-                data_type=paddle.v2.dataset.imikolov.DataType.SEQ)():
+                data_type=paddle.dataset.imikolov.DataType.SEQ)():
             read_line = l[0][1:]
             break
         self.assertEqual(first_line, read_line)
 
     def test_test(self):
         n = 5
-        self.check_reader(paddle.v2.dataset.imikolov.test(WORD_DICT, n), n)
+        self.check_reader(paddle.dataset.imikolov.test(WORD_DICT, n), n)
 
         first_line = 'consumers may want to move their telephones a little '\
                 'closer to the tv set'
@@ -51,9 +51,9 @@ class TestMikolov(unittest.TestCase):
             WORD_DICT.get(ch, WORD_DICT['<unk>'])
             for ch in first_line.split(' ')
         ]
-        for l in paddle.v2.dataset.imikolov.test(
+        for l in paddle.dataset.imikolov.test(
                 WORD_DICT, n=-1,
-                data_type=paddle.v2.dataset.imikolov.DataType.SEQ)():
+                data_type=paddle.dataset.imikolov.DataType.SEQ)():
             read_line = l[0][1:]
             break
         self.assertEqual(first_line, read_line)
diff --git a/python/paddle/v2/dataset/tests/mnist_test.py b/python/paddle/dataset/tests/mnist_test.py
similarity index 91%
rename from python/paddle/v2/dataset/tests/mnist_test.py
rename to python/paddle/dataset/tests/mnist_test.py
index 1d344cac3e7483a351033570fbec75a4d19f4a55..8ada19d3f2ee13e194d08e19a4b86b558c69a0a7 100644
--- a/python/paddle/v2/dataset/tests/mnist_test.py
+++ b/python/paddle/dataset/tests/mnist_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import paddle.v2.dataset.mnist
+import paddle.dataset.mnist
 import unittest
 
 
@@ -29,13 +29,13 @@ class TestMNIST(unittest.TestCase):
 
     def test_train(self):
         instances, max_label_value = self.check_reader(
-            paddle.v2.dataset.mnist.train())
+            paddle.dataset.mnist.train())
         self.assertEqual(instances, 60000)
         self.assertEqual(max_label_value, 9)
 
     def test_test(self):
         instances, max_label_value = self.check_reader(
-            paddle.v2.dataset.mnist.test())
+            paddle.dataset.mnist.test())
         self.assertEqual(instances, 10000)
         self.assertEqual(max_label_value, 9)
 
diff --git a/python/paddle/v2/dataset/tests/mq2007_test.py b/python/paddle/dataset/tests/mq2007_test.py
similarity index 85%
rename from python/paddle/v2/dataset/tests/mq2007_test.py
rename to python/paddle/dataset/tests/mq2007_test.py
index 59847b6c18eadb12123cae824e8bce1051a69d4c..fba388724a8e84591df7150b41f8ea39a850fc31 100644
--- a/python/paddle/v2/dataset/tests/mq2007_test.py
+++ b/python/paddle/dataset/tests/mq2007_test.py
@@ -12,19 +12,19 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import paddle.v2.dataset.mq2007
+import paddle.dataset.mq2007
 import unittest
 
 
 class TestMQ2007(unittest.TestCase):
     def test_pairwise(self):
-        for label, query_left, query_right in paddle.v2.dataset.mq2007.test(
+        for label, query_left, query_right in paddle.dataset.mq2007.test(
                 format="pairwise"):
             self.assertEqual(query_left.shape(), (46, ))
             self.assertEqual(query_right.shape(), (46, ))
 
     def test_listwise(self):
-        for label_array, query_array in paddle.v2.dataset.mq2007.test(
+        for label_array, query_array in paddle.dataset.mq2007.test(
                 format="listwise"):
             self.assertEqual(len(label_array), len(query_array))
 
diff --git a/python/paddle/v2/tests/test_image.py b/python/paddle/dataset/tests/test_image.py
similarity index 97%
rename from python/paddle/v2/tests/test_image.py
rename to python/paddle/dataset/tests/test_image.py
index c78bbdc40a25878b21ba7e678afedf9d8f0a87cf..8bd56607ae1998935a3b3aaa0e3279515c2a540c 100644
--- a/python/paddle/v2/tests/test_image.py
+++ b/python/paddle/dataset/tests/test_image.py
@@ -15,7 +15,7 @@
 import unittest
 import numpy as np
 
-import paddle.v2.image as image
+import paddle.dataset.image as image
 
 
 class Image(unittest.TestCase):
diff --git a/python/paddle/v2/dataset/tests/test_sentiment.py b/python/paddle/dataset/tests/test_sentiment.py
similarity index 97%
rename from python/paddle/v2/dataset/tests/test_sentiment.py
rename to python/paddle/dataset/tests/test_sentiment.py
index 407405290734609059c1767600748d530e8a13a6..543f4b7378b583ea3857bf785cf330c43e535c2a 100644
--- a/python/paddle/v2/dataset/tests/test_sentiment.py
+++ b/python/paddle/dataset/tests/test_sentiment.py
@@ -17,7 +17,7 @@
 
 import unittest
 import nltk
-import paddle.v2.dataset.sentiment as st
+import paddle.dataset.sentiment as st
 from nltk.corpus import movie_reviews
 
 
diff --git a/python/paddle/v2/dataset/tests/voc2012_test.py b/python/paddle/dataset/tests/voc2012_test.py
similarity index 82%
rename from python/paddle/v2/dataset/tests/voc2012_test.py
rename to python/paddle/dataset/tests/voc2012_test.py
index 31e72ebf5eac0508d12783f9ceaa6eef0fa6d353..0d285461a8ae8a9cc69fbec0dcf5efc106b594f0 100644
--- a/python/paddle/v2/dataset/tests/voc2012_test.py
+++ b/python/paddle/dataset/tests/voc2012_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import paddle.v2.dataset.voc2012
+import paddle.dataset.voc2012
 import unittest
 
 
@@ -26,15 +26,15 @@ class TestVOC(unittest.TestCase):
         return sum
 
     def test_train(self):
-        count = self.check_reader(paddle.v2.dataset.voc_seg.train())
+        count = self.check_reader(paddle.dataset.voc_seg.train())
         self.assertEqual(count, 2913)
 
     def test_test(self):
-        count = self.check_reader(paddle.v2.dataset.voc_seg.test())
+        count = self.check_reader(paddle.dataset.voc_seg.test())
         self.assertEqual(count, 1464)
 
     def test_val(self):
-        count = self.check_reader(paddle.v2.dataset.voc_seg.val())
+        count = self.check_reader(paddle.dataset.voc_seg.val())
         self.assertEqual(count, 1449)
 
 
diff --git a/python/paddle/v2/dataset/tests/wmt16_test.py b/python/paddle/dataset/tests/wmt16_test.py
similarity index 89%
rename from python/paddle/v2/dataset/tests/wmt16_test.py
rename to python/paddle/dataset/tests/wmt16_test.py
index cef6c3216e7de8d9785a063976e63f88d90b24df..8b949d8bf5212d51016a33da322095bde2038200 100644
--- a/python/paddle/v2/dataset/tests/wmt16_test.py
+++ b/python/paddle/dataset/tests/wmt16_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import paddle.v2.dataset.wmt16
+import paddle.dataset.wmt16
 import unittest
 
 
@@ -34,28 +34,28 @@ class TestWMT16(unittest.TestCase):
 
     def test_train(self):
         for idx, sample in enumerate(
-                paddle.v2.dataset.wmt16.train(
+                paddle.dataset.wmt16.train(
                     src_dict_size=100000, trg_dict_size=100000)()):
             if idx >= 10: break
             self.checkout_one_sample(sample)
 
     def test_test(self):
         for idx, sample in enumerate(
-                paddle.v2.dataset.wmt16.test(
+                paddle.dataset.wmt16.test(
                     src_dict_size=1000, trg_dict_size=1000)()):
             if idx >= 10: break
             self.checkout_one_sample(sample)
 
     def test_val(self):
         for idx, sample in enumerate(
-                paddle.v2.dataset.wmt16.validation(
+                paddle.dataset.wmt16.validation(
                     src_dict_size=1000, trg_dict_size=1000)()):
             if idx >= 10: break
             self.checkout_one_sample(sample)
 
     def test_get_dict(self):
         dict_size = 1000
-        word_dict = paddle.v2.dataset.wmt16.get_dict("en", dict_size, True)
+        word_dict = paddle.dataset.wmt16.get_dict("en", dict_size, True)
         self.assertEqual(len(word_dict), dict_size)
         self.assertEqual(word_dict[0], "<s>")
         self.assertEqual(word_dict[1], "<e>")
diff --git a/python/paddle/v2/dataset/uci_housing.py b/python/paddle/dataset/uci_housing.py
similarity index 82%
rename from python/paddle/v2/dataset/uci_housing.py
rename to python/paddle/dataset/uci_housing.py
index f10bf7e42a1ead09b3eba0d61e55701215e4360f..6a56e9d5563c76ab6f524ccea9191693dc227010 100644
--- a/python/paddle/v2/dataset/uci_housing.py
+++ b/python/paddle/dataset/uci_housing.py
@@ -21,8 +21,7 @@ parse training set and test set into paddle reader creators.
 
 import numpy as np
 import os
-import paddle.v2.dataset.common
-from paddle.v2.parameters import Parameters
+import paddle.dataset.common
 
 __all__ = ['train', 'test']
 
@@ -85,7 +84,7 @@ def train():
     :rtype: callable
     """
     global UCI_TRAIN_DATA
-    load_data(paddle.v2.dataset.common.download(URL, 'uci_housing', MD5))
+    load_data(paddle.dataset.common.download(URL, 'uci_housing', MD5))
 
     def reader():
         for d in UCI_TRAIN_DATA:
@@ -105,7 +104,7 @@ def test():
     :rtype: callable
     """
     global UCI_TEST_DATA
-    load_data(paddle.v2.dataset.common.download(URL, 'uci_housing', MD5))
+    load_data(paddle.dataset.common.download(URL, 'uci_housing', MD5))
 
     def reader():
         for d in UCI_TEST_DATA:
@@ -114,21 +113,13 @@ def test():
     return reader
 
 
-def model():
-    tar_file = paddle.v2.dataset.common.download(URL_MODEL, 'fit_a_line.tar',
-                                                 MD5_MODEL)
-    with open(tar_file, 'r') as f:
-        parameters = Parameters.from_tar(f)
-    return parameters
-
-
 def fetch():
-    paddle.v2.dataset.common.download(URL, 'uci_housing', MD5)
+    paddle.dataset.common.download(URL, 'uci_housing', MD5)
 
 
 def convert(path):
     """
     Converts dataset to recordio format
     """
-    paddle.v2.dataset.common.convert(path, train(), 1000, "uci_housing_train")
-    paddle.v2.dataset.common.convert(path, test(), 1000, "uci_houseing_test")
+    paddle.dataset.common.convert(path, train(), 1000, "uci_housing_train")
+    paddle.dataset.common.convert(path, test(), 1000, "uci_houseing_test")
diff --git a/python/paddle/v2/dataset/voc2012.py b/python/paddle/dataset/voc2012.py
similarity index 97%
rename from python/paddle/v2/dataset/voc2012.py
rename to python/paddle/dataset/voc2012.py
index 617e212d67fbe37f9d9663e9c83c62045411fa77..9c945574dbcc15f5cee370206ed7e70ba8ab5014 100644
--- a/python/paddle/v2/dataset/voc2012.py
+++ b/python/paddle/dataset/voc2012.py
@@ -22,8 +22,8 @@ with segmentation has been increased from 7,062 to 9,993.
 import tarfile
 import io
 import numpy as np
-from paddle.v2.dataset.common import download
-from paddle.v2.image import *
+from paddle.dataset.common import download
+from paddle.dataset.image import *
 from PIL import Image
 
 __all__ = ['train', 'test', 'val']
diff --git a/python/paddle/v2/dataset/wmt14.py b/python/paddle/dataset/wmt14.py
similarity index 84%
rename from python/paddle/v2/dataset/wmt14.py
rename to python/paddle/dataset/wmt14.py
index 5104e29051e4480f3a7eb18421f1b519841b009b..f0908c737874fa7335cca5b5f0cba83190c9f90f 100644
--- a/python/paddle/v2/dataset/wmt14.py
+++ b/python/paddle/dataset/wmt14.py
@@ -22,8 +22,7 @@ parse training set and test set into paddle reader creators.
 import tarfile
 import gzip
 
-import paddle.v2.dataset.common
-from paddle.v2.parameters import Parameters
+import paddle.dataset.common
 
 __all__ = [
     'train',
@@ -123,7 +122,7 @@ def train(dict_size):
     :rtype: callable
     """
     return reader_creator(
-        paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN),
+        paddle.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN),
         'train/train', dict_size)
 
 
@@ -139,27 +138,20 @@ def test(dict_size):
     :rtype: callable
     """
     return reader_creator(
-        paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN),
+        paddle.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN),
         'test/test', dict_size)
 
 
 def gen(dict_size):
     return reader_creator(
-        paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN),
+        paddle.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN),
         'gen/gen', dict_size)
 
 
-def model():
-    tar_file = paddle.v2.dataset.common.download(URL_MODEL, 'wmt14', MD5_MODEL)
-    with gzip.open(tar_file, 'r') as f:
-        parameters = Parameters.from_tar(f)
-    return parameters
-
-
 def get_dict(dict_size, reverse=True):
     # if reverse = False, return dict = {'a':'001', 'b':'002', ...}
     # else reverse = true, return dict = {'001':'a', '002':'b', ...}
-    tar_file = paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN)
+    tar_file = paddle.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN)
     src_dict, trg_dict = __read_to_dict(tar_file, dict_size)
     if reverse:
         src_dict = {v: k for k, v in src_dict.items()}
@@ -168,8 +160,8 @@ def get_dict(dict_size, reverse=True):
 
 
 def fetch():
-    paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN)
-    paddle.v2.dataset.common.download(URL_MODEL, 'wmt14', MD5_MODEL)
+    paddle.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN)
+    paddle.dataset.common.download(URL_MODEL, 'wmt14', MD5_MODEL)
 
 
 def convert(path):
@@ -177,6 +169,5 @@ def convert(path):
     Converts dataset to recordio format
     """
     dict_size = 30000
-    paddle.v2.dataset.common.convert(path,
-                                     train(dict_size), 1000, "wmt14_train")
-    paddle.v2.dataset.common.convert(path, test(dict_size), 1000, "wmt14_test")
+    paddle.dataset.common.convert(path, train(dict_size), 1000, "wmt14_train")
+    paddle.dataset.common.convert(path, test(dict_size), 1000, "wmt14_test")
diff --git a/python/paddle/v2/dataset/wmt16.py b/python/paddle/dataset/wmt16.py
similarity index 94%
rename from python/paddle/v2/dataset/wmt16.py
rename to python/paddle/dataset/wmt16.py
index c8818f715beadd9499ae588f2c19a57fbf26f372..ad23338a96df6856c7e15cb5e3bb713021a55bf0 100644
--- a/python/paddle/v2/dataset/wmt16.py
+++ b/python/paddle/dataset/wmt16.py
@@ -33,7 +33,7 @@ import tarfile
 import gzip
 from collections import defaultdict
 
-import paddle.v2.dataset.common
+import paddle.dataset.common
 
 __all__ = [
     "train",
@@ -76,7 +76,7 @@ def __build_dict(tar_file, dict_size, save_path, lang):
 
 
 def __load_dict(tar_file, dict_size, lang, reverse=False):
-    dict_path = os.path.join(paddle.v2.dataset.common.DATA_HOME,
+    dict_path = os.path.join(paddle.dataset.common.DATA_HOME,
                              "wmt16/%s_%d.dict" % (lang, dict_size))
     if not os.path.exists(dict_path) or (
             len(open(dict_path, "r").readlines()) != dict_size):
@@ -178,8 +178,8 @@ def train(src_dict_size, trg_dict_size, src_lang="en"):
                                                    src_lang)
 
     return reader_creator(
-        tar_file=paddle.v2.dataset.common.download(DATA_URL, "wmt16", DATA_MD5,
-                                                   "wmt16.tar.gz"),
+        tar_file=paddle.dataset.common.download(DATA_URL, "wmt16", DATA_MD5,
+                                                "wmt16.tar.gz"),
         file_name="wmt16/train",
         src_dict_size=src_dict_size,
         trg_dict_size=trg_dict_size,
@@ -227,8 +227,8 @@ def test(src_dict_size, trg_dict_size, src_lang="en"):
                                                    src_lang)
 
     return reader_creator(
-        tar_file=paddle.v2.dataset.common.download(DATA_URL, "wmt16", DATA_MD5,
-                                                   "wmt16.tar.gz"),
+        tar_file=paddle.dataset.common.download(DATA_URL, "wmt16", DATA_MD5,
+                                                "wmt16.tar.gz"),
         file_name="wmt16/test",
         src_dict_size=src_dict_size,
         trg_dict_size=trg_dict_size,
@@ -274,8 +274,8 @@ def validation(src_dict_size, trg_dict_size, src_lang="en"):
                                                    src_lang)
 
     return reader_creator(
-        tar_file=paddle.v2.dataset.common.download(DATA_URL, "wmt16", DATA_MD5,
-                                                   "wmt16.tar.gz"),
+        tar_file=paddle.dataset.common.download(DATA_URL, "wmt16", DATA_MD5,
+                                                "wmt16.tar.gz"),
         file_name="wmt16/val",
         src_dict_size=src_dict_size,
         trg_dict_size=trg_dict_size,
@@ -303,12 +303,12 @@ def get_dict(lang, dict_size, reverse=False):
     if lang == "en": dict_size = min(dict_size, TOTAL_EN_WORDS)
     else: dict_size = min(dict_size, TOTAL_DE_WORDS)
 
-    dict_path = os.path.join(paddle.v2.dataset.common.DATA_HOME,
+    dict_path = os.path.join(paddle.dataset.common.DATA_HOME,
                              "wmt16/%s_%d.dict" % (lang, dict_size))
     assert os.path.exists(dict_path), "Word dictionary does not exist. "
     "Please invoke paddle.dataset.wmt16.train/test/validation first "
     "to build the dictionary."
-    tar_file = os.path.join(paddle.v2.dataset.common.DATA_HOME, "wmt16.tar.gz")
+    tar_file = os.path.join(paddle.dataset.common.DATA_HOME, "wmt16.tar.gz")
     return __load_dict(tar_file, dict_size, lang, reverse)
 
 
@@ -323,7 +323,7 @@ def convert(path, src_dict_size, trg_dict_size, src_lang):
     """Converts dataset to recordio format.
     """
 
-    paddle.v2.dataset.common.convert(
+    paddle.dataset.common.convert(
         path,
         train(
             src_dict_size=src_dict_size,
@@ -331,7 +331,7 @@ def convert(path, src_dict_size, trg_dict_size, src_lang):
             src_lang=src_lang),
         1000,
         "wmt16_train")
-    paddle.v2.dataset.common.convert(
+    paddle.dataset.common.convert(
         path,
         test(
             src_dict_size=src_dict_size,
@@ -339,7 +339,7 @@ def convert(path, src_dict_size, trg_dict_size, src_lang):
             src_lang=src_lang),
         1000,
         "wmt16_test")
-    paddle.v2.dataset.common.convert(
+    paddle.dataset.common.convert(
         path,
         validation(
             src_dict_size=src_dict_size,
diff --git a/python/paddle/fluid/distribute_transpiler.py b/python/paddle/fluid/distribute_transpiler.py
index 62147d325b699a62bd39cfbaca44874b7fc19a0f..24297ffe33bc720ff7b4f2b0dbd82452dc7e0ae2 100644
--- a/python/paddle/fluid/distribute_transpiler.py
+++ b/python/paddle/fluid/distribute_transpiler.py
@@ -338,15 +338,24 @@ class DistributeTranspiler:
             else:
                 self._append_pserver_non_opt_ops(block, op)
 
+        append_block = optimize_block
+        # append lr decay ops to the child block if exits
+        lr_ops = self._get_lr_ops()
+        if len(lr_ops) > 0:
+            for _, op in enumerate(lr_ops):
+                self._append_pserver_non_opt_ops(append_block, op)
+
+            append_block = pserver_program.create_block(append_block.idx)
+
         # append op to the current block
-        per_opt_block = optimize_block
+        per_opt_block = append_block
         for _, opt_op in enumerate(opt_op_on_pserver):
             for _, op in enumerate(self.optimize_ops):
                 # optimizer is connected to itself
                 if ufind.is_connected(op, opt_op) and \
                     op not in global_ops:
                     __append_optimize_op__(op, per_opt_block)
-            per_opt_block = pserver_program.create_block(0)
+            per_opt_block = pserver_program.create_block(append_block.idx)
 
         # append global ops
         for glb_op in global_ops:
@@ -786,3 +795,33 @@ class DistributeTranspiler:
             else:
                 iomap[key] = vars
         return iomap
+
+    def _get_lr_ops(self):
+        lr_ops = []
+        # find learning rate variables by optimize op
+        lr_vars = set()
+        for op in self.optimize_ops:
+            if self._is_opt_op(op):
+                lr_vars.add(op.input("LearningRate")[0])
+
+        find_ops = []
+        # find ops which output is lr var
+        block = self.program.global_block()
+        for op in block.ops:
+            if set(op.output_arg_names) & lr_vars:
+                find_ops.append(op)
+        # make a union find struct by the ops in default_main_program
+        ufind = UnionFind(block.ops)
+        for op1 in block.ops:
+            for op2 in block.ops:
+                # NOTE: we need to skip all optimize ops, since it is connected
+                # with forward/backward ops and lr ops, we only need the lr ops.
+                if op1 != op2 and self._is_op_connected(op1, op2) and \
+                    not self._is_opt_op(op1) and not self._is_opt_op(op2):
+                    ufind.union(op1, op2)
+        # find all ops which is related with lr var
+        for op1 in block.ops:
+            for op2 in find_ops:
+                if ufind.is_connected(op1, op2):
+                    lr_ops.append(op1)
+        return lr_ops
diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py
index 1bb1aa30ee1019c6f80eb64b6dc20459e7a3073b..b9a53eda9144e9e56cf9bc626db40cf4225bd87f 100644
--- a/python/paddle/fluid/layers/control_flow.py
+++ b/python/paddle/fluid/layers/control_flow.py
@@ -18,6 +18,7 @@ from tensor import assign, fill_constant
 from .. import core
 from ..framework import Program, Variable, Operator
 from ..layer_helper import LayerHelper, unique_name
+from ..initializer import force_init_on_cpu
 from ops import logical_and, logical_not, logical_or
 
 __all__ = [
@@ -949,7 +950,7 @@ def create_array(dtype):
         dtype=dtype)
 
 
-def less_than(x, y, cond=None, **ignored):
+def less_than(x, y, force_cpu=True, cond=None, **ignored):
     """
     **Less than**
 
@@ -958,6 +959,7 @@ def less_than(x, y, cond=None, **ignored):
     Args:
         x(Variable): First operand of *less_than*
         y(Variable): Second operand of *less_than*
+        force_cpu(Bool|True): The output data will be on CPU if set true.
         cond(Variable|None): Optional output variable to store the result of *less_than*
 
     Returns:
@@ -974,8 +976,11 @@ def less_than(x, y, cond=None, **ignored):
         cond.stop_gradient = True
 
     helper.append_op(
-        type='less_than', inputs={'X': [x],
-                                  'Y': [y]}, outputs={'Out': [cond]})
+        type='less_than',
+        inputs={'X': [x],
+                'Y': [y]},
+        outputs={'Out': [cond]},
+        attrs={'force_cpu': force_cpu or force_init_on_cpu()})
     return cond
 
 
@@ -1396,7 +1401,8 @@ class DynamicRNN(object):
                 type='less_than',
                 inputs={'X': self.step_idx,
                         'Y': self.max_seq_len},
-                outputs={'Out': self.cond})
+                outputs={'Out': self.cond},
+                attrs={'force_cpu': True})
 
         input_array = parent_block.create_var(
             name=unique_name.generate('dynamic_rnn_input_array'),
@@ -1445,7 +1451,11 @@ class DynamicRNN(object):
             for new_mem, mem_array in self.mem_link:
                 array_write(x=new_mem, i=self.step_idx, array=mem_array)
 
-            less_than(x=self.step_idx, y=self.max_seq_len, cond=self.cond)
+            less_than(
+                x=self.step_idx,
+                y=self.max_seq_len,
+                force_cpu=True,
+                cond=self.cond)
 
         self.status = DynamicRNN.AFTER_RNN
         for each_array in self.output_array:
diff --git a/python/paddle/fluid/tests/book/notest_rnn_encoder_decoer.py b/python/paddle/fluid/tests/book/notest_rnn_encoder_decoer.py
index 983f8f4dbeac83566839de25ec9765eb248be768..ce640dece8a5067bd10f410a2bb58874b7cc0908 100644
--- a/python/paddle/fluid/tests/book/notest_rnn_encoder_decoer.py
+++ b/python/paddle/fluid/tests/book/notest_rnn_encoder_decoer.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 import numpy as np
-import paddle.v2 as paddle
+import paddle
 import paddle.fluid as fluid
 import paddle.fluid.core as core
 import paddle.fluid.framework as framework
diff --git a/python/paddle/fluid/tests/book/test_fit_a_line.py b/python/paddle/fluid/tests/book/test_fit_a_line.py
index 93ef66851b0efd65361122853dadeefe11992ed5..6dfc2997ae0328a41fe22d13dfa8fc51d4d021a6 100644
--- a/python/paddle/fluid/tests/book/test_fit_a_line.py
+++ b/python/paddle/fluid/tests/book/test_fit_a_line.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import paddle.v2 as paddle
+import paddle
 import paddle.fluid as fluid
 import contextlib
 import numpy
diff --git a/python/paddle/fluid/tests/book/test_image_classification.py b/python/paddle/fluid/tests/book/test_image_classification.py
index b01c1875d64d7fc14e0141672f7e8eab2b6a0394..e8bb082be196b6342b1719235f1264bbe3d776ac 100644
--- a/python/paddle/fluid/tests/book/test_image_classification.py
+++ b/python/paddle/fluid/tests/book/test_image_classification.py
@@ -14,7 +14,7 @@
 
 from __future__ import print_function
 
-import paddle.v2 as paddle
+import paddle
 import paddle.fluid as fluid
 import contextlib
 import math
diff --git a/python/paddle/fluid/tests/book/test_label_semantic_roles.py b/python/paddle/fluid/tests/book/test_label_semantic_roles.py
index f488527e0bc69059bc44422aa28188441f3d5b54..c0a6df831acbfe2654a5941cf95c91343992ef13 100644
--- a/python/paddle/fluid/tests/book/test_label_semantic_roles.py
+++ b/python/paddle/fluid/tests/book/test_label_semantic_roles.py
@@ -15,8 +15,8 @@
 import math
 
 import numpy as np
-import paddle.v2 as paddle
-import paddle.v2.dataset.conll05 as conll05
+import paddle
+import paddle.dataset.conll05 as conll05
 import paddle.fluid as fluid
 from paddle.fluid.initializer import init_on_cpu
 import contextlib
diff --git a/python/paddle/fluid/tests/book/test_machine_translation.py b/python/paddle/fluid/tests/book/test_machine_translation.py
index 3a1a0859ecfd4ac5337e2112f8b22e32d8474f22..830d78df8b9e56b45f7e928562ef4b89e88f696d 100644
--- a/python/paddle/fluid/tests/book/test_machine_translation.py
+++ b/python/paddle/fluid/tests/book/test_machine_translation.py
@@ -14,7 +14,7 @@
 import contextlib
 
 import numpy as np
-import paddle.v2 as paddle
+import paddle
 import paddle.fluid as fluid
 import paddle.fluid.framework as framework
 import paddle.fluid.layers as pd
diff --git a/python/paddle/fluid/tests/book/test_recognize_digits.py b/python/paddle/fluid/tests/book/test_recognize_digits.py
index e85b97a7f430b6d752baa179f27a7d15bc4d9a81..e4997b4069f60ff4382b4254bc026ae8ae29b345 100644
--- a/python/paddle/fluid/tests/book/test_recognize_digits.py
+++ b/python/paddle/fluid/tests/book/test_recognize_digits.py
@@ -14,7 +14,7 @@
 from __future__ import print_function
 import argparse
 import paddle.fluid as fluid
-import paddle.v2 as paddle
+import paddle
 import sys
 import numpy
 import unittest
diff --git a/python/paddle/fluid/tests/book/test_recommender_system.py b/python/paddle/fluid/tests/book/test_recommender_system.py
index 2ce66d32c993672793b0db213267d1f80b5c49dd..2172c275b8082689a6ff5f2c3c27a2ff4e92275a 100644
--- a/python/paddle/fluid/tests/book/test_recommender_system.py
+++ b/python/paddle/fluid/tests/book/test_recommender_system.py
@@ -16,7 +16,7 @@ import math
 import sys
 import os
 import numpy as np
-import paddle.v2 as paddle
+import paddle
 import paddle.fluid as fluid
 import paddle.fluid.framework as framework
 import paddle.fluid.layers as layers
diff --git a/python/paddle/fluid/tests/book/test_understand_sentiment.py b/python/paddle/fluid/tests/book/test_understand_sentiment.py
index d2f3f7404697feb0768f873070b97aeb3ba0cd64..dedd153778d7ad9caeb5fa7090a980bc7f177dea 100644
--- a/python/paddle/fluid/tests/book/test_understand_sentiment.py
+++ b/python/paddle/fluid/tests/book/test_understand_sentiment.py
@@ -15,7 +15,7 @@ from __future__ import print_function
 
 import unittest
 import paddle.fluid as fluid
-import paddle.v2 as paddle
+import paddle
 import contextlib
 import math
 import numpy as np
diff --git a/python/paddle/fluid/tests/book/test_word2vec.py b/python/paddle/fluid/tests/book/test_word2vec.py
index 26b97c3e254f54b83515436660e44d4908c98fbe..8929779de9448d036e1528b64330b37463ab3988 100644
--- a/python/paddle/fluid/tests/book/test_word2vec.py
+++ b/python/paddle/fluid/tests/book/test_word2vec.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import paddle.v2 as paddle
+import paddle
 import paddle.fluid as fluid
 import unittest
 import os
diff --git a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py
index ad79e96b958b36a06c8a3cc990dbe3608e32c9ac..8818cf96fa8f08036f9e23aae786f67b5614b2b9 100644
--- a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py
+++ b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 import numpy as np
-import paddle.v2 as paddle
+import paddle
 import paddle.fluid as fluid
 import math
 import sys
diff --git a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py
index 204669d7e6176e9e8250e8aebc2d10441fa24b67..dfebb9a06ea4f290f128c486dcaccaeccdcef8c4 100644
--- a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py
+++ b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py
@@ -16,7 +16,7 @@ from __future__ import print_function
 
 import sys
 
-import paddle.v2 as paddle
+import paddle
 import paddle.fluid as fluid
 import math
 import sys
diff --git a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py
index a24834a6f0b19d1265f6c8d7089d31583af82d1f..a1ca6d981fafb401985d03e9f2d63d1cb41b21b5 100644
--- a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py
+++ b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 import numpy as np
-import paddle.v2 as paddle
+import paddle
 import paddle.fluid as fluid
 import paddle.fluid.core as core
 import paddle.fluid.framework as framework
diff --git a/python/paddle/fluid/tests/demo/fc_gan.py b/python/paddle/fluid/tests/demo/fc_gan.py
index 7452ea2a34aa0c75d8e0990639b29705033af98b..8ea1b2b15cc0c0eb5bca67a9c5a6ac6c6774e7e2 100644
--- a/python/paddle/fluid/tests/demo/fc_gan.py
+++ b/python/paddle/fluid/tests/demo/fc_gan.py
@@ -19,7 +19,7 @@ import os
 import matplotlib
 import numpy
 
-import paddle.v2 as paddle
+import paddle
 import paddle.fluid as fluid
 
 matplotlib.use('Agg')
diff --git a/python/paddle/fluid/tests/test_cpp_reader.py b/python/paddle/fluid/tests/test_cpp_reader.py
index 4b0d039b7e05a55980946a8949e32802e9e57c20..e54c73b2956dd99ee57804318130c261e133d21a 100644
--- a/python/paddle/fluid/tests/test_cpp_reader.py
+++ b/python/paddle/fluid/tests/test_cpp_reader.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import paddle.v2 as paddle
+import paddle
 import paddle.fluid as fluid
 import numpy as np
 import sys
diff --git a/python/paddle/fluid/tests/test_error_clip.py b/python/paddle/fluid/tests/test_error_clip.py
index b2fd5ae29c724da52df0a5d3cb56d2ec9e5530f3..89f4c64975802dc1827ec17ed3626b91e36d6971 100644
--- a/python/paddle/fluid/tests/test_error_clip.py
+++ b/python/paddle/fluid/tests/test_error_clip.py
@@ -14,7 +14,7 @@
 
 from __future__ import print_function
 import numpy as np
-import paddle.v2 as paddle
+import paddle
 import paddle.fluid as fluid
 
 BATCH_SIZE = 128
diff --git a/python/paddle/fluid/tests/test_gradient_clip.py b/python/paddle/fluid/tests/test_gradient_clip.py
index 68b682f68b1fd147b821cfdb1e0866cf8aa04bff..d530601f13be6810a8a99b13c92faf584df568f9 100644
--- a/python/paddle/fluid/tests/test_gradient_clip.py
+++ b/python/paddle/fluid/tests/test_gradient_clip.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 import numpy as np
-import paddle.v2 as paddle
+import paddle
 import paddle.fluid as fluid
 
 BATCH_SIZE = 128
diff --git a/python/paddle/fluid/tests/test_mnist_if_else_op.py b/python/paddle/fluid/tests/test_mnist_if_else_op.py
index 94395f6cfb4648967558ed265e798e3505c20fc1..d34f52db5ffc889f17513d034ad2c99f696b0cdf 100644
--- a/python/paddle/fluid/tests/test_mnist_if_else_op.py
+++ b/python/paddle/fluid/tests/test_mnist_if_else_op.py
@@ -12,12 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import paddle
 import paddle.fluid.layers as layers
 from paddle.fluid.framework import Program, program_guard, default_main_program, default_startup_program
 from paddle.fluid.executor import Executor
 from paddle.fluid.optimizer import MomentumOptimizer
 import paddle.fluid.core as core
-import paddle.v2 as paddle
 import unittest
 import numpy as np
 
diff --git a/python/paddle/fluid/tests/unittests/test_dyn_rnn.py b/python/paddle/fluid/tests/unittests/test_dyn_rnn.py
index df7ab0d29bdfc9410cd7dd4a8f2a7cd440ef4aba..0faed94deb4808783027d776e0f4c61da0db457a 100644
--- a/python/paddle/fluid/tests/unittests/test_dyn_rnn.py
+++ b/python/paddle/fluid/tests/unittests/test_dyn_rnn.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 import paddle.fluid as fluid
-import paddle.v2 as paddle
+import paddle
 import unittest
 import numpy
 
diff --git a/python/paddle/fluid/tests/unittests/test_dynrnn_static_input.py b/python/paddle/fluid/tests/unittests/test_dynrnn_static_input.py
index b03a70f1b9e61162d37541ffeba8510fc11c605a..d3f63ee2c414a71309be8f0af6d3e5912078ecdb 100644
--- a/python/paddle/fluid/tests/unittests/test_dynrnn_static_input.py
+++ b/python/paddle/fluid/tests/unittests/test_dynrnn_static_input.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 import unittest
-import paddle.v2 as paddle
+import paddle
 import paddle.fluid.core as core
 import paddle.fluid as fluid
 from paddle.fluid.backward import append_backward
diff --git a/python/paddle/fluid/tests/unittests/test_multi_pass_reader.py b/python/paddle/fluid/tests/unittests/test_multi_pass_reader.py
index 8add353303e3626bbce68199a100306d4858766a..0b7a29075939a548320185947b5afa7261029d49 100644
--- a/python/paddle/fluid/tests/unittests/test_multi_pass_reader.py
+++ b/python/paddle/fluid/tests/unittests/test_multi_pass_reader.py
@@ -15,8 +15,8 @@
 import unittest
 
 import paddle.fluid as fluid
-import paddle.v2 as paddle
-import paddle.v2.dataset.mnist as mnist
+import paddle
+import paddle.dataset.mnist as mnist
 
 
 class TestMultipleReader(unittest.TestCase):
diff --git a/python/paddle/fluid/tests/unittests/test_multiple_reader.py b/python/paddle/fluid/tests/unittests/test_multiple_reader.py
index 69f8acf81efaba8fc0f3df4cfe3a42dc4e477df2..a60a5d6c4af2b6b3652d0fe2089018b9403eee25 100644
--- a/python/paddle/fluid/tests/unittests/test_multiple_reader.py
+++ b/python/paddle/fluid/tests/unittests/test_multiple_reader.py
@@ -15,8 +15,8 @@
 import unittest
 
 import paddle.fluid as fluid
-import paddle.v2 as paddle
-import paddle.v2.dataset.mnist as mnist
+import paddle
+import paddle.dataset.mnist as mnist
 from shutil import copyfile
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor.py b/python/paddle/fluid/tests/unittests/test_parallel_executor.py
index bbfd03c638dac64de24c0b363f8342d8485f1223..95d0f9da47e97e94ff97eb3647ac5244d5409ca3 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor.py
@@ -16,9 +16,9 @@ import numpy
 import unittest
 
 import paddle.fluid as fluid
-import paddle.v2 as paddle
-import paddle.v2.dataset.mnist as mnist
-import paddle.v2.dataset.wmt16 as wmt16
+import paddle
+import paddle.dataset.mnist as mnist
+import paddle.dataset.wmt16 as wmt16
 
 
 def simple_fc_net():
diff --git a/python/paddle/fluid/tests/unittests/test_recordio_reader.py b/python/paddle/fluid/tests/unittests/test_recordio_reader.py
index 24a0074d9b9621d902d12eb8cb29d9b65be22ed3..640264d82f0dc7fa71bf882d5549e30b87b8d7c5 100644
--- a/python/paddle/fluid/tests/unittests/test_recordio_reader.py
+++ b/python/paddle/fluid/tests/unittests/test_recordio_reader.py
@@ -15,8 +15,8 @@
 import unittest
 
 import paddle.fluid as fluid
-import paddle.v2 as paddle
-import paddle.v2.dataset.mnist as mnist
+import paddle
+import paddle.dataset.mnist as mnist
 
 
 class TestRecordIO(unittest.TestCase):
diff --git a/python/paddle/v2/reader/__init__.py b/python/paddle/reader/__init__.py
similarity index 100%
rename from python/paddle/v2/reader/__init__.py
rename to python/paddle/reader/__init__.py
diff --git a/python/paddle/v2/reader/creator.py b/python/paddle/reader/creator.py
similarity index 62%
rename from python/paddle/v2/reader/creator.py
rename to python/paddle/reader/creator.py
index fda5246d74f598200b439774a25e80ec3e504077..4c905d959fad4e8c1a8826ce8dc60c5fa834514d 100644
--- a/python/paddle/v2/reader/creator.py
+++ b/python/paddle/reader/creator.py
@@ -16,7 +16,7 @@ Creator package contains some simple reader creator, which could
 be used in user program.
 """
 
-__all__ = ['np_array', 'text_file', 'recordio', 'cloud_reader']
+__all__ = ['np_array', 'text_file', 'recordio']
 
 
 def np_array(x):
@@ -66,7 +66,7 @@ def recordio(paths, buf_size=100):
     """
 
     import recordio as rec
-    import paddle.v2.reader.decorator as dec
+    import paddle.reader.decorator as dec
     import cPickle as pickle
 
     def reader():
@@ -83,48 +83,3 @@ def recordio(paths, buf_size=100):
         f.close()
 
     return dec.buffered(reader, buf_size)
-
-
-pass_num = 0
-
-
-def cloud_reader(paths, etcd_endpoints, timeout_sec=5, buf_size=64):
-    """
-    Create a data reader that yield a record one by one from
-        the paths:
-    :paths: path of recordio files, can be a string or a string list.
-    :etcd_endpoints: the endpoints for etcd cluster
-    :returns: data reader of recordio files.
-
-    ..  code-block:: python
-        from paddle.v2.reader.creator import cloud_reader
-        etcd_endpoints = "http://127.0.0.1:2379"
-        trainer.train.(
-            reader=cloud_reader(["/work/dataset/uci_housing/uci_housing*"], etcd_endpoints),
-        )
-    """
-    import os
-    import cPickle as pickle
-    import paddle.v2.master as master
-    c = master.client(etcd_endpoints, timeout_sec, buf_size)
-
-    if isinstance(paths, basestring):
-        path = [paths]
-    else:
-        path = paths
-    c.set_dataset(path)
-
-    def reader():
-        global pass_num
-        c.paddle_start_get_records(pass_num)
-        pass_num += 1
-
-        while True:
-            r, e = c.next_record()
-            if not r:
-                if e != -2:
-                    print "get record error: ", e
-                break
-            yield pickle.loads(r)
-
-    return reader
diff --git a/python/paddle/v2/reader/decorator.py b/python/paddle/reader/decorator.py
similarity index 100%
rename from python/paddle/v2/reader/decorator.py
rename to python/paddle/reader/decorator.py
diff --git a/python/paddle/v2/reader/tests/CMakeLists.txt b/python/paddle/reader/tests/CMakeLists.txt
similarity index 100%
rename from python/paddle/v2/reader/tests/CMakeLists.txt
rename to python/paddle/reader/tests/CMakeLists.txt
diff --git a/python/paddle/v2/reader/tests/__init__.py b/python/paddle/reader/tests/__init__.py
similarity index 100%
rename from python/paddle/v2/reader/tests/__init__.py
rename to python/paddle/reader/tests/__init__.py
diff --git a/python/paddle/v2/reader/tests/creator_test.py b/python/paddle/reader/tests/creator_test.py
similarity index 92%
rename from python/paddle/v2/reader/tests/creator_test.py
rename to python/paddle/reader/tests/creator_test.py
index 7fe374e663607607cd0839eb6ca9c70c4d15eef8..c4238c12a74759d52eb09f31ce1126cc93dd3489 100644
--- a/python/paddle/v2/reader/tests/creator_test.py
+++ b/python/paddle/reader/tests/creator_test.py
@@ -28,14 +28,14 @@
 import os
 import unittest
 import numpy as np
-import paddle.v2.reader.creator
+import paddle.reader.creator
 
 
 class TestNumpyArray(unittest.TestCase):
     def test_numpy_array(self):
         l = [[1, 2, 3], [4, 5, 6]]
         x = np.array(l, np.int32)
-        reader = paddle.v2.reader.creator.np_array(x)
+        reader = paddle.reader.creator.np_array(x)
         for idx, e in enumerate(reader()):
             self.assertItemsEqual(e, l[idx])
 
@@ -43,14 +43,14 @@ class TestNumpyArray(unittest.TestCase):
 class TestTextFile(unittest.TestCase):
     def test_text_file(self):
         path = os.path.join(os.path.dirname(__file__), "test_data_creator.txt")
-        reader = paddle.v2.reader.creator.text_file(path)
+        reader = paddle.reader.creator.text_file(path)
         for idx, e in enumerate(reader()):
             self.assertEqual(e, str(idx * 2) + " " + str(idx * 2 + 1))
 
 
 class TestRecordIO(unittest.TestCase):
     def do_test(self, path):
-        reader = paddle.v2.reader.creator.recordio(path)
+        reader = paddle.reader.creator.recordio(path)
         idx = 0
         for e in reader():
             if idx == 0:
diff --git a/python/paddle/v2/reader/tests/decorator_test.py b/python/paddle/reader/tests/decorator_test.py
similarity index 81%
rename from python/paddle/v2/reader/tests/decorator_test.py
rename to python/paddle/reader/tests/decorator_test.py
index 6b680e39f3fb299a14e7d8162470996d1d16b83d..bee24d3b6579db5e99ec66931df201fdf9e1af07 100644
--- a/python/paddle/v2/reader/tests/decorator_test.py
+++ b/python/paddle/reader/tests/decorator_test.py
@@ -15,7 +15,7 @@
 import time
 import unittest
 
-import paddle.v2.reader
+import paddle.reader
 
 
 def reader_creator_10(dur):
@@ -39,7 +39,7 @@ class TestMap(unittest.TestCase):
             yield "h"
             yield "i"
 
-        r = paddle.v2.reader.map_readers(tokenize, read)
+        r = paddle.reader.map_readers(tokenize, read)
         for i, e in enumerate(r()):
             self.assertEqual(e, i)
 
@@ -47,7 +47,7 @@ class TestMap(unittest.TestCase):
 class TestBuffered(unittest.TestCase):
     def test_read(self):
         for size in range(20):
-            b = paddle.v2.reader.buffered(reader_creator_10(0), size)
+            b = paddle.reader.buffered(reader_creator_10(0), size)
             c = 0
             for i in b():
                 self.assertEqual(i, c)
@@ -56,7 +56,7 @@ class TestBuffered(unittest.TestCase):
 
     def test_buffering(self):
         # read have 30ms delay.
-        b = paddle.v2.reader.buffered(reader_creator_10(0.03), 10)
+        b = paddle.reader.buffered(reader_creator_10(0.03), 10)
         last_time = time.time()
         for idx, i in enumerate(b()):
             elapsed_time = time.time() - last_time
@@ -70,17 +70,17 @@ class TestBuffered(unittest.TestCase):
 
 class TestCompose(unittest.TestCase):
     def test_compse(self):
-        reader = paddle.v2.reader.compose(
+        reader = paddle.reader.compose(
             reader_creator_10(0), reader_creator_10(0))
         for idx, e in enumerate(reader()):
             self.assertEqual(e, (idx, idx))
 
     def test_compose_not_aligned(self):
         total = 0
-        reader = paddle.v2.reader.compose(
-            paddle.v2.reader.chain(reader_creator_10(0), reader_creator_10(0)),
+        reader = paddle.reader.compose(
+            paddle.reader.chain(reader_creator_10(0), reader_creator_10(0)),
             reader_creator_10(0))
-        with self.assertRaises(paddle.v2.reader.ComposeNotAligned):
+        with self.assertRaises(paddle.reader.ComposeNotAligned):
             for e in reader():
                 total += 1
         # expecting 10, not 20
@@ -88,8 +88,8 @@ class TestCompose(unittest.TestCase):
 
     def test_compose_not_aligned_no_check(self):
         total = 0
-        reader = paddle.v2.reader.compose(
-            paddle.v2.reader.chain(reader_creator_10(0), reader_creator_10(0)),
+        reader = paddle.reader.compose(
+            paddle.reader.chain(reader_creator_10(0), reader_creator_10(0)),
             reader_creator_10(0),
             check_alignment=False)
         for e in reader():
@@ -100,7 +100,7 @@ class TestCompose(unittest.TestCase):
 
 class TestChain(unittest.TestCase):
     def test_chain(self):
-        c = paddle.v2.reader.chain(reader_creator_10(0), reader_creator_10(0))
+        c = paddle.reader.chain(reader_creator_10(0), reader_creator_10(0))
         idx = 0
         for e in c():
             self.assertEqual(e, idx % 10)
@@ -113,7 +113,7 @@ class TestShuffle(unittest.TestCase):
         case = [(0, True), (1, True), (10, False), (100, False)]
         a = reader_creator_10(0)
         for size, checkEq in case:
-            s = paddle.v2.reader.shuffle(a, size)
+            s = paddle.reader.shuffle(a, size)
             total = 0
             for idx, e in enumerate(s()):
                 if checkEq:
@@ -133,9 +133,9 @@ class TestXmap(unittest.TestCase):
         for order in orders:
             for tNum in thread_nums:
                 for size in buffered_size:
-                    reader = paddle.v2.reader.xmap_readers(mapper,
-                                                           reader_creator_10(0),
-                                                           tNum, size, order)
+                    reader = paddle.reader.xmap_readers(mapper,
+                                                        reader_creator_10(0),
+                                                        tNum, size, order)
                     for n in xrange(3):
                         result = []
                         for i in reader():
@@ -150,7 +150,7 @@ class TestPipeReader(unittest.TestCase):
     def test_pipe_reader(self):
         def example_reader(myfiles):
             for f in myfiles:
-                pr = paddle.v2.reader.PipeReader("cat %s" % f, bufsize=128)
+                pr = paddle.reader.PipeReader("cat %s" % f, bufsize=128)
                 for l in pr.get_line():
                     yield l
 
diff --git a/python/paddle/v2/reader/tests/test_data_creator.txt b/python/paddle/reader/tests/test_data_creator.txt
similarity index 100%
rename from python/paddle/v2/reader/tests/test_data_creator.txt
rename to python/paddle/reader/tests/test_data_creator.txt
diff --git a/python/paddle/v2/reader/tests/test_reader_recordio.dat b/python/paddle/reader/tests/test_reader_recordio.dat
similarity index 100%
rename from python/paddle/v2/reader/tests/test_reader_recordio.dat
rename to python/paddle/reader/tests/test_reader_recordio.dat
diff --git a/python/paddle/v2/reader/tests/test_recordio_creator.dat b/python/paddle/reader/tests/test_recordio_creator.dat
similarity index 100%
rename from python/paddle/v2/reader/tests/test_recordio_creator.dat
rename to python/paddle/reader/tests/test_recordio_creator.dat
diff --git a/python/paddle/v2/__init__.py b/python/paddle/v2/__init__.py
index df710c33d0c0ca16d358dac1eb42327e9cd4c7ae..02b0d077eefa431baed05c421a367ebe3581626c 100644
--- a/python/paddle/v2/__init__.py
+++ b/python/paddle/v2/__init__.py
@@ -22,17 +22,13 @@ import data_type
 import topology
 import networks
 import evaluator
-from . import dataset
-from . import reader
 from . import plot
 import attr
 import op
 import pooling
 import inference
 import networks
-import minibatch
 import plot
-import image
 import paddle.trainer.config_parser as cp
 
 __all__ = [
@@ -48,14 +44,11 @@ __all__ = [
     'data_type',
     'attr',
     'pooling',
-    'dataset',
-    'reader',
     'topology',
     'networks',
     'infer',
     'plot',
     'evaluator',
-    'image',
     'master',
 ]
 
@@ -153,4 +146,3 @@ def init(**kwargs):
 
 
 infer = inference.infer
-batch = minibatch.batch
diff --git a/python/paddle/v2/inference.py b/python/paddle/v2/inference.py
index 52f5b947fdec55eea45b9d34eddd576c981fa97c..14b64742fd09bf6c197c5d1aa2354271293df239 100644
--- a/python/paddle/v2/inference.py
+++ b/python/paddle/v2/inference.py
@@ -15,7 +15,7 @@
 import numpy
 import collections
 import topology
-import minibatch
+import paddle
 import cPickle
 
 __all__ = ['infer', 'Inference']
@@ -80,7 +80,7 @@ class Inference(object):
             for each_sample in input:
                 yield each_sample
 
-        reader = minibatch.batch(__reader_impl__, batch_size=batch_size)
+        reader = paddle.batch(__reader_impl__, batch_size=batch_size)
 
         self.__gradient_machine__.start()
         for data_batch in reader():
diff --git a/python/paddle/v2/layer.py b/python/paddle/v2/layer.py
index 6a2bb8d337b7667aa2b1e3ef0815bb80f6e38d6a..a188a03eb3698c972de92c9807f1bdb71a249330 100644
--- a/python/paddle/v2/layer.py
+++ b/python/paddle/v2/layer.py
@@ -20,7 +20,7 @@ The primary usage shows below.
 
 ..  code-block:: python
 
-    import paddle.v2 as paddle
+    import paddle
 
     img = paddle.layer.data(name='img', type=paddle.data_type.dense_vector(784))
     hidden = paddle.layer.fc(input=img, size=200)
diff --git a/python/paddle/v2/tests/CMakeLists.txt b/python/paddle/v2/tests/CMakeLists.txt
index b4333ed530ce464095ec38d72706949cc464fbe4..46e4feb8e1ce1d12f214f5c49b1b589a46110603 100644
--- a/python/paddle/v2/tests/CMakeLists.txt
+++ b/python/paddle/v2/tests/CMakeLists.txt
@@ -1,5 +1,4 @@
 py_test(test_op SRCS test_op.py)
-py_test(test_image SRCS test_image.py)
 py_test(test_layer SRCS test_layer.py)
 py_test(test_topology SRCS test_topology.py)
 py_test(test_rnn_layer SRCS test_rnn_layer.py)
diff --git a/python/paddle/v2/tests/test_paramconf_order.py b/python/paddle/v2/tests/test_paramconf_order.py
index 264442be182ea69c95b39b3bdb4c389d52eff66e..8320217da2795da756cf12a80f39279182789eef 100644
--- a/python/paddle/v2/tests/test_paramconf_order.py
+++ b/python/paddle/v2/tests/test_paramconf_order.py
@@ -27,6 +27,7 @@
 # limitations under the License.
 import unittest
 import math
+import paddle.dataset as dataset
 import paddle.v2 as paddle
 
 
@@ -40,7 +41,7 @@ def wordemb(inlayer):
 
 
 def train():
-    word_dict = paddle.dataset.imikolov.build_dict()
+    word_dict = dataset.imikolov.build_dict()
     dict_size = len(word_dict)
     # Every layer takes integer value of range [0, dict_size)
     firstword = paddle.layer.data(
diff --git a/python/setup.py.in b/python/setup.py.in
index 831d173d424b8c663f728af748ad1942bb20a418..d73a3a6a1c41b87efb9600ac59983bd16547ec6a 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -63,6 +63,8 @@ write_version_py(filename='@PADDLE_SOURCE_DIR@/python/paddle/version.py')
 
 packages=['paddle',
           'paddle.utils',
+          'paddle.dataset',
+          'paddle.reader',
           'paddle.fluid',
           'paddle.fluid.proto',
           'paddle.fluid.proto.profiler',
@@ -73,8 +75,6 @@ if '${WITH_FLUID_ONLY}'== 'OFF':
                'paddle.trainer',
                'paddle.trainer_config_helpers',
                'paddle.v2',
-               'paddle.v2.dataset',
-               'paddle.v2.reader',
                'paddle.v2.master',
                'paddle.v2.plot',
                'py_paddle']
diff --git a/tools/codestyle/cpplint_pre_commit.hook b/tools/codestyle/cpplint_pre_commit.hook
new file mode 100755
index 0000000000000000000000000000000000000000..94d1e23ce716f7f1d723bad5f1f4c60030f19eb7
--- /dev/null
+++ b/tools/codestyle/cpplint_pre_commit.hook
@@ -0,0 +1,12 @@
+#!/bin/bash
+
+TOTAL_ERRORS=0
+
+# The trick to remove deleted files: https://stackoverflow.com/a/2413151
+for file in $(git diff --cached --name-status | awk '$1 != "D" {print $2}'); do
+    cpplint $file;
+    TOTAL_ERRORS=$(expr $TOTAL_ERRORS + $?);
+done
+
+exit $TOTAL_ERRORS
+