diff --git a/.copyright.hook b/.copyright.hook
index dc1b096a0ad28db732b794fa856efed71917c5e8..09afff2072df3384a429d01d06188218ae6e85d1 100644
--- a/.copyright.hook
+++ b/.copyright.hook
@@ -9,7 +9,7 @@ import subprocess
import platform
COPYRIGHT = '''
- Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+ Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
diff --git a/cmake/external/boost.cmake b/cmake/external/boost.cmake
index 137f11da7f2f1c46eebf6590d93402786ef543c9..c70d83b3f4bb24740ed67b4e2f98a3ced26d1648 100644
--- a/cmake/external/boost.cmake
+++ b/cmake/external/boost.cmake
@@ -15,9 +15,9 @@
include(ExternalProject)
set(BOOST_PROJECT "extern_boost")
-set(BOOST_VER "1.66.0")
-set(BOOST_TAR "boost_1_66_0")
-set(BOOST_URL "https://dl.bintray.com/boostorg/release/${BOOST_VER}/source/${BOOST_TAR}.tar.gz")
+set(BOOST_VER "1.41.0")
+set(BOOST_TAR "boost_1_41_0")
+set(BOOST_URL "http://sourceforge.net/projects/boost/files/boost/${BOOST_VER}/${BOOST_TAR}.tar.gz")
set(BOOST_SOURCES_DIR ${THIRD_PARTY_PATH}/boost)
set(BOOST_DOWNLOAD_DIR "${BOOST_SOURCES_DIR}/src/${BOOST_PROJECT}")
set(BOOST_INCLUDE_DIR "${BOOST_DOWNLOAD_DIR}/${BOOST_TAR}" CACHE PATH "boost include directory." FORCE)
diff --git a/doc/api/v2/fluid/layers.rst b/doc/api/v2/fluid/layers.rst
index 25d28de0aad3b1788083c92c4adff8b9a86da9b1..231ec2d4ba102a5d31c47cbc7a5d484ef17a7f3a 100644
--- a/doc/api/v2/fluid/layers.rst
+++ b/doc/api/v2/fluid/layers.rst
@@ -18,6 +18,11 @@ dynamic_lstm
.. autofunction:: paddle.v2.fluid.layers.dynamic_lstm
:noindex:
+dynamic_lstmp
+-------------
+.. autofunction:: paddle.v2.fluid.layers.dynamic_lstmp
+ :noindex:
+
dynamic_gru
-----------
.. autofunction:: paddle.v2.fluid.layers.dynamic_gru
@@ -534,3 +539,8 @@ row_conv
--------
.. autofunction:: paddle.v2.fluid.layers.row_conv
:noindex:
+
+multiplex
+---------
+.. autofunction:: paddle.v2.fluid.layers.multiplex
+ :noindex:
diff --git a/doc/api/v2/fluid/nets.rst b/doc/api/v2/fluid/nets.rst
index f6b1cb4ba10659fb336899f08376c265c67290f1..500019bc507f859c4c91de5d322a82eb1e78e2de 100644
--- a/doc/api/v2/fluid/nets.rst
+++ b/doc/api/v2/fluid/nets.rst
@@ -26,8 +26,8 @@ glu
:noindex:
-dot_product_attention
----------------------
-.. autofunction:: paddle.v2.fluid.nets.dot_product_attention
+scaled_dot_product_attention
+----------------------------
+.. autofunction:: paddle.v2.fluid.nets.scaled_dot_product_attention
:noindex:
diff --git a/doc/design/dist_refactor/distributed_architecture.md b/doc/design/dist_refactor/distributed_architecture.md
index 3a741f95866fb6c301ca9097af7916281f2278cf..9368c5780dc922953f38bf0f86d9f797a4a8a6fe 100644
--- a/doc/design/dist_refactor/distributed_architecture.md
+++ b/doc/design/dist_refactor/distributed_architecture.md
@@ -152,12 +152,12 @@ for data in train_reader():
`JobDesc` object describe the distributed job resource specification to run on
Cluster environment.
-
+
`RemoteExecutor.run` sends the `ProgramDesc` and
[TrainingJob](https://github.com/PaddlePaddle/cloud/blob/develop/doc/autoscale/README.md#training-job-resource)
to a server in the cluster which executes `RemoteExecutor.listen`. This server is responsible
-to start the final Kubernetes Jobs to run the different role of `ProgramDesc`.
+to start the final Kubernetes Jobs to run the different roles of `ProgramDesc` from `ConfigMap`.
### Placement Algorithm
diff --git a/doc/design/dist_refactor/src/remote_executor.graffle b/doc/design/dist_refactor/src/remote_executor.graffle
index ce2c18fee5687732053c48af9c8c290a994a8090..41b2067311694b56d211a4f32d1b76884eeffd2d 100644
Binary files a/doc/design/dist_refactor/src/remote_executor.graffle and b/doc/design/dist_refactor/src/remote_executor.graffle differ
diff --git a/doc/design/dist_refactor/src/remote_executor.png b/doc/design/dist_refactor/src/remote_executor.png
index 6be4b1841b99efdb59557975485d0387f422308c..744e2fb2e0f1bbe058e991ba7b2a09000965ee79 100644
Binary files a/doc/design/dist_refactor/src/remote_executor.png and b/doc/design/dist_refactor/src/remote_executor.png differ
diff --git a/doc/design/support_new_device.md b/doc/design/support_new_device.md
index 4c5f10e2ecb9ec09b78926ca27552741d02d7cc9..8983df900460127fc130043c52373dab505363ba 100644
--- a/doc/design/support_new_device.md
+++ b/doc/design/support_new_device.md
@@ -2,9 +2,9 @@
## Background
-Deep learning has a high demand for computing resources. New high-performance devices and computing libraries are appearing very frequently. Deep learning frameworks have to integrate these high-performance devices and computing libraries flexibly and efficiently.
+Deep learning has a high demand for computing resources. New high-performance devices and computing libraries are appearing very frequently. Deep learning frameworks have to integrate these high-performance devices and computing libraries in a flexible and efficient manner.
-On one hand, hardware and computing libraries usually do not have a one-to-one correspondence. For example,Intel CPUs support Eigen and MKL computing libraries while Nvidia GPUs support Eigen and cuDNN computing libraries. We have to implement operator specific kernels for each computing library.
+On one hand, hardware and computing libraries usually do not have a one-to-one correspondence. For example, Intel CPUs support Eigen and MKL computing libraries while Nvidia GPUs support Eigen and cuDNN computing libraries. We have to implement operator specific kernels for each computing library.
On the other hand, users usually do not want to care about the low-level hardware and computing libraries when writing a neural network configuration. In Fluid, `Layer` is exposed in `Python`, and `Operator` is exposed in `C++`. Both `Layer` and `Operator` are hardware independent.
@@ -17,7 +17,7 @@ For a general overview of fluid, please refer to the [overview doc](https://gith
There are mainly three parts that we have to consider while integrating a new device/library:
-- Place and DeviceContext: indicates the device id and manages hardware resources
+- Place and DeviceContext: indicate the device id and manage hardware resources
- Memory and Tensor: malloc/free data on certain device
@@ -25,10 +25,10 @@ There are mainly three parts that we have to consider while integrating a new de
### Place and DeviceContext
-Please remind that device and computing library are not one-to-one corresponding. A device can have a lot of computing libraries and a computing library can also support several devices.
+Please note that device and computing library do not have a one-to-one correspondence. A device can support many computing libraries and a computing library can also support several devices.
#### Place
-Fluid uses class [Place](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h#L55) to represent the device memory where data is located. If we add another device, we have to add corresponding `DevicePlace`.
+Fluid uses class [Place](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h#L55) to represent the device memory where data is located. If we add another device, we have to add the corresponding `DevicePlace`.
```
| CPUPlace
@@ -144,7 +144,7 @@ class Tensor {
};
```
-`Placeholder` is used to delay memory allocation; that is, we can first define a tensor, using `Resize` to configure its shape, and then call `mutuable_data` to allocate the actual memory.
+`Placeholder` is used to delay memory allocation; that is, we can first define a tensor, using `Resize` to configure its shape, and then call `mutable_data` to allocate the actual memory.
```cpp
paddle::framework::Tensor t;
@@ -163,7 +163,7 @@ Fluid implements computing units based on different DeviceContexts. Some computi
Let's take [MaxOutFunctor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/math/maxouting.h#L27) as an example:
-The interface is defined in header file.
+The interface is defined in the header file.
```
template
@@ -174,7 +174,7 @@ class MaxOutFunctor {
};
```
-CPU implemention is in .cc file
+CPU implementation is in .cc file
```
template
@@ -188,7 +188,7 @@ class MaxOutFunctor {
};
```
-CUDA implemention is in .cu file
+CUDA implementation is in .cu file
```
template
@@ -203,9 +203,9 @@ class MaxOutFunctor {
```
-We get computing handle from a concrete DeviceContext, and make compution on tensors.
+We first obtain the computing handle from a concrete DeviceContext and then compute on tensors.
-The implemention of `OpKernel` is similar to math functors, the extra thing we need to do is to register the OpKernel in a global map.
+The implementation of `OpKernel` is similar to math functors; the extra thing we need to do is to register the OpKernel in a global map.
Fluid provides different register interfaces in op_registry.h
@@ -231,7 +231,7 @@ REGISTER_OP_CUDA_KERNEL(
## Advanced topics: How to switch between different Device/Library
-Generally, we will impelement OpKernel for all Device/Library of an Operator. We can easily train a Convolutional Neural Network in GPU. However, some OpKernel is not sutibale on a specific Device. For example, crf operator can only run on CPU, whereas most other operators can run at GPU. To achieve high performance in such circumstance, we have to switch between different Device/Library.
+Generally, we will implement an OpKernel for every Device/Library of an Operator. We can easily train a Convolutional Neural Network on GPU. However, some OpKernels are not suitable for a specific Device. For example, the crf operator can only run on CPU, whereas most other operators can run on GPU. To achieve high performance in such circumstances, we have to switch between different Device/Library.
For more details, please refer to following docs:
diff --git a/doc/getstarted/build_and_install/pip_install_cn.rst b/doc/getstarted/build_and_install/pip_install_cn.rst
index 0c741e936b46eda5e7165e4ee54b545b14a28a19..8e4165da6b8135d083766c650f1092158f9d01c2 100644
--- a/doc/getstarted/build_and_install/pip_install_cn.rst
+++ b/doc/getstarted/build_and_install/pip_install_cn.rst
@@ -39,6 +39,7 @@ PaddlePaddle可以使用常用的Python包管理工具
"cpu_avx_mkl", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_"
"cpu_avx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "暂无"
+ "cpu_noavx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "暂无"
"cuda7.5_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_"
"cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_"
"cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_"
diff --git a/doc/getstarted/build_and_install/pip_install_en.rst b/doc/getstarted/build_and_install/pip_install_en.rst
index 285ed09805b09790beaef014f6813c227aff33ac..c1e806c0fe5f03139c0dff985f9ae0856eaa2e98 100644
--- a/doc/getstarted/build_and_install/pip_install_en.rst
+++ b/doc/getstarted/build_and_install/pip_install_en.rst
@@ -42,6 +42,7 @@ If the links below shows up the login form, just click "Log in as guest" to star
"cpu_avx_mkl", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_"
"cpu_avx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "Not Available"
+ "cpu_noavx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "Not Available"
"cuda7.5_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_"
"cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_"
"cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_"
diff --git a/doc/howto/usage/cluster/fluid_cluster_train_en.md b/doc/howto/usage/cluster/fluid_cluster_train_en.md
index 11904a6f71bb6ce37417aeffb8e408ec65961b12..ae825d9a517c7e9005d4e32f8f34b3f6a79be0c9 100644
--- a/doc/howto/usage/cluster/fluid_cluster_train_en.md
+++ b/doc/howto/usage/cluster/fluid_cluster_train_en.md
@@ -16,6 +16,12 @@ PaddlePaddle must be installed on all nodes. If you have GPU cards on your nodes
PaddlePaddle build and installation guide can be found [here](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/index_en.html).
+In addition to the above, the `cmake` command should be run with the option `WITH_DISTRIBUTE` set to `ON`. A bare-minimum example `cmake` command would look as follows:
+
+``` bash
+cmake .. -DWITH_DOC=OFF -DWITH_GPU=OFF -DWITH_DISTRIBUTE=ON -DWITH_SWIG_PY=ON -DWITH_PYTHON=ON
+```
+
### Update the training script
#### Non-cluster training script
@@ -119,7 +125,14 @@ for pass_id in range(100):
### E2E demo
-Please find the complete demo from [here](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/tests/book_distribute/notest_dist_fit_a_line.py). In parameter server node run the following in the command line:
+Please find the complete demo [here](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/tests/book_distribute/notest_dist_fit_a_line.py).
+First `cd` into the folder that contains the `python` files. In this case:
+
+```bash
+cd /paddle/python/paddle/v2/fluid/tests/book_distribute
+```
+
+On the parameter server node, run the following in the command line:
``` bash
PSERVERS=192.168.1.2:6174 SERVER_ENDPOINT=192.168.1.2:6174 TRAINING_ROLE=PSERVER python notest_dist_fit_a_line.py
diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt
index 8d9260811a8c9274dcaade9b090bab727d1952ca..280496984251919a8b4b6c52684f950a80b78356 100644
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@@ -74,8 +74,10 @@ cc_library(backward SRCS backward.cc DEPS net_op)
cc_test(backward_test SRCS backward_test.cc DEPS backward recurrent_op device_context fill_constant_op)
cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor)
+cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glog)
+
cc_library(executor SRCS executor.cc DEPS op_registry device_context scope
-framework_proto backward glog lod_rank_table profiler)
+framework_proto backward glog lod_rank_table profiler feed_fetch_method)
cc_library(prune SRCS prune.cc DEPS framework_proto)
cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context)
diff --git a/paddle/framework/channel.h b/paddle/framework/channel.h
new file mode 100644
index 0000000000000000000000000000000000000000..9ba0fc5c558a85b41deb01ad57842d9c4c054e0e
--- /dev/null
+++ b/paddle/framework/channel.h
@@ -0,0 +1,87 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <condition_variable>
+#include <deque>
+#include <mutex>
+
+namespace paddle {
+namespace framework {
+
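+// Channel passes elements of type T between threads through a FIFO buffer.
+// A minimal usage sketch (illustrative only):
+//
+//   framework::Channel<int> ch(2);  // a capacity of 0 means an unbounded channel
+//   int v = 1;
+//   ch.Send(&v);      // blocks while a bounded channel is full
+//   ch.Receive();     // blocks until an element is available
+//   ch.Size();        // number of currently buffered elements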
+template <typename T>
+class Channel {
+ public:
+ explicit Channel(std::size_t capacity) : capacity_(capacity) {}
+
+ void Send(T* channel_element) {
+ std::unique_lock<std::mutex> lock(mu_);
+
+ if (IsBounded()) {
+ full_cond_var_.wait(lock, [this]() {
+ bool capacity_valid = capacity_ > 0 ? !IsCapacityFull() : true;
+ return capacity_valid;
+ });
+ }
+ channel_.push_back(std::move(*channel_element));
+
+ lock.unlock();
+ empty_cond_var_.notify_one();
+ }
+
+ T* Receive() {
+ std::unique_lock<std::mutex> lock(mu_);
+ empty_cond_var_.wait(lock, [this]() { return !channel_.empty(); });
+
+ T* channel_element = std::move(channel_.front());
+ channel_.pop_front();
+
+ NotifyAllSenders(&lock);
+ return channel_element;
+ }
+
+ size_t Size() {
+ std::unique_lock<std::mutex> lock(mu_);
+ return channel_.size();
+ }
+
+ void Clear() {
+ std::unique_lock<std::mutex> lock(mu_);
+ channel_.clear();
+
+ NotifyAllSenders(&lock);
+ }
+
+ private:
+ std::size_t capacity_;
+ std::mutex mu_;
+ std::condition_variable empty_cond_var_;
+ std::condition_variable full_cond_var_;
+ std::deque<T> channel_;
+
+ private:
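+ // NotifyAllSenders unlocks before notifying so that a woken sender does not
+ // immediately block again on the mutex still held by the caller; only a
+ // bounded channel has senders waiting on full_cond_var_.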
+ void NotifyAllSenders(std::unique_lock<std::mutex>* lock) {
+ if (IsBounded()) {
+ lock->unlock();
+ full_cond_var_.notify_one();
+ }
+ }
+
+ bool IsBounded() const { return capacity_ > 0; }
+
+ bool IsCapacityFull() const { return channel_.size() >= capacity_; }
+};
+
+} // namespace framework
+} // namespace paddle
diff --git a/paddle/framework/data_type.h b/paddle/framework/data_type.h
index 6a372ac32e48131eed28e2d42125feb5b92a11c7..98eb3e857d1943e71f1d41f24ecbedbe09e85b7b 100644
--- a/paddle/framework/data_type.h
+++ b/paddle/framework/data_type.h
@@ -79,5 +79,33 @@ inline void VisitDataType(proto::DataType type, Visitor visitor) {
}
}
+inline std::string DataTypeToString(const proto::DataType type) {
+ using namespace paddle::framework::proto;
+ switch (type) {
+ case DataType::FP16:
+ return "float16";
+ case DataType::FP32:
+ return "float32";
+ case DataType::FP64:
+ return "float64";
+ case DataType::INT16:
+ return "int16";
+ case DataType::INT32:
+ return "int32";
+ case DataType::INT64:
+ return "int64";
+ case DataType::BOOL:
+ return "bool";
+ default:
+ PADDLE_THROW("Not support type %d", type);
+ }
+}
+
+inline std::ostream& operator<<(std::ostream& out,
+ const proto::DataType& type) {
+ out << DataTypeToString(type);
+ return out;
+}
+
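+// Example (illustrative): DataTypeToString(proto::DataType::FP32) returns
+// "float32", and the operator<< above lets framework code stream a DataType
+// directly, e.g. VLOG(3) << type;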
} // namespace framework
} // namespace paddle
diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc
index c28ffefdd0872238299cdbb0653ee17cdad61699..cbf3ec75265fa74aaacffee684b7b7d5f73b7c02 100644
--- a/paddle/framework/executor.cc
+++ b/paddle/framework/executor.cc
@@ -17,6 +17,7 @@ limitations under the License. */
#include
#include "gflags/gflags.h"
+#include "paddle/framework/feed_fetch_method.h"
#include "paddle/framework/feed_fetch_type.h"
#include "paddle/framework/lod_rank_table.h"
#include "paddle/framework/lod_tensor_array.h"
@@ -149,5 +150,164 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
}
}
+// Check whether the block already has feed operators and feed_holder.
+// Return false if the block does not have any feed operators.
+// If some feed operators have been prepended to the block, check that
+// the info contained in these feed operators matches the feed_targets
+// and feed_holder_name. Raise exception when any mismatch is found.
+// Return true if the block has feed operators and holder of matching info.
+static bool has_feed_operators(
+ BlockDesc* block, std::map<std::string, const LoDTensor*>& feed_targets,
+ const std::string& feed_holder_name) {
+ size_t feed_count = 0;
+ for (auto* op : block->AllOps()) {
+ if (op->Type() == kFeedOpType) {
+ feed_count++;
+ PADDLE_ENFORCE_EQ(op->Input("X")[0], feed_holder_name,
+ "Input to feed op should be '%s'", feed_holder_name);
+ std::string feed_target_name = op->Output("Out")[0];
+ PADDLE_ENFORCE(
+ feed_targets.find(feed_target_name) != feed_targets.end(),
+ "Feed operator output name '%s' cannot be found in 'feed_targets'",
+ feed_target_name);
+ }
+ }
+
+ if (feed_count > 0) {
+ PADDLE_ENFORCE_EQ(
+ feed_count, feed_targets.size(),
+ "The number of feed operators should match 'feed_targets'");
+
+ // When feed operators are present, so should be the feed_holder
+ auto var = block->FindVar(feed_holder_name);
+ PADDLE_ENFORCE_NOT_NULL(var, "Block should already have a '%s' variable",
+ feed_holder_name);
+ PADDLE_ENFORCE_EQ(var->GetType(), proto::VarDesc::FEED_MINIBATCH,
+ "'%s' variable should be 'FEED_MINIBATCH' type",
+ feed_holder_name);
+ }
+
+ return feed_count > 0;
+}
+
+// Check whether the block already has fetch operators and fetch_holder.
+// Return false if the block does not have any fetch operators.
+// If some fetch operators have been appended to the block, check that
+// the info contained in these fetch operators matches the fetch_targets
+// and fetch_holder_name. Raise exception when any mismatch is found.
+// Return true if the block has fetch operators and holder of matching info.
+static bool has_fetch_operators(
+ BlockDesc* block, std::map<std::string, LoDTensor*>& fetch_targets,
+ const std::string& fetch_holder_name) {
+ size_t fetch_count = 0;
+ for (auto* op : block->AllOps()) {
+ if (op->Type() == kFetchOpType) {
+ fetch_count++;
+ PADDLE_ENFORCE_EQ(op->Output("Out")[0], fetch_holder_name,
+ "Output of fetch op should be '%s'", fetch_holder_name);
+ std::string fetch_target_name = op->Input("X")[0];
+ PADDLE_ENFORCE(
+ fetch_targets.find(fetch_target_name) != fetch_targets.end(),
+ "Fetch operator input name '%s' cannot be found in 'fetch_targets'",
+ fetch_target_name);
+ }
+ }
+
+ if (fetch_count > 0) {
+ PADDLE_ENFORCE_EQ(
+ fetch_count, fetch_targets.size(),
+ "The number of fetch operators should match 'fetch_targets'");
+
+ // When fetch operators are present, so should be the fetch_holder
+ auto var = block->FindVar(fetch_holder_name);
+ PADDLE_ENFORCE_NOT_NULL(var, "Block should already have a '%s' variable",
+ fetch_holder_name);
+ PADDLE_ENFORCE_EQ(var->GetType(), proto::VarDesc::FETCH_LIST,
+ "'%s' variable should be 'FETCH_LIST' type",
+ fetch_holder_name);
+ }
+
+ return fetch_count > 0;
+}
+
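+// Runs the program with explicit feed and fetch targets. When the block does
+// not already contain matching feed/fetch operators, they are added to a
+// temporary copy of the program, so the caller's ProgramDesc stays unchanged.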
+void Executor::Run(const ProgramDesc& program, Scope* scope,
+ std::map<std::string, const LoDTensor*>& feed_targets,
+ std::map<std::string, LoDTensor*>& fetch_targets,
+ const std::string& feed_holder_name,
+ const std::string& fetch_holder_name) {
+ auto* copy_program = new ProgramDesc(program);
+ auto* global_block = copy_program->MutableBlock(0);
+
+ if (!has_feed_operators(global_block, feed_targets, feed_holder_name)) {
+ // create feed_holder variable
+ auto* feed_holder = global_block->Var(feed_holder_name);
+ feed_holder->SetType(proto::VarDesc::FEED_MINIBATCH);
+ feed_holder->SetPersistable(true);
+
+ int i = 0;
+ for (auto& feed_target : feed_targets) {
+ std::string var_name = feed_target.first;
+ VLOG(3) << "feed target's name: " << var_name;
+
+ // prepend feed op
+ auto* op = global_block->PrependOp();
+ op->SetType(kFeedOpType);
+ op->SetInput("X", {feed_holder_name});
+ op->SetOutput("Out", {var_name});
+ op->SetAttr("col", {static_cast(i)});
+ op->CheckAttrs();
+
+ i++;
+ }
+ }
+
+ // map the data of feed_targets to feed_holder
+ for (auto* op : global_block->AllOps()) {
+ if (op->Type() == kFeedOpType) {
+ std::string feed_target_name = op->Output("Out")[0];
+ int idx = boost::get<int>(op->GetAttr("col"));
+ SetFeedVariable(scope, *feed_targets[feed_target_name], feed_holder_name,
+ idx);
+ }
+ }
+
+ if (!has_fetch_operators(global_block, fetch_targets, fetch_holder_name)) {
+ // create fetch_holder variable
+ auto* fetch_holder = global_block->Var(fetch_holder_name);
+ fetch_holder->SetType(proto::VarDesc::FETCH_LIST);
+ fetch_holder->SetPersistable(true);
+
+ int i = 0;
+ for (auto& fetch_target : fetch_targets) {
+ std::string var_name = fetch_target.first;
+ VLOG(3) << "fetch target's name: " << var_name;
+
+ // append fetch op
+ auto* op = global_block->AppendOp();
+ op->SetType(kFetchOpType);
+ op->SetInput("X", {var_name});
+ op->SetOutput("Out", {fetch_holder_name});
+ op->SetAttr("col", {static_cast(i)});
+ op->CheckAttrs();
+
+ i++;
+ }
+ }
+
+ Run(*copy_program, scope, 0, true, true);
+
+ // obtain the data of fetch_targets from fetch_holder
+ for (auto* op : global_block->AllOps()) {
+ if (op->Type() == kFetchOpType) {
+ std::string fetch_target_name = op->Input("X")[0];
+ int idx = boost::get<int>(op->GetAttr("col"));
+ *fetch_targets[fetch_target_name] =
+ GetFetchVariable(*scope, fetch_holder_name, idx);
+ }
+ }
+
+ delete copy_program;
+}
+
} // namespace framework
} // namespace paddle
diff --git a/paddle/framework/executor.h b/paddle/framework/executor.h
index d869e18901b82959a40cc296aa0844c20ea63ac1..035ff48a52bd2fc4b1a46b48b1fbf1fbcb2ac70b 100644
--- a/paddle/framework/executor.h
+++ b/paddle/framework/executor.h
@@ -41,6 +41,12 @@ class Executor {
void Run(const ProgramDesc&, Scope*, int, bool create_local_scope = true,
bool create_vars = true);
+ void Run(const ProgramDesc& program, Scope* scope,
+ std::map<std::string, const LoDTensor*>& feed_targets,
+ std::map<std::string, LoDTensor*>& fetch_targets,
+ const std::string& feed_holder_name = "feed",
+ const std::string& fetch_holder_name = "fetch");
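+ // Illustrative usage of the overload above (variable names are hypothetical):
+ //   std::map<std::string, const LoDTensor*> feeds{{"x", &x_tensor}};
+ //   std::map<std::string, LoDTensor*> fetches{{"y", &y_tensor}};
+ //   executor.Run(program, &scope, feeds, fetches);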
+
private:
const platform::Place place_;
};
diff --git a/paddle/framework/feed_fetch_method.cc b/paddle/framework/feed_fetch_method.cc
new file mode 100644
index 0000000000000000000000000000000000000000..21201b675519e34b11e9f1f3a6f2a135c06d63a7
--- /dev/null
+++ b/paddle/framework/feed_fetch_method.cc
@@ -0,0 +1,56 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/framework/feed_fetch_method.h"
+#include "glog/logging.h"
+#include "paddle/framework/variable.h"
+
+namespace paddle {
+namespace framework {
+
+void SetFeedVariable(Scope* scope, const LoDTensor& input,
+ const std::string& var_name, size_t index) {
+ // If var_name Variable is not found in GlobalScope, a new variable will
+ // be created.
+ VLOG(3) << "SetFeedVariable name=" << var_name << " index=" << index;
+ Variable* g_feed_value = scope->Var(var_name);
+ auto& feed_inputs =
+ *(g_feed_value->GetMutable<std::vector<LoDTensor>>());
+ if (index >= feed_inputs.size()) {
+ feed_inputs.resize(index + 1);
+ }
+ // shared data with input tensor
+ feed_inputs[index].ShareDataWith(input);
+ // set lod
+ feed_inputs[index].set_lod(input.lod());
+}
+
+LoDTensor& GetFetchVariable(const Scope& scope, const std::string& var_name,
+ size_t index) {
+ // Since we want to fetch LoDTensor from a variable, the variable must
+ // be created already.
+ Variable* g_fetch_value = scope.FindVar(var_name);
+ PADDLE_ENFORCE(g_fetch_value->IsType<FeedFetchList>(),
+ "Only %s can be invoked by GetFetchVariable",
+ typeid(FeedFetchList).name());
+ auto& fetch_outputs = *g_fetch_value->GetMutable<FeedFetchList>();
+ auto& tensor = fetch_outputs[index];
+ VLOG(3) << "Fetch " << var_name << " with index " << index
+ << " shape= " << tensor.dims();
+ PADDLE_ENFORCE_LT(index, fetch_outputs.size());
+ return tensor;
+}
+
+} // namespace framework
+} // namespace paddle
diff --git a/paddle/framework/feed_fetch_method.h b/paddle/framework/feed_fetch_method.h
index 7feacb1e24708411e7fbb610f9909447cba9e291..b71945fcc8834d2e5fe21151e1e88788b4acd5c1 100644
--- a/paddle/framework/feed_fetch_method.h
+++ b/paddle/framework/feed_fetch_method.h
@@ -13,46 +13,18 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
-#include "glog/logging.h"
+
#include "paddle/framework/feed_fetch_type.h"
#include "paddle/framework/scope.h"
-#include "paddle/framework/variable.h"
namespace paddle {
namespace framework {
void SetFeedVariable(Scope* scope, const LoDTensor& input,
- const std::string& var_name, size_t index) {
- // If var_name Variable is not found in GlobalScope, a new variable will
- // be created.
- VLOG(3) << "SetFeedVariable name=" << var_name << " index=" << index;
- Variable* g_feed_value = scope->Var(var_name);
- auto& feed_inputs =
- *(g_feed_value->GetMutable<std::vector<LoDTensor>>());
- if (index >= feed_inputs.size()) {
- feed_inputs.resize(index + 1);
- }
- // shared data with input tensor
- feed_inputs[index].ShareDataWith(input);
- // set lod
- feed_inputs[index].set_lod(input.lod());
-}
+ const std::string& var_name, size_t index);
LoDTensor& GetFetchVariable(const Scope& scope, const std::string& var_name,
- size_t index) {
- // Since we want to fetch LodTensor from a variable, the variable must
- // be created alreadly.
- Variable* g_fetch_value = scope.FindVar(var_name);
- PADDLE_ENFORCE(g_fetch_value->IsType<FeedFetchList>(),
- "Only %s can be invoked by GetFetchVariable",
- typeid(FeedFetchList).name());
- auto& fetch_outputs = *g_fetch_value->GetMutable<FeedFetchList>();
- auto& tensor = fetch_outputs[index];
- VLOG(3) << "Fetch " << var_name << " with index " << index
- << " shape= " << tensor.dims();
- PADDLE_ENFORCE_LT(index, fetch_outputs.size());
- return tensor;
-}
+ size_t index);
} // namespace framework
} // namespace paddle
diff --git a/paddle/framework/op_kernel_type_test.cc b/paddle/framework/op_kernel_type_test.cc
index 649afeee8a846b0579545f2edff77e9dbe3b4dd8..cb23bbde01493d1a3b5845e77d6160a75f409c7a 100644
--- a/paddle/framework/op_kernel_type_test.cc
+++ b/paddle/framework/op_kernel_type_test.cc
@@ -26,9 +26,9 @@ TEST(OpKernelType, ToString) {
OpKernelType op_kernel_type(DataType::FP32, CPUPlace(), DataLayout::kNCHW,
LibraryType::kCUDNN);
- ASSERT_EQ(
- paddle::framework::KernelTypeToString(op_kernel_type),
- "data_type[5]:data_layout[NCHW]:place[CPUPlace]:library_type[CUDNN]");
+ ASSERT_EQ(paddle::framework::KernelTypeToString(op_kernel_type),
+ "data_type[float32]:data_layout[NCHW]:place[CPUPlace]:library_type["
+ "CUDNN]");
}
TEST(OpKernelType, Hash) {
diff --git a/paddle/framework/threadpool.cc b/paddle/framework/threadpool.cc
index 109a7e7dc440d91e8223f2c0924f489f54a06f64..b2f5ae4a96593fde1623dd10d3b63c984ae228db 100644
--- a/paddle/framework/threadpool.cc
+++ b/paddle/framework/threadpool.cc
@@ -1,24 +1,93 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
- http://www.apache.org/licenses/LICENSE-2.0
+ http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
#include "paddle/framework/threadpool.h"
namespace paddle {
namespace framework {
-std::unique_ptr<ThreadPool> ThreadPool::threadpool(nullptr);
-std::once_flag ThreadPool::init_flag;
+std::unique_ptr<ThreadPool> ThreadPool::threadpool_(nullptr);
+std::once_flag ThreadPool::init_flag_;
+
+ThreadPool* ThreadPool::GetInstance() {
+ std::call_once(init_flag_, &ThreadPool::Init);
+ return threadpool_.get();
+}
+
+void ThreadPool::Init() {
+ if (threadpool_.get() == nullptr) {
+ // TODO(Yancey1989): specify the max threads number
+ int num_threads = std::thread::hardware_concurrency();
+ PADDLE_ENFORCE_GT(num_threads, 0);
+ threadpool_.reset(new ThreadPool(num_threads));
+ }
+}
+
+ThreadPool::ThreadPool(int num_threads)
+ : total_threads_(num_threads), idle_threads_(num_threads), running_(true) {
+ threads_.resize(num_threads);
+ for (auto& thread : threads_) {
+ // TODO(Yancey1989): bind the thread to the specified CPU number
+ thread.reset(new std::thread(std::bind(&ThreadPool::TaskLoop, this)));
+ }
+}
+
+ThreadPool::~ThreadPool() {
+ {
+ // notify all threads to stop running
+ running_ = false;
+ scheduled_.notify_all();
+ }
+
+ for (auto& t : threads_) {
+ t->join();
+ t.reset(nullptr);
+ }
+}
+
+void ThreadPool::Wait() {
+ std::unique_lock<std::mutex> lock(mutex_);
+ completed_.wait(lock, [=] { return Done() == true; });
+}
+
+void ThreadPool::TaskLoop() {
+ while (running_) {
+ std::unique_lock<std::mutex> lock(mutex_);
+ scheduled_.wait(lock, [=] { return !tasks_.empty() || !running_; });
+
+ if (!running_) {
+ break;
+ }
+ // pop a task from the task queue
+ auto task = std::move(tasks_.front());
+ tasks_.pop();
+
+ --idle_threads_;
+ lock.unlock();
+
+ // run the task
+ task();
+
+ {
+ std::unique_lock<std::mutex> lock(mutex_);
+ ++idle_threads_;
+ if (Done()) {
+ completed_.notify_all();
+ }
+ }
+ }
+}
} // namespace framework
} // namespace paddle
diff --git a/paddle/framework/threadpool.h b/paddle/framework/threadpool.h
index 3ac345851c38557f82698786dd3bc8e1202a4256..8912b1a43a26f9df662d3b5ddf68bfb2b87f4a20 100644
--- a/paddle/framework/threadpool.h
+++ b/paddle/framework/threadpool.h
@@ -20,52 +20,36 @@ limitations under the License. */
#include
#include
#include
+#include
#include "paddle/platform/enforce.h"
namespace paddle {
namespace framework {
+// ThreadPool maintains a queue of tasks, and runs them using a fixed
+// number of threads.
class ThreadPool {
public:
typedef std::packaged_task<void()> Task;
- /**
- * @brief Get a instance of threadpool, the thread number will
- * be specified as the number of hardware thread contexts
- */
- static ThreadPool* GetInstance() {
- std::call_once(init_flag, &ThreadPool::Init);
- return threadpool.get();
- }
+ // Returns the singleton of ThreadPool.
+ static ThreadPool* GetInstance();
- ~ThreadPool() {
- {
- // notify all threads to stop running
- running_ = false;
- scheduled_.notify_all();
- }
-
- for (auto& t : threads_) {
- t->join();
- t.reset(nullptr);
- }
- }
+ ~ThreadPool();
- int GetNumThreads() const { return num_threads_; }
+ // Returns the number of threads created by the constructor.
+ size_t Threads() const { return total_threads_; }
- int GetAvailable() {
+ // Returns the number of currently idle threads.
+ size_t IdleThreads() {
std::unique_lock<std::mutex> lock(mutex_);
- return available_;
+ return idle_threads_;
}
- /**
- * @brief Push a function to the queue, and will be scheduled and
- * executed if a thread is available.
- * @param[in] Task, will be pushed to the task queue.
- * @return std::future, we could wait for the task finished by
- * f.wait().
- */
+ // Run pushes a function to the task queue and returns a std::future
+ // object. To wait for the completion of the task, call
+ // std::future::wait().
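+ // Illustrative usage:
+ //   std::future<void> f = ThreadPool::GetInstance()->Run([] { /* work */ });
+ //   f.wait();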
template <typename Callback>
std::future<void> Run(Callback fn) {
std::unique_lock<std::mutex> lock(mutex_);
@@ -77,84 +61,40 @@ class ThreadPool {
return f;
}
- /**
- * @brief Wait until all the tasks are completed.
- */
- void Wait() {
- std::unique_lock<std::mutex> lock(mutex_);
- completed_.wait(lock, [=] { return Done() == true; });
- }
+ // Wait until all the tasks are completed.
+ void Wait();
private:
DISABLE_COPY_AND_ASSIGN(ThreadPool);
- explicit ThreadPool(int num_threads)
- : num_threads_(num_threads), available_(num_threads), running_(true) {
- threads_.resize(num_threads);
- for (auto& thread : threads_) {
- // TODO(Yancey1989): binding the thread on the specify CPU number
- thread.reset(new std::thread(std::bind(&ThreadPool::TaskLoop, this)));
- }
- }
+ explicit ThreadPool(int num_threads);
- /**
- * @brief If the task queue is empty and avaialbe
- * is equal to the number of threads, means that
- * all tasks are completed.
- *
- * Note: this function is not thread-safe.
- *
- * @return true if all tasks are completed.
- */
- bool Done() { return tasks_.empty() && available_ == num_threads_; }
-
- void TaskLoop() {
- while (running_) {
- std::unique_lock<std::mutex> lock(mutex_);
- scheduled_.wait(lock, [=] { return !tasks_.empty() || !running_; });
-
- if (!running_) {
- break;
- }
- // pop a task from the task queue
- auto task = std::move(tasks_.front());
- tasks_.pop();
-
- --available_;
- lock.unlock();
-
- // run the task
- task();
-
- {
- std::unique_lock<std::mutex> lock(mutex_);
- ++available_;
- if (Done()) {
- completed_.notify_all();
- }
- }
- }
- }
+ // If the task queue is empty and the number of idle threads equals the
+ // total number of threads, all tasks are completed. Note: this function
+ // is not thread-safe. Returns true if all tasks are completed.
+ // Note: do not remove the data member total_threads_ in favor of
+ // threads_.size(); accessing threads_ would require locking the mutex
+ // first.
+ bool Done() { return tasks_.empty() && idle_threads_ == total_threads_; }
- static void Init() {
- if (threadpool.get() == nullptr) {
- // TODO(Yancey1989): specify the max threads number
- int num_threads = std::thread::hardware_concurrency();
- PADDLE_ENFORCE_GT(num_threads, 0);
- threadpool.reset(new ThreadPool(num_threads));
- }
- }
+ // The constructor starts threads to run TaskLoop, which retrieves
+ // and runs tasks from the queue.
+ void TaskLoop();
+
+ // Init is called by GetInstance.
+ static void Init();
private:
- static std::unique_ptr<ThreadPool> threadpool;
- static std::once_flag init_flag;
+ static std::unique_ptr<ThreadPool> threadpool_;
+ static std::once_flag init_flag_;
- int num_threads_;
- int available_;
- bool running_;
- std::queue<Task> tasks_;
std::vector<std::unique_ptr<std::thread>> threads_;
+ const size_t total_threads_;
+ size_t idle_threads_;
+
+ std::queue<Task> tasks_;
std::mutex mutex_;
+ bool running_;
std::condition_variable scheduled_;
std::condition_variable completed_;
};
diff --git a/paddle/framework/threadpool_test.cc b/paddle/framework/threadpool_test.cc
index 50b6238cd8786be9d8cf2d5f821daadea12bd208..3fbfe7efc867144dbd0dd2613c824c6a3c41b7d8 100644
--- a/paddle/framework/threadpool_test.cc
+++ b/paddle/framework/threadpool_test.cc
@@ -22,11 +22,7 @@ namespace framework = paddle::framework;
void do_sum(framework::ThreadPool* pool, std::atomic<int>& sum, int cnt) {
std::vector<std::future<void>> fs;
for (int i = 0; i < cnt; ++i) {
- auto f = pool->Run([&sum]() { sum.fetch_add(1); });
- fs.push_back(std::move(f));
- }
- for (auto& f : fs) {
- f.wait();
+ fs.push_back(framework::Async([&sum]() { sum.fetch_add(1); }));
}
}
diff --git a/paddle/inference/inference.cc b/paddle/inference/inference.cc
index 49001778808173b82865a4b6632a6b175ef96242..b43c359ed1787143403336e8c1cb4c7f85b1d7a2 100644
--- a/paddle/inference/inference.cc
+++ b/paddle/inference/inference.cc
@@ -15,18 +15,13 @@ limitations under the License. */
#include "inference.h"
#include <fstream>
#include "paddle/framework/executor.h"
-#include "paddle/framework/feed_fetch_method.h"
#include "paddle/framework/init.h"
#include "paddle/framework/scope.h"
-#ifdef PADDLE_USE_PTOOLS
-#include "chooseser.h"
-#endif
-
namespace paddle {
void InferenceEngine::LoadInferenceModel(const std::string& dirname) {
- std::string model_filename = dirname + "/__model__.dat";
+ std::string model_filename = dirname + "/__model__";
LOG(INFO) << "loading model from " << model_filename;
std::ifstream inputfs(model_filename, std::ios::in | std::ios::binary);
std::string program_desc_str;
@@ -52,39 +47,15 @@ void InferenceEngine::LoadInferenceModel(const std::string& dirname) {
}
}
-void InferenceEngine::LoadInferenceModel(
- const std::string& dirname,
- const std::vector<std::string>& feed_var_names,
- const std::vector<std::string>& fetch_var_names) {
- std::string model_filename = dirname + "/__model__.dat";
- LOG(INFO) << "loading model from " << model_filename;
- std::ifstream inputfs(model_filename, std::ios::in | std::ios::binary);
- std::string program_desc_str;
- inputfs.seekg(0, std::ios::end);
- program_desc_str.resize(inputfs.tellg());
- inputfs.seekg(0, std::ios::beg);
- LOG(INFO) << "program_desc_str's size: " << program_desc_str.size();
- inputfs.read(&program_desc_str[0], program_desc_str.size());
- inputfs.close();
-
- program_ = new framework::ProgramDesc(program_desc_str);
- GenerateLoadProgram(dirname);
-
- if (feed_var_names.empty() || fetch_var_names.empty()) {
- LOG(FATAL) << "Please specify the feed_var_names and fetch_var_names.";
- }
- feed_var_names_ = feed_var_names;
- fetch_var_names_ = fetch_var_names;
- PrependFeedOp();
- AppendFetchOp();
-}
-
bool InferenceEngine::IsParameter(const framework::VarDesc* var) {
- if (var->Persistable() && var->Name() != "feed" && var->Name() != "fetch") {
+ if (var->Persistable()) {
// There are many unreachable variables in the program
for (size_t i = 0; i < program_->Size(); ++i) {
const framework::BlockDesc& block = program_->Block(i);
for (auto* op : block.AllOps()) {
+ if (op->Type() == "feed") {
+ continue;
+ }
for (auto input_argument_name : op->InputArgumentNames()) {
if (input_argument_name == var->Name()) {
return true;
@@ -182,7 +153,7 @@ void InferenceEngine::Execute(const std::vector& feeds,
LOG(FATAL) << "Please initialize the program_ and load_program_ first.";
}
- if (feeds.size() < feed_var_names_.size()) {
+ if (feeds.size() != feed_var_names_.size()) {
LOG(FATAL) << "Please feed " << feed_var_names_.size() << " input Tensors.";
}
@@ -193,19 +164,22 @@ void InferenceEngine::Execute(const std::vector& feeds,
executor->Run(*load_program_, scope, 0, true, true);
+ std::map<std::string, const framework::LoDTensor*> feed_targets;
+ std::map<std::string, framework::LoDTensor*> fetch_targets;
+
// set_feed_variable
for (size_t i = 0; i < feed_var_names_.size(); ++i) {
- framework::SetFeedVariable(scope, feeds[i], "feed", i);
+ feed_targets[feed_var_names_[i]] = &feeds[i];
}
- executor->Run(*program_, scope, 0, true, true);
-
// get_fetch_variable
fetchs.resize(fetch_var_names_.size());
for (size_t i = 0; i < fetch_var_names_.size(); ++i) {
- fetchs[i] = framework::GetFetchVariable(*scope, "fetch", i);
+ fetch_targets[fetch_var_names_[i]] = &fetchs[i];
}
+ executor->Run(*program_, scope, feed_targets, fetch_targets);
+
delete place;
delete scope;
delete executor;
diff --git a/paddle/inference/inference.h b/paddle/inference/inference.h
index 7fc09cb9e539a65a8cd3cceb1543bc7d111c22b3..26f259824b945e260b370ced9d065842264075d5 100644
--- a/paddle/inference/inference.h
+++ b/paddle/inference/inference.h
@@ -29,9 +29,6 @@ public:
}
void LoadInferenceModel(const std::string& dirname);
- void LoadInferenceModel(const std::string& dirname,
- const std::vector<std::string>& feed_var_names,
- const std::vector<std::string>& fetch_var_names);
void Execute(const std::vector<framework::LoDTensor>& feeds,
std::vector<framework::LoDTensor>& fetchs);
diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt
index 15f7cb6b560590f55e276fde4900d2e3c0045fb8..48cf5816cce4bb5ee8e66e72c5b1acea8535ab10 100644
--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
@@ -147,6 +147,7 @@ op_library(max_sequence_len_op DEPS lod_rank_table)
op_library(sequence_conv_op DEPS context_project)
op_library(sequence_pool_op DEPS sequence_pooling)
op_library(lstm_op DEPS sequence2batch lstm_compute)
+op_library(lstmp_op DEPS sequence2batch lstm_compute)
op_library(gru_op DEPS sequence2batch gru_compute)
op_library(recurrent_op DEPS executor)
op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale math_function)
diff --git a/paddle/operators/activation_op.h b/paddle/operators/activation_op.h
index 88c3d1c597a853abdee7753a5110be4a1726e905..c0809abc05104c1e8c1f42331c0530724dd1472f 100644
--- a/paddle/operators/activation_op.h
+++ b/paddle/operators/activation_op.h
@@ -323,7 +323,7 @@ template
struct FloorFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) const {
- out.device(d) = x.ceil();
+ out.device(d) = x.floor();
}
};
diff --git a/paddle/operators/detail/grpc_client.cc b/paddle/operators/detail/grpc_client.cc
index 1e41587c418fb0ce4e452d5c6735c54e2d42f798..c43394554263c761d010494f692a4050cae7e8cb 100644
--- a/paddle/operators/detail/grpc_client.cc
+++ b/paddle/operators/detail/grpc_client.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "grpc_client.h"
+#include "paddle/framework/threadpool.h"
namespace paddle {
namespace operators {
namespace detail {
@@ -22,25 +23,32 @@ bool RPCClient::AsyncSendVariable(const std::string& ep,
const framework::Scope& scope,
const std::string& var_name,
int64_t time_out) {
- sendrecv::VariableMessage req;
- auto* var = scope.FindVar(var_name);
- SerializeToMessage(var_name, var, ctx, &req);
-
- // varhandle
- VarHandle var_h;
- var_h.ep = ep;
- var_h.scope = &scope;
- var_h.name = var_name;
- var_h.ctx = &ctx;
-
- // stub context
- auto ch = GetChannel(ep);
- SendProcessor* s = new SendProcessor(ch);
- s->Prepare(var_h, time_out);
- s->response_call_back_ = NULL;
-
- auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_);
- rpc->Finish(&s->reply_, &s->status_, (void*)s);
+ const platform::DeviceContext* p_ctx = &ctx;
+ const std::string ep_val = ep;
+ const std::string var_name_val = var_name;
+ const framework::Scope* p_scope = &scope;
+ const auto ch = GetChannel(ep_val);
+
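+ // The endpoint and variable name are captured by value below so that the
+ // asynchronous task does not depend on the lifetime of the caller's
+ // arguments; serialization and the gRPC call then run on a pool thread.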
+ framework::Async([var_name_val, p_ctx, ep_val, p_scope, time_out, ch, this] {
+ auto* var = p_scope->FindVar(var_name_val);
+ sendrecv::VariableMessage req;
+ SerializeToMessage(var_name_val, var, *p_ctx, &req);
+
+ // varhandle
+ VarHandle var_h;
+ var_h.ep = ep_val;
+ var_h.scope = p_scope;
+ var_h.name = var_name_val;
+ var_h.ctx = p_ctx;
+
+ // stub context
+ SendProcessor* s = new SendProcessor(ch);
+ s->Prepare(var_h, time_out);
+ s->response_call_back_ = NULL;
+
+ auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_);
+ rpc->Finish(&s->reply_, &s->status_, (void*)s);
+ });
req_count_++;
@@ -50,8 +58,6 @@ bool RPCClient::AsyncSendVariable(const std::string& ep,
void ProcGetResponse(const VarHandle& var_h,
const sendrecv::VariableMessage& ret_msg) {
auto* outvar = var_h.scope->FindVar(var_h.name);
-
- std::istringstream iss(ret_msg.serialized());
DeserializeFromMessage(ret_msg, *var_h.ctx, outvar);
}
@@ -60,24 +66,31 @@ bool RPCClient::AsyncGetVariable(const std::string& ep,
const framework::Scope& scope,
const std::string& var_name,
int64_t time_out) {
- sendrecv::VariableMessage req;
- req.set_varname(var_name);
-
- // varhandle
- VarHandle var_h;
- var_h.ep = ep;
- var_h.scope = &scope;
- var_h.name = var_name;
- var_h.ctx = &ctx;
-
- // stub context
- auto ch = GetChannel(ep);
- GetProcessor* s = new GetProcessor(ch);
- s->Prepare(var_h, time_out);
- s->response_call_back_ = ProcGetResponse;
-
- auto rpc = s->stub_->AsyncGetVariable(s->context_.get(), req, &cq_);
- rpc->Finish(&s->reply_, &s->status_, (void*)s);
+ const platform::DeviceContext* p_ctx = &ctx;
+ const std::string ep_val = ep;
+ const std::string var_name_val = var_name;
+ const framework::Scope* p_scope = &scope;
+ const auto ch = GetChannel(ep_val);
+
+ framework::Async([var_name_val, ep_val, p_scope, p_ctx, time_out, ch, this] {
+ sendrecv::VariableMessage req;
+ req.set_varname(var_name_val);
+
+ // varhandle
+ VarHandle var_h;
+ var_h.ep = ep_val;
+ var_h.scope = p_scope;
+ var_h.name = var_name_val;
+ var_h.ctx = p_ctx;
+
+ // stub context
+ GetProcessor* s = new GetProcessor(ch);
+ s->Prepare(var_h, time_out);
+ s->response_call_back_ = ProcGetResponse;
+
+ auto rpc = s->stub_->AsyncGetVariable(s->context_.get(), req, &cq_);
+ rpc->Finish(&s->reply_, &s->status_, (void*)s);
+ });
req_count_++;
@@ -85,19 +98,31 @@ bool RPCClient::AsyncGetVariable(const std::string& ep,
}
bool RPCClient::Wait() {
- bool ok = true;
+ if (req_count_ <= 0) {
+ return true;
+ }
+ const size_t kReqCnt = req_count_;
+ bool a[kReqCnt];
+ std::vector<std::future<void>> waits(req_count_);
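+ // One asynchronous task per outstanding request: each task drains a single
+ // completion from the queue via Proceed() and records whether it succeeded.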
- while (true) {
- if (req_count_ <= 0) {
- break;
- }
+ for (int i = 0; i < req_count_; i++) {
+ waits[i] = framework::Async([i, &a, this] { a[i] = Proceed(); });
+ }
+
+ for (int i = 0; i < req_count_; i++) {
+ waits[i].wait();
+ }
- if (!Proceed()) {
+ int last_req_count = req_count_;
+ req_count_ = 0;
+
+ for (int i = 0; i < last_req_count; i++) {
+ if (!a[i]) {
return false;
}
}
- return ok;
+ return true;
}
bool RPCClient::Proceed() {
@@ -124,7 +149,6 @@ bool RPCClient::Proceed() {
c->Process();
delete c;
- req_count_--;
return true;
}
diff --git a/paddle/operators/gru_op.cc b/paddle/operators/gru_op.cc
index 76f2adefede3b4bc4035f86f8f8663eed29343ae..fb901b639492a179925ff852f9030fc6674d1f63 100644
--- a/paddle/operators/gru_op.cc
+++ b/paddle/operators/gru_op.cc
@@ -135,14 +135,14 @@ class GRUOpMaker : public framework::OpProtoAndCheckerMaker {
AddComment(R"DOC(
GRU Operator implements part calculations of the complete GRU as following:
-\f[
-update \ gate: u_t = actGate(xu_t + W_u * h_{t-1} + b_u) \\
-reset \ gate: r_t = actGate(xr_t + W_r * h_{t-1} + b_r) \\
-output \ candidate: {h}_t = actNode(xc_t + W_c * dot(r_t, h_{t-1}) + b_c) \\
+$$
+update\_gate: u_t = actGate(xu_t + W_u * h_{t-1} + b_u) \\
+reset\_gate: r_t = actGate(xr_t + W_r * h_{t-1} + b_r) \\
+output\_candidate: {h}_t = actNode(xc_t + W_c * dot(r_t, h_{t-1}) + b_c) \\
output: h_t = dot((1 - u_t), h_{t-1}) + dot(u_t, {h}_t)
-\f]
+$$
-@note To implement the complete GRU, fully-connected operator must be used
+@note To implement the complete GRU, fully-connected operator must be used
before to feed xu, xr and xc as the Input of GRU operator.
)DOC");
}
diff --git a/paddle/operators/lstmp_op.cc b/paddle/operators/lstmp_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c96b30ba353fabc48630258ea8f88f741b8c415e
--- /dev/null
+++ b/paddle/operators/lstmp_op.cc
@@ -0,0 +1,331 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/lstmp_op.h"
+
+namespace paddle {
+namespace operators {
+
+class LSTMPOp : public framework::OperatorWithKernel {
+ public:
+ using framework::OperatorWithKernel::OperatorWithKernel;
+
+ void InferShape(framework::InferShapeContext* ctx) const override {
+ PADDLE_ENFORCE(ctx->HasInput("Input"),
+ "Input(Input) of LSTMP operator should not be null.");
+ PADDLE_ENFORCE(ctx->HasInput("Weight"),
+ "Input(Weight) of LSTMP operator should not be null.");
+ PADDLE_ENFORCE(ctx->HasInput("ProjWeight"),
+ "Input(ProjWeight) of LSTMP operator should not be null.");
+ PADDLE_ENFORCE(ctx->HasInput("Bias"),
+ "Input(Bias) of LSTMP operator should not be null.");
+
+ PADDLE_ENFORCE(ctx->HasOutput("Projection"),
+ "Output(Projection) of LSTMP operator should not be null.");
+ PADDLE_ENFORCE(ctx->HasOutput("Cell"),
+ "Output(Cell) of LSTMP operator should not be null.");
+ PADDLE_ENFORCE(ctx->HasOutput("BatchGate"),
+ "Output(BatchGate) of LSTMP operator should not be null.");
+ PADDLE_ENFORCE(ctx->HasOutput("BatchCellPreAct"),
+ "Output(BatchCellPreAct) of LSTMP operator should not be "
+ "null.");
+ PADDLE_ENFORCE(ctx->HasOutput("BatchHidden"),
+ "Output(BatchHidden) of LSTMP operator should not be null.");
+
+ auto in_dims = ctx->GetInputDim("Input");
+ PADDLE_ENFORCE_EQ(in_dims.size(), 2,
+ "Input(X)'s rank of LSTMP operator must be 2.");
+
+ int frame_size = in_dims[1] / 4;
+ auto w_dims = ctx->GetInputDim("Weight");
+ auto proj_dims = ctx->GetInputDim("ProjWeight");
+ PADDLE_ENFORCE_EQ(w_dims.size(), 2,
+ "The rank of Input(Weight) should be 2.");
+ PADDLE_ENFORCE_EQ(w_dims[0], proj_dims[1],
+ "The first dimension of Input(Weight) "
+ "should be %d.",
+ proj_dims[1]);
+ PADDLE_ENFORCE_EQ(w_dims[1], 4 * frame_size,
+ "The second dimension of Input(Weight) "
+ "should be 4 * %d.",
+ frame_size);
+
+ PADDLE_ENFORCE_EQ(proj_dims.size(), 2,
+ "The rank of Input(ProjWeight) should be 2.");
+ PADDLE_ENFORCE_EQ(proj_dims[0], frame_size,
+ "The first dimension of Input(ProjWeight) "
+ "should be %d.",
+ frame_size);
+
+ if (ctx->HasInput("H0")) {
+ PADDLE_ENFORCE(ctx->HasInput("C0"),
+ "Input(C0) of LSTMP operator should not be null after "
+ "Input(H0) provided.");
+ auto h_dims = ctx->GetInputDim("H0");
+ auto c_dims = ctx->GetInputDim("C0");
+ PADDLE_ENFORCE(h_dims == c_dims,
+ "The dimension of Input(H0) and Input(C0) "
+ "should be the same.");
+ ctx->SetOutputDim("OrderedP0", {h_dims[0], proj_dims[1]});
+ }
+
+ auto b_dims = ctx->GetInputDim("Bias");
+ PADDLE_ENFORCE_EQ(b_dims.size(), 2, "The rank of Input(Bias) should be 2.");
+ PADDLE_ENFORCE_EQ(b_dims[0], 1,
+ "The first dimension of Input(Bias) should be 1.");
+
+ if (ctx->Attrs().Get<bool>("use_peepholes")) {
+ PADDLE_ENFORCE_EQ(b_dims[1], 7 * frame_size,
+ "The second dimension of Input(Bias) should be "
+ "7 * %d if enable peepholes connection",
+ frame_size);
+ } else {
+ PADDLE_ENFORCE_EQ(b_dims[1], 4 * frame_size,
+ "The second dimension of Input(Bias) should be "
+ "4 * %d if disable peepholes connection",
+ frame_size);
+ }
+
+ framework::DDim out_dims({in_dims[0], frame_size});
+ framework::DDim proj_out_dims({in_dims[0], proj_dims[1]});
+ ctx->SetOutputDim("Projection", proj_out_dims);
+ ctx->SetOutputDim("Cell", out_dims);
+ ctx->SetOutputDim("BatchGate", in_dims);
+ ctx->SetOutputDim("BatchCellPreAct", out_dims);
+ ctx->SetOutputDim("BatchHidden", out_dims);
+ ctx->ShareLoD("Input", "Projection");
+ ctx->ShareLoD("Input", "Cell");
+ }
+
+ protected:
+ framework::OpKernelType GetExpectedKernelType(
+ const framework::ExecutionContext& ctx) const override {
+ return framework::OpKernelType(
+ framework::ToDataType(ctx.Input<framework::LoDTensor>("Input")->type()),
+ ctx.device_context());
+ }
+};
+
+class LSTMPOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+ LSTMPOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+ : OpProtoAndCheckerMaker(proto, op_checker) {
+ AddInput("Input",
+ "(LoDTensor) the input for sequence data, which supports "
+ "variable-time length input sequence. The underlying tensor in "
+ "this LoDTensor is a matrix with shape (T X 4D), where T is the "
+ "total time steps in this mini-batch, D is the hidden size.");
+ AddInput("H0",
+ "(Tensor, optional) the initial hidden state is an optional "
+ "input. This is a tensor with shape (N x D), where N is the "
+ "batch size and D is the hidden size.")
+ .AsDispensable();
+ AddInput("C0",
+ "(Tensor, optional) the initial cell state is an optional "
+ "input. This is a tensor with shape (N x D), where N is the "
+ "batch size. `C0` should not be null if `H0` provided.")
+ .AsDispensable();
+ AddInput("Weight",
+ "(Tensor) the learnable hidden-hidden weights."
+ " - The shape is (P x 4D), where P is the projection layer size "
+ "and D is the hidden size."
+ " - Weight = {W_cr, W_ir, W_fr, W_or}");
+ AddInput("ProjWeight",
+ "(Tensor) the learnable weight of the projection layer."
+ " - The shape is (D x P), where P is the recurrent projection "
+ "layer size and D is the hidden size."
+ " - ProjWeight = {W_rh}");
+ AddInput("Bias",
+ "(Tensor) the learnable biases, which contains two parts: "
+ "input-hidden biases and peephole connections weights if "
+ "setting `use_peepholes` to `True`. "
+ "1. `use_peepholes = False` "
+ " - The shape is (1 x 4D). "
+ " - Bias = {b_c, b_i, b_f, b_o}."
+ "2. `use_peepholes = True` "
+ " - The shape is (1 x 7D). "
+ " - Bias = {b_c, b_i, b_f, b_o, W_ic, W_fc, W_oc}.");
+ AddOutput("Projection",
+ "(LoDTensor) the projection of the hidden state of LSTMP "
+ "operator. The shape is (T x P), and LoD is the same with the "
+ "`Input`.");
+ AddOutput("Cell",
+ "(LoDTensor) the cell state of LSTMP operator. "
+ "The shape is (T x D), and lod is the same with the `Input`.");
+ AddOutput("BatchGate",
+ "(LoDTensor) This LoDTensor contains input gate, forget gate "
+ "and output gate after the activations. This LoDTensor has the "
+ "same shape as the reorganized input, which is also be called "
+ "batch input. The LoD size is 2. The first-level LoD is the "
+ "batch offsets and the second contains the indices, which "
+ "denotes the position of reorganized sequence in the raw input.")
+ .AsIntermediate();
+ AddOutput("BatchCellPreAct",
+ "(LoDTensor) the pre-activation cell state reorganized in batch. "
+ "This LoDTensor is obtained in the forward and used in the "
+ "backward.")
+ .AsIntermediate();
+ AddOutput("BatchHidden",
+ "(LoDTensor) the hidden state reorganized in batch. "
+ "This LoDTensor is obtained in the forward and used in the "
+ "backward.")
+ .AsIntermediate();
+ AddOutput("OrderedP0",
+ "(Tensor) the projection of the initial hidden state "
+ "H0. This is a tensor with shape (N x P), where N is the "
+ "batch size and P is the hidden size.")
+ .AsIntermediate();
+ AddAttr("use_peepholes",
+ "(bool, defalut: True) "
+ "whether to enable diagonal/peephole connections.")
+ .SetDefault(true);
+ AddAttr("is_reverse",
+ "(bool, defalut: False) "
+ "whether to compute reversed LSTMP.")
+ .SetDefault(false);
+ AddAttr<std::string>(
+ "gate_activation",
+ "(string, default: sigmoid)"
+ "The activation for input gate, forget gate and output "
+ "gate, `sigmoid` by default.")
+ .SetDefault("sigmoid")
+ .InEnum({"sigmoid", "tanh", "relu", "identity"});
+ AddAttr("cell_activation",
+ "(string, default: tanh)"
+ "The activation for cell output, `tanh` by defalut.")
+ .SetDefault("tanh")
+ .InEnum({"sigmoid", "tanh", "relu", "identity"});
+ AddAttr("candidate_activation",
+ "(string, default: tanh)"
+ "The activation for candidate hidden state, "
+ "`tanh` by default.")
+ .SetDefault("tanh")
+ .InEnum({"sigmoid", "tanh", "relu", "identity"});
+ AddAttr("proj_activation",
+ "(string, default: tanh)"
+ "The activation for projection output, "
+ "`tanh` by defalut.")
+ .SetDefault("tanh")
+ .InEnum({"sigmoid", "tanh", "relu", "identity"});
+ AddComment(R"DOC(
+Long-Short Term Memory with recurrent Projection layer (LSTMP) Operator.
+
+LSTMP adds a separate projection layer after the LSTM layer, projecting the
+original hidden state to a lower-dimensional one. It was proposed to reduce
+the total number of parameters and the computational complexity of the LSTM,
+especially when the number of output units is relatively large
+(https://research.google.com/pubs/archive/43905.pdf).
+
+The formula is as follows:
+
+$$
+i_t = \sigma(W_{ix}x_{t} + W_{ir}r_{t-1} + W_{ic}c_{t-1} + b_i) \\
+
+f_t = \sigma(W_{fx}x_{t} + W_{fr}r_{t-1} + W_{fc}c_{t-1} + b_f) \\
+
+\tilde{c_t} = act_g(W_{cx}x_t + W_{cr}r_{t-1} + b_c) \\
+
+o_t = \sigma(W_{ox}x_{t} + W_{or}r_{t-1} + W_{oc}c_t + b_o) \\
+
+c_t = f_t \odot c_{t-1} + i_t \odot \tilde{c_t} \\
+
+h_t = o_t \odot act_h(c_t) \\
+
+r_t = \overline{act_h}(W_{rh}h_t)
+$$
+
+where the W terms denote weight matrices (e.g. $W_{ix}$ is the matrix
+of weights from the input to the input gate), and $W_{ic}, W_{fc}, W_{oc}$
+are diagonal weight matrices for peephole connections. In our implementation,
+we use vectors to represent these diagonal weight matrices. The b terms
+denote bias vectors ($b_i$ is the input gate bias vector), $\sigma$
+is the activation, such as logistic sigmoid function, and
+$i, f, o$ and $c$ are the input gate, forget gate, output gate,
+and cell activation vectors, respectively, all of which have the same size as
+the cell output activation vector $h$. Here $h$ is usually called the hidden
+state and $r$ denotes its recurrent projection. And $\tilde{c_t}$ is also
+called the candidate hidden state, whose computation is based on the current
+input and previous hidden state.
+
+$\odot$ denotes the element-wise product of vectors. $act_g$ and $act_h$
+are the cell input and cell output activation functions, for which `tanh` is
+usually used. $\overline{act_h}$ is the activation function for the
+projection output, usually `identity` or the same as $act_h$.
+
+Note that the operations $W_{ix}x_{t}, W_{fx}x_{t}, W_{cx}x_{t}, W_{ox}x_{t}$
+on the input $x_{t}$ are NOT included in this operator.
+Users can choose to use a fully-connected operator before the LSTMP operator.
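+
+For example (illustrative sizes only), with hidden size D = 512 and projection
+size P = 256, `Input` has shape (T x 2048), `Weight` has shape (256 x 2048),
+`ProjWeight` has shape (512 x 256), and `Projection` has shape (T x 256).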
+
+)DOC");
+ }
+};
+
+class LSTMPGradOp : public framework::OperatorWithKernel {
+ public:
+ using framework::OperatorWithKernel::OperatorWithKernel;
+
+ void InferShape(framework::InferShapeContext* ctx) const override {
+ PADDLE_ENFORCE(ctx->HasInput("Input"),
+ "Input(Input) of LSTMP operator should not be null.");
+ PADDLE_ENFORCE(ctx->HasInput("Projection"),
+ "Input(Projection) of LSTMP operator should not be null.");
+ PADDLE_ENFORCE(ctx->HasInput("Cell"),
+ "Input(Cell) of LSTMP operator should not be null.");
+ PADDLE_ENFORCE(ctx->HasInput("Weight"),
+ "Input(Weight) of LSTMP operator should not be null.");
+ PADDLE_ENFORCE(ctx->HasInput("ProjWeight"),
+ "Input(ProjWeight) of LSTMP operator should not be null.");
+ PADDLE_ENFORCE(ctx->HasInput("Bias"),
+ "Input(Bias) of LSTMP operator should not be null.");
+
+ PADDLE_ENFORCE(ctx->HasInput("BatchGate"),
+ "Input(BatchGate) of LSTMP operator should not be null.");
+ PADDLE_ENFORCE(ctx->HasInput("BatchCellPreAct"),
+ "Input(BatchGate) of LSTMP operator should not be null.");
+
+ auto SetOutGradDim = [&ctx](const std::string& name) {
+ auto g_name = framework::GradVarName(name);
+ if (ctx->HasOutput(g_name))
+ ctx->SetOutputDim(g_name, ctx->GetInputDim(name));
+ };
+
+ SetOutGradDim("Input");
+ SetOutGradDim("Weight");
+ SetOutGradDim("ProjWeight");
+ SetOutGradDim("Bias");
+ SetOutGradDim("H0");
+ SetOutGradDim("C0");
+ }
+
+ protected:
+ framework::OpKernelType GetExpectedKernelType(
+ const framework::ExecutionContext& ctx) const override {
+ return framework::OpKernelType(
+ framework::ToDataType(ctx.Input<framework::LoDTensor>("Input")->type()),
+ ctx.device_context());
+ }
+};
+
+} // namespace operators
+} // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(lstmp, ops::LSTMPOp, ops::LSTMPOpMaker, lstmp_grad,
+ ops::LSTMPGradOp);
+REGISTER_OP_CPU_KERNEL(
+ lstmp, ops::LSTMPKernel<paddle::platform::CPUDeviceContext, float>,
+ ops::LSTMPKernel<paddle::platform::CPUDeviceContext, double>);
+REGISTER_OP_CPU_KERNEL(
+ lstmp_grad, ops::LSTMPGradKernel<paddle::platform::CPUDeviceContext, float>,
+ ops::LSTMPGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/operators/lstmp_op.cu b/paddle/operators/lstmp_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..7fcbcfecc871976fdfbfffbbb4e0243b91351a29
--- /dev/null
+++ b/paddle/operators/lstmp_op.cu
@@ -0,0 +1,24 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/lstmp_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+ lstmp, ops::LSTMPKernel<paddle::platform::CUDADeviceContext, float>,
+ ops::LSTMPKernel<paddle::platform::CUDADeviceContext, double>);
+REGISTER_OP_CUDA_KERNEL(
+ lstmp_grad,
+ ops::LSTMPGradKernel<paddle::platform::CUDADeviceContext, float>,
+ ops::LSTMPGradKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/operators/lstmp_op.h b/paddle/operators/lstmp_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..ee82d5c10a5421b181e525f49a263d4808ede62f
--- /dev/null
+++ b/paddle/operators/lstmp_op.h
@@ -0,0 +1,491 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/operators/activation_op.h"
+#include "paddle/operators/math/detail/activation_functions.h"
+#include "paddle/operators/math/lstm_compute.h"
+#include "paddle/operators/math/math_function.h"
+#include "paddle/operators/math/sequence2batch.h"
+
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using LoDTensor = framework::LoDTensor;
+using Tensor = framework::Tensor;
+
+template <typename T, int MajorType = Eigen::RowMajor,
+ typename IndexType = Eigen::DenseIndex>
+using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
+
+template <typename DeviceContext, typename T>
+inline void ReorderInitState(const DeviceContext& ctx,
+ const framework::Tensor& src, const size_t* index,
+ framework::Tensor* dst, bool indexed_src) {
+ math::CopyMatrixRowsFunctor<DeviceContext, T> row_shuffle;
+ dst->mutable_data<T>(src.dims(), ctx.GetPlace());
+ row_shuffle(ctx, src, index, *dst, indexed_src);
+}
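+// A sketch of why this is needed (assumed example): if a mini-batch holds
+// three sequences of lengths {2, 3, 1}, LoDTensor2BatchFunctor reorders them
+// by length as {3, 2, 1} and `index` records their original positions, so the
+// initial hidden/cell states must be shuffled with the same index.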
+
+template <typename DeviceContext, typename T>
+class LSTMPKernel : public framework::OpKernel<T> {
+ public:
+ template <typename Device, typename X, typename Y>
+ void ActCompute(const math::detail::ActivationType act_type, const Device& d,
+ X x, Y y) const {
+ if (act_type == math::detail::ActivationType::kIdentity)
+ y.device(d) = x;
+ else if (act_type == math::detail::ActivationType::kSigmoid)
+ SigmoidFunctor<T>()(d, x, y);
+ else if (act_type == math::detail::ActivationType::kTanh)
+ TanhFunctor<T>()(d, x, y);
+ else if (act_type == math::detail::ActivationType::kReLU)
+ ReluFunctor<T>()(d, x, y);
+ else
+ PADDLE_THROW("unsupported activation type");
+ }
+
+ void Compute(const framework::ExecutionContext& ctx) const override {
+ auto* input = ctx.Input<LoDTensor>("Input");
+ auto* weight = ctx.Input<Tensor>("Weight");
+ auto* proj_weight = ctx.Input<Tensor>("ProjWeight");
+ auto* bias = ctx.Input<Tensor>("Bias");
+
+ auto* hidden_t0 = ctx.Input<Tensor>("H0");
+ auto* ordered_proj0 = ctx.Output<Tensor>("OrderedP0");
+ auto* cell_t0 = ctx.Input<Tensor>("C0");
+
+ auto* batch_gate = ctx.Output<LoDTensor>("BatchGate");
+ batch_gate->mutable_data<T>(ctx.GetPlace());
+ auto* proj_out = ctx.Output<LoDTensor>("Projection");
+ proj_out->mutable_data<T>(ctx.GetPlace());
+ auto* cell_out = ctx.Output<LoDTensor>("Cell");
+ cell_out->mutable_data<T>(ctx.GetPlace());
+
+ bool is_reverse = ctx.Attr<bool>("is_reverse");
+ math::LoDTensor2BatchFunctor<DeviceContext, T> to_batch;
+ auto& device_ctx = ctx.template device_context<DeviceContext>();
+ to_batch(device_ctx, *input, *batch_gate, true, is_reverse);
+
+ auto in_dims = input->dims();
+ int frame_size = static_cast<int>(in_dims[1] / 4);
+ framework::DDim dims({in_dims[0], frame_size});
+ framework::DDim proj_dims({in_dims[0], proj_weight->dims()[1]});
+
+ if (bias) {
+ Tensor b = *bias;
+ b.Resize({bias->numel(), 1});
+ Tensor gate_bias = b.Slice(0, 4 * frame_size);
+ math::RowwiseAdd<DeviceContext, T> add_bias;
+ add_bias(device_ctx, *batch_gate, gate_bias, batch_gate);
+ }
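+ // Only the first 4 * frame_size entries of Bias are the input-hidden gate
+ // biases added above; with peepholes enabled, the remaining 3 * frame_size
+ // entries (W_ic, W_fc, W_oc) are consumed below as the check vectors.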
+
+ math::LstmMetaValue<T> lstmp_value;
+ if (bias && ctx.Attr<bool>("use_peepholes")) {
+ T* bias_data = const_cast<T*>(bias->data<T>());
+ // the code style in LstmpMetaValue will be updated later.
+
+ lstmp_value.check_ig = bias_data + 4 * frame_size;
+ lstmp_value.check_fg = lstmp_value.check_ig + frame_size;
+ lstmp_value.check_og = lstmp_value.check_fg + frame_size;
+ } else {
+ lstmp_value.check_ig = nullptr;
+ lstmp_value.check_fg = nullptr;
+ lstmp_value.check_og = nullptr;
+ }
+ lstmp_value.prev_state_value = nullptr;
+ Tensor ordered_c0;
+ const size_t* order = batch_gate->lod()[2].data();
+ if (cell_t0) {
+ // Since the batch computing for LSTMP reorders the input sequences
+ // according to their length, the initialized cell state also needs
+ // to be reordered.
+ ReorderInitState<DeviceContext, T>(device_ctx, *cell_t0, order,
+ &ordered_c0, true);
+ lstmp_value.prev_state_value = ordered_c0.data<T>();
+ }
+
+ // Use local variables to hold the batched projection and cell state.
+ LoDTensor batch_proj, batch_cell;
+ auto* batch_cell_pre_act = ctx.Output<LoDTensor>("BatchCellPreAct");
+ batch_cell_pre_act->mutable_data<T>(dims, ctx.GetPlace());
+ auto* batch_hidden = ctx.Output<LoDTensor>("BatchHidden");
+ batch_hidden->mutable_data<T>(dims, ctx.GetPlace()); // T x D
+ batch_proj.mutable_data<T>(proj_dims, ctx.GetPlace()); // T x P
+ batch_cell.mutable_data<T>(dims, ctx.GetPlace()); // T x D
+
+ auto batch_starts = batch_gate->lod()[0];
+ size_t num_batch = batch_starts.size() - 1;
+ auto gate_act = math::detail::GetActivationType(
+ ctx.Attr<std::string>("gate_activation"));
+ auto cell_act = math::detail::GetActivationType(
+ ctx.Attr<std::string>("cell_activation"));
+ auto cand_act = math::detail::GetActivationType(
+ ctx.Attr<std::string>("candidate_activation"));
+ auto proj_act = math::detail::GetActivationType(
+ ctx.Attr<std::string>("proj_activation"));
+ auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
+
+ for (size_t n = 0; n < num_batch; n++) {
+ int bstart = static_cast<int>(batch_starts[n]);
+ int bend = static_cast<int>(batch_starts[n + 1]);
+
+ Tensor gate_t = batch_gate->Slice(bstart, bend);
+ Tensor hidden_t = batch_hidden->Slice(bstart, bend);
+ Tensor proj_t = batch_proj.Slice(bstart, bend);
+ Tensor cell_t = batch_cell.Slice(bstart, bend);
+ Tensor cell_pre_act_t = batch_cell_pre_act->Slice(bstart, bend);
+
+ int cur_batch_size = bend - bstart;
+
+ if (n > 0) {
+ int pre_h_start = static_cast(batch_starts[n - 1]);
+ int pre_h_end = pre_h_start + cur_batch_size;
+ auto pre_proj_t = batch_proj.Slice(pre_h_start, pre_h_end);
+ math::matmul<DeviceContext, T>(device_ctx, pre_proj_t, false, *weight,
+ false, static_cast<T>(1.0), &gate_t,
+ static_cast<T>(1.0));
+ } else if (hidden_t0) {
+ // If n == 0 and there is no initialized hidden state, that is to say
+ // the H0 is zeros, the calculation W_h * H0 will be skipped.
+ // If n == 0 and there is an initialized hidden state, calculate W_h * H0.
+
+ // Since the batch computing for LSTMP reorders the input sequences
+ // according to their length, the initialized hidden state also needs
+ // to be reordered.
+
+ Tensor ordered_h0;
+ ordered_proj0->mutable_data<T>(ctx.GetPlace());
+ ReorderInitState<DeviceContext, T>(device_ctx, *hidden_t0, order,
+ &ordered_h0, true);
+ math::matmul<DeviceContext, T>(device_ctx, ordered_h0, false,
+ *proj_weight, false, static_cast<T>(1.0),
+ ordered_proj0, static_cast<T>(0.0));
+ if (proj_act != math::detail::ActivationType::kIdentity) {
+ auto proj0_dev = EigenMatrix<T>::From(*ordered_proj0);
+ ActCompute(cell_act, place, proj0_dev, proj0_dev);
+ }
+ math::matmul<DeviceContext, T>(device_ctx, *ordered_proj0, false,
+ *weight, false, static_cast<T>(1.0),
+ &gate_t, static_cast<T>(1.0));
+ }
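+ // In both branches above, the recurrent contribution added to the gates
+ // comes from the previous projection r_{t-1} (or the projected H0), not
+ // from the full hidden state, matching the LSTMP formulas in the comment.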
+
+ lstmp_value.gate_value = gate_t.data<T>();
+ lstmp_value.output_value = hidden_t.data<T>();
+ lstmp_value.state_value = cell_t.data<T>();
+ lstmp_value.state_active_value = cell_pre_act_t.data<T>();
+ math::LstmUnitFunctor<DeviceContext, T>::compute(
+ device_ctx, lstmp_value, frame_size, cur_batch_size, gate_act,
+ cell_act, cand_act);
+ lstmp_value.prev_state_value = lstmp_value.state_value;
+ math::matmul<DeviceContext, T>(device_ctx, hidden_t, false, *proj_weight,
+ false, static_cast<T>(1.0), &proj_t,
+ static_cast<T>(0.0));
+ if (proj_act != math::detail::ActivationType::kIdentity) {
+ auto proj_t_dev = EigenMatrix<T>::From(proj_t);
+ ActCompute(cell_act, place, proj_t_dev, proj_t_dev);
+ }
+ }
+
+ math::Batch2LoDTensorFunctor<DeviceContext, T> to_seq;
+ batch_proj.set_lod(batch_gate->lod());
+ // restore the projection output in LoDTensor from the batched projection
+ to_seq(device_ctx, batch_proj, *proj_out);
+
+ batch_cell.set_lod(batch_gate->lod());
+ // restore the output cell state in LoDTensor from the batch cell
+ to_seq(device_ctx, batch_cell, *cell_out);
+ }
+};
+
+template <typename DeviceContext, typename T>
+class LSTMPGradKernel : public framework::OpKernel<T> {
+ public:
+ template <typename Device, typename X, typename Y, typename DX, typename DY>
+ void ActGradCompute(const math::detail::ActivationType act_type,
+ const Device& d, X x, Y y, DX dx, DY dy) const {
+ // x is dummy and won't be used even in Relu(use y instead)
+ if (act_type == math::detail::ActivationType::kIdentity)
+ dx.device(d) = dy;
+ else if (act_type == math::detail::ActivationType::kSigmoid)
+ SigmoidGradFunctor<T>()(d, x, y, dy, dx);
+ else if (act_type == math::detail::ActivationType::kTanh)
+ TanhGradFunctor<T>()(d, x, y, dy, dx);
+ else if (act_type == math::detail::ActivationType::kReLU)
+ ReluGradFunctor<T>()(d, x, y, dy, dx);
+ else
+ PADDLE_THROW("unsupported activation type");
+ }
+
+ void Compute(const framework::ExecutionContext& ctx) const override {
+ auto* input = ctx.Input<LoDTensor>("Input");
+ auto* weight = ctx.Input<Tensor>("Weight");
+ auto* proj_weight = ctx.Input<Tensor>("ProjWeight");
+ auto* bias = ctx.Input<Tensor>("Bias");
+
+ auto* proj_out = ctx.Input<LoDTensor>("Projection");
+ auto* cell_out = ctx.Input<LoDTensor>("Cell");
+
+ auto* batch_gate = ctx.Input<LoDTensor>("BatchGate");
+ auto* batch_cell_pre_act = ctx.Input<LoDTensor>("BatchCellPreAct");
+ auto* batch_hidden = ctx.Input<LoDTensor>("BatchHidden");
+
+ auto* projection_g =
+ ctx.Input<LoDTensor>(framework::GradVarName("Projection"));
+
+ auto* in_g = ctx.Output<LoDTensor>(framework::GradVarName("Input"));
+ auto* weight_g = ctx.Output<Tensor>(framework::GradVarName("Weight"));
+ auto* proj_weight_g =
+ ctx.Output<Tensor>(framework::GradVarName("ProjWeight"));
+ auto* bias_g = ctx.Output<Tensor>(framework::GradVarName("Bias"));
+
+ auto* h0 = ctx.Input<Tensor>("H0");
+ auto* ordered_proj0 = ctx.Input<Tensor>("OrderedP0");
+ auto* c0 = ctx.Input<Tensor>("C0");
+
+ auto* h0_g = ctx.Output<Tensor>(framework::GradVarName("H0"));
+ auto* c0_g = ctx.Output<Tensor>(framework::GradVarName("C0"));
+
+ auto& device_ctx = ctx.template device_context<DeviceContext>();
+ math::SetConstant<DeviceContext, T> zero;
+ if (weight_g) {
+ weight_g->mutable_data<T>(ctx.GetPlace());
+ zero(device_ctx, weight_g, static_cast<T>(0.0));
+ }
+ if (proj_weight_g) {
+ proj_weight_g->mutable_data<T>(ctx.GetPlace());
+ zero(device_ctx, proj_weight_g, static_cast<T>(0.0));
+ }
+
+ // ordered_h0/c0 is the reordered hidden/cell initialization.
+ // ordered_h0_g/c0_g is the reordered gradient of hidden/cell
+ // initialization.
+ Tensor ordered_h0, ordered_c0, ordered_h0_g, ordered_c0_g;
+ const size_t* order = batch_gate->lod()[2].data();
+ if (c0) {
+ ReorderInitState<DeviceContext, T>(device_ctx, *c0, order, &ordered_c0,
+ true);
+ }
+ if (c0 && c0_g) {
+ ordered_c0_g.mutable_data<T>(c0_g->dims(), ctx.GetPlace());
+ }
+
+ auto in_dims = input->dims();
+ auto out_dims = cell_out->dims();
+ framework::DDim proj_dims({in_dims[0], proj_weight->dims()[1]});
+ int frame_size = static_cast<int>(in_dims[1] / 4);
+ PADDLE_ENFORCE_EQ(frame_size, out_dims[1]);
+
+ math::LstmMetaValue<T> lstmp_value;
+ if (bias && ctx.Attr<bool>("use_peepholes")) {
+ T* bias_data = const_cast<T*>(bias->data<T>());
+ lstmp_value.check_ig = bias_data + 4 * frame_size;
+ lstmp_value.check_fg = lstmp_value.check_ig + frame_size;
+ lstmp_value.check_og = lstmp_value.check_fg + frame_size;
+ } else {
+ lstmp_value.check_ig = nullptr;
+ lstmp_value.check_fg = nullptr;
+ lstmp_value.check_og = nullptr;
+ }
+
+ math::LstmMetaGrad<T> lstmp_grad;
+
+ if (bias && bias_g) {
+ bias_g->mutable_data<T>(ctx.GetPlace());
+ zero(device_ctx, bias_g, static_cast<T>(0.0));
+ }
+ if (bias && bias_g && ctx.Attr<bool>("use_peepholes")) {
+ T* bias_g_data = bias_g->data<T>();
+ lstmp_grad.check_ig_grad = bias_g_data + 4 * frame_size;
+ lstmp_grad.check_fg_grad = lstmp_grad.check_ig_grad + frame_size;
+ lstmp_grad.check_og_grad = lstmp_grad.check_fg_grad + frame_size;
+ } else {
+ lstmp_grad.check_ig_grad = nullptr;
+ lstmp_grad.check_fg_grad = nullptr;
+ lstmp_grad.check_og_grad = nullptr;
+ }
+
+ math::LoDTensor2BatchFunctor<DeviceContext, T> to_batch;
+
+ auto ToBatch = [&batch_gate, &to_batch](
+ const DeviceContext& ctx, const framework::LoDTensor& src,
+ const framework::DDim& dims, framework::LoDTensor& dst) {
+ dst.mutable_data<T>(dims, ctx.GetPlace());
+ dst.set_lod(batch_gate->lod());
+ to_batch(ctx, src, dst, false);
+ };
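+ // ToBatch converts a sequence-ordered LoDTensor into batch order, reusing
+ // the batch LoD computed for BatchGate in the forward pass so gradients
+ // line up with the forward reordering.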
+
+ LoDTensor batch_hidden_g, batch_proj, batch_proj_g, batch_cell;
+ batch_hidden_g.mutable_data<T>(out_dims, ctx.GetPlace());
+ ToBatch(device_ctx, *proj_out, proj_dims, batch_proj); // T x P
+ ToBatch(device_ctx, *projection_g, proj_dims, batch_proj_g); // T x P
+ ToBatch(device_ctx, *cell_out, out_dims, batch_cell); // T x D
+
+ LoDTensor batch_cell_g, batch_gate_g;
+ batch_cell_g.mutable_data<T>(out_dims, ctx.GetPlace());
+ // TODO(qingqing) support the case where the output cell has a gradient.
+ // to_batch(device_ctx, *cell_g, batch_cell_g, false);
+ zero(device_ctx, &batch_cell_g, static_cast<T>(0.0));
+ batch_gate_g.mutable_data<T>(batch_gate->dims(), ctx.GetPlace());
+ batch_gate_g.set_lod(batch_gate->lod());
+
+ auto gate_act = math::detail::GetActivationType(
+ ctx.Attr<std::string>("gate_activation"));
+ auto cell_act = math::detail::GetActivationType(
+ ctx.Attr<std::string>("cell_activation"));
+ auto cand_act = math::detail::GetActivationType(
+ ctx.Attr<std::string>("candidate_activation"));
+ auto proj_act = math::detail::GetActivationType(
+ ctx.Attr<std::string>("proj_activation"));
+ auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
+
+ auto batch_starts = batch_gate->lod()[0];
+ size_t num_batch = batch_starts.size() - 1;
+ for (int n = static_cast<int>(num_batch) - 1; n >= 0; n--) {
+ int bstart = static_cast<int>(batch_starts[n]);
+ int bend = static_cast<int>(batch_starts[n + 1]);
+
+ Tensor cur_proj = batch_proj.Slice(bstart, bend);
+ Tensor proj_g = batch_proj_g.Slice(bstart, bend);
+ if (proj_act != math::detail::ActivationType::kIdentity) {
+ auto cur_proj_dev = EigenMatrix<T>::From(cur_proj);
+ auto proj_g_dev = EigenMatrix<T>::From(proj_g);
+ ActGradCompute(cell_act, place, cur_proj_dev, cur_proj_dev, proj_g_dev,
+ proj_g_dev);
+ }
+ /* hidden state backward */
+ Tensor out_g = batch_hidden_g.Slice(bstart, bend);
+ math::matmul<DeviceContext, T>(device_ctx, proj_g, false, *proj_weight,
+ true, static_cast<T>(1.0), &out_g,
+ static_cast<T>(0.0));
+ /* projection weight backward */
+ if (proj_weight_g) {
+ Tensor hidden_t = batch_hidden->Slice(bstart, bend);
+ math::matmul