diff --git a/.clang-format b/.clang-format
index aff93435f58c522f5ed1090aef2005f76e91cf31..8b5830627348c6bff12260b7d9adbd357f074718 100644
--- a/.clang-format
+++ b/.clang-format
@@ -19,7 +19,7 @@ BasedOnStyle:  Google
 IndentWidth:     2
 TabWidth:        2
 ContinuationIndentWidth: 4
-AccessModifierOffset: -2  # The private/protected/public has no indent in class
+AccessModifierOffset: -1  # The private/protected/public has no indent in class
 Standard:  Cpp11 
 AllowAllParametersOfDeclarationOnNextLine: true
 BinPackParameters: false
diff --git a/.copyright.hook b/.copyright.hook
index 09afff2072df3384a429d01d06188218ae6e85d1..86b16ebdc46047c7cb3d7731a71cbf9647a1f2fe 100644
--- a/.copyright.hook
+++ b/.copyright.hook
@@ -9,7 +9,7 @@ import subprocess
 import platform
 
 COPYRIGHT = '''
-  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 6140340890c0e5025eb08209e8ea78df918b4dc0..eeda759ff18ccb86ce6a585fe41cb972ea3ae295 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -34,6 +34,14 @@ repos:
         entry: bash ./tools/codestyle/cpplint_pre_commit.hook
         language: system
         files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx)$
+-   repo: local
+    hooks:
+    -   id: pylint-doc-string
+        name: pylint
+        description: Check python docstring style using docstring_checker.
+        entry: bash ./tools/codestyle/pylint_pre_commit.hook
+        language: system
+        files: \.(py)$
 -   repo: https://github.com/PaddlePaddle/pre-commit-golang
     sha: 8337620115c25ff8333f1b1a493bd031049bd7c0
     hooks:
diff --git a/.travis.yml b/.travis.yml
index 3391e2c3cab9938c9dc5705b51367c707d3bbe9d..8c772030925dcad3909f142b08e4d8057a3f89b7 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -18,6 +18,8 @@ env:
 addons:
   ssh_known_hosts: 13.229.163.131
 before_install:
+  # For pylint dockstring checker
+  - sudo pip install pylint pytest astroid isort
   - |
     function timeout() { perl -e 'alarm shift; exec @ARGV' "$@"; }
 script:
diff --git a/AUTHORS.md b/AUTHORS.md
index 4ee05420982d13f686cf13e8957ce41dfcdd2cb8..11f227be7148d8d6e055538347a8c31679406c84 100644
--- a/AUTHORS.md
+++ b/AUTHORS.md
@@ -4,6 +4,7 @@
 | backyes | Yan-Fei Wang |
 | baiyfbupt | Yi-Fan Bai |
 | beckett1124 | Bin Qi |
+| ChengduoZH | Cheng-Duo Zhao|
 | chengxiaohua1105 | Xiao-Hua Cheng |
 | cxwangyi, yiwangbaidu, wangkuiyi | Yi Wang |
 | cxysteven | Xing-Yi Cheng |
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 23bbe829ac16180088bfa37df66e23f19b021ea3..cfaab206e1f321a55119d4a8d65c4a99d3819fff 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -25,7 +25,6 @@ message(STATUS "CXX compiler: ${CMAKE_CXX_COMPILER}, version: "
 message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: "
         "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}")
 
-find_package(Sphinx)
 if(NOT CMAKE_CROSSCOMPILING)
     find_package(CUDA QUIET)
 endif(NOT CMAKE_CROSSCOMPILING)
@@ -42,7 +41,6 @@ option(WITH_MKL         "Compile PaddlePaddle with MKL support."        ${AVX_FO
 option(WITH_DSO         "Compile PaddlePaddle with dynamic linked CUDA" ON)
 option(WITH_TESTING     "Compile PaddlePaddle with unit testing"        OFF)
 option(WITH_SWIG_PY     "Compile PaddlePaddle with inference api"       ON)
-option(WITH_STYLE_CHECK "Compile PaddlePaddle with style check"         ON)
 option(WITH_PYTHON      "Compile PaddlePaddle with python interpreter"  ON)
 option(WITH_DOUBLE      "Compile PaddlePaddle with double precision"    OFF)
 option(WITH_RDMA        "Compile PaddlePaddle with RDMA support"        OFF)
@@ -59,8 +57,10 @@ option(GLIDE_INSTALL    "Download and install go dependencies "         ON)
 option(USE_NNPACK       "Compile PaddlePaddle with NNPACK library"      OFF)
 option(WITH_DISTRIBUTE  "Compile with grpc distributed support"         OFF)
 option(USE_EIGEN_FOR_BLAS   "Use matrix multiplication in Eigen"        OFF)
+option(EIGEN_USE_THREADS "Compile with multi-threaded Eigen"            OFF)
 option(WITH_ARM_FP16    "Use half precision support on armv8.2-a cpu"   OFF)
 option(WITH_FAST_BUNDLE_TEST    "Bundle tests that can be run in a single process together to reduce launch overhead"   OFF)
+option(WITH_CONTRIB     "Compile the third-party contributation"        OFF)
 
 # CMAKE_BUILD_TYPE
 if(NOT CMAKE_BUILD_TYPE)
@@ -101,6 +101,9 @@ endif()
 set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING
   "A path setting third party libraries download & build directories.")
 
+set(FLUID_INSTALL_DIR "${CMAKE_BINARY_DIR}/fluid_install_dir" CACHE STRING
+  "A path setting fluid shared and static libraries")
+
 if (WITH_C_API AND WITH_PYTHON)
   message(WARNING "It is suggest not embedded a python interpreter in Paddle "
     "when using C-API. It will give an unpredictable behavior when using a "
@@ -118,13 +121,14 @@ else()
 endif()
 
 set(WITH_MKLML ${WITH_MKL})
-if (WITH_MKL AND AVX2_FOUND)
-    set(WITH_MKLDNN ON)
-else()
-    message(STATUS "Do not have AVX2 intrinsics and disabled MKL-DNN")
-    set(WITH_MKLDNN OFF)
+if (NOT DEFINED WITH_MKLDNN)
+    if (WITH_MKL AND AVX2_FOUND)
+        set(WITH_MKLDNN ON)
+    else()
+        message(STATUS "Do not have AVX2 intrinsics and disabled MKL-DNN")
+        set(WITH_MKLDNN OFF)
+    endif()
 endif()
-
 ########################################################################################
 
 include(external/mklml)     # download mklml package
@@ -153,7 +157,6 @@ include(cupti)
 include(configure)          # add paddle env configuration
 include(generic)            # simplify cmake module
 include(package)            # set paddle packages
-include(cpplint)            # set paddle c++ style
 include(ccache)             # set ccache for compilation
 include(util)               # set unittest and link libs
 include(rdma)               # set rdma libraries
@@ -202,7 +205,7 @@ endif(USE_NNPACK)
 
 add_subdirectory(proto)
 
-if(NOT MOBILE_INFERENCE)
+if(NOT MOBILE_INFERENCE AND NOT WITH_FLUID_ONLY)
     # "add_subdirectory(go)" should be placed after the following loine,
     # because it depends on paddle/optimizer.
     add_subdirectory(paddle/optimizer)
@@ -226,5 +229,11 @@ if(WITH_PYTHON)
 endif()
 
 if(WITH_DOC)
+    find_package(Sphinx REQUIRED)
+    find_python_module(recommonmark REQUIRED)
     add_subdirectory(doc)
 endif()
+
+if (WITH_CONTRIB)
+    add_subdirectory(paddle/contrib)
+endif()
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 3c36cffcb4eeaaf7f8cff5167777628dd2697e7d..b1b02bcc2f4fd14297715bcf5bfd1617e3d5f0c9 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -58,6 +58,8 @@ PaddlePaddle uses this [Git branching model](http://nvie.com/posts/a-successful-
     create mode 100644 233
    ```
 
+	NOTE: The `yapf` installed by `pip install pre-commit` and `conda install -c conda-forge pre-commit` is slightly different. Paddle developers use `pip install pre-commit`.
+
 1. Build and test
 
    Users can build PaddlePaddle natively on Linux and Mac OS X.  But to unify the building environment and to make it easy for debugging, the recommended way is [using Docker](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/build_en.md).
diff --git a/Dockerfile b/Dockerfile
index 164fe84904947bfc3cf71132b5fba04744460b26..752fea5951bdc8c2cf79a17c960217c88ae62571 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -24,12 +24,12 @@ COPY ./paddle/scripts/docker/root/ /root/
 
 RUN apt-get update && \
     apt-get install -y --allow-downgrades \
-    git python-pip python-dev openssh-server bison \
+    git python-pip python-dev python-opencv openssh-server bison \
     libnccl2=2.1.2-1+cuda8.0 libnccl-dev=2.1.2-1+cuda8.0 \
     wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \
     curl sed grep graphviz libjpeg-dev zlib1g-dev  \
     python-matplotlib gcc-4.8 g++-4.8 \
-    automake locales clang-format swig doxygen cmake  \
+    automake locales clang-format swig cmake  \
     liblapack-dev liblapacke-dev \
     clang-3.8 llvm-3.8 libclang-3.8-dev \
     net-tools libtool ccache && \
@@ -70,14 +70,16 @@ RUN localedef -i en_US -f UTF-8 en_US.UTF-8
 # specify sphinx version as 1.5.6 and remove -U option for [pip install -U
 # sphinx-rtd-theme] since -U option will cause sphinx being updated to newest
 # version(1.7.1 for now), which causes building documentation failed.
-RUN pip install --upgrade pip==9.0.3 && \
+RUN easy_install -U pip && \
     pip install -U wheel && \
     pip install -U docopt PyYAML sphinx==1.5.6 && \
     pip install sphinx-rtd-theme==0.1.9 recommonmark
 
 RUN pip install pre-commit 'ipython==5.3.0' && \
-    pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
-    pip install opencv-python
+    pip install 'ipykernel==4.6.0' 'jupyter==1.0.0'
+
+#For docstring checker
+RUN pip install pylint pytest astroid isort
 
 COPY ./python/requirements.txt /root/
 RUN pip install -r /root/requirements.txt
@@ -101,6 +103,3 @@ RUN echo 'root:root' | chpasswd
 RUN sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config
 RUN sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config
 EXPOSE 22
-
-# development image default do build work
-CMD ["bash", "/paddle/paddle/scripts/docker/build.sh"]
diff --git a/Dockerfile.android b/Dockerfile.android
index 848a7eba6f1421432addae8acff407b611adb4ae..48db2efea21a648657e3f490c95429b9a29ede52 100644
--- a/Dockerfile.android
+++ b/Dockerfile.android
@@ -40,5 +40,3 @@ RUN mkdir -p ${ANDROID_TOOLCHAINS_DIR} && \
     unzip -q android-ndk-r14b-linux-x86_64.zip && \
     mv android-ndk-r14b ${ANDROID_NDK_HOME} && \
     rm -rf /opt/android-ndk-tmp
-
-CMD ["bash", "/paddle/paddle/scripts/docker/build_android.sh"]
diff --git a/README.md b/README.md
index a3b13fe79cc33927e5cc3e091926b111688a941b..8d89c6b1ec9e4aefbd64328dedb4e8c7cc50c21b 100644
--- a/README.md
+++ b/README.md
@@ -62,9 +62,9 @@ Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddl
 ## Installation
 
 It is recommended to check out the
-[Docker installation guide](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/docker_install_en.html)
+[Docker installation guide](http://www.paddlepaddle.org/docs/develop/documentation/fluid/en/build_and_install/docker_install_en.html)
 before looking into the
-[build from source guide](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/build_from_source_en.html).
+[build from source guide](http://www.paddlepaddle.org/docs/develop/documentation/fluid/en/build_and_install/build_from_source_en.html).
 
 ## Documentation
 
diff --git a/benchmark/.gitignore b/benchmark/.gitignore
index 7b66e8a5b5020fd847982db401665d24ba3a069c..fb4114356d4f37efc8ad672316fd4f99443d9fcd 100644
--- a/benchmark/.gitignore
+++ b/benchmark/.gitignore
@@ -7,3 +7,6 @@ paddle/rnn/imdb.pkl
 caffe/image/logs
 tensorflow/image/logs
 tensorflow/rnn/logs
+fluid/models/*.pyc
+fluid/logs
+fluid/nohup.out
diff --git a/benchmark/cluster/README.md b/benchmark/cluster/README.md
deleted file mode 100644
index 64816098a524f064ec12474a736cd4c721227a70..0000000000000000000000000000000000000000
--- a/benchmark/cluster/README.md
+++ /dev/null
@@ -1,196 +0,0 @@
-# Cluster Training Benchmark
-
-## Setup
-
-- Platform
-  - Kubernetes: v1.6.2
-  - Linux Kernel: v3.10.0
-
-- Resource
-  - CPU: 10 Cores per Pod
-  - Memory: 5GB per Pod
-
-- Docker Image
-
-  We use different base Docker Image to run the benchmark on Kubernetes:
-  - PaddlePaddle v2: paddlepaddle/paddle:0.11.0
-  - PaddlePaddle Fluid: paddlepaddle/paddle:[commit-id]
-  - TensorFlow: tensorflow/tensorflow:1.5.0-rc0
-
-- Model
-  vgg16 is used in this benchmark.
-
-## Cases
-
-- Variable
-  - Batch Size of training data.
-  - PServer count of the training job.
-  - The number of trainers.
-
-- Invariant
-  - The resource of trainer/pserver Pod.
-
-### Measure the Performance for Different Batch Size
-
-- PServer Count: 40
-- Trainer Count: 100
-- Metrics: mini-batch / sec
-
-
-<table>
-<thead>
-<tr>
-<th>Batch Size </th>
-<th> 32</th>
-<th>64</th>
-<th>128 </th>
-<th>256</th>
-</tr>
-</thead>
-<tbody>
-<tr>
-<td> PaddlePaddle Fluid</td>
-<td>-</td>
-<td>- </td>
-<td>-  </td>
-<td>- </td>
-</tr>
-<tr>
-<td>PaddlePaddle v2  </td>
-<td>-  </td>
-<td>- </td>
-<td>-  </td>
-<td>- </td>
-</tr>
-<tr>
-<td>TensorFlow </td>
-<td>-  </td>
-<td>- </td>
-<td>-  </td>
-<td>- </td>
-</tr>
-</tbody>
-</table>
-
-### Measure the Performance for Different PServer Count
-
-- Trainer Count: 100
-- Batch Size: 64
-- Metrics: mini-batch / sec
-
-
-<table>
-<thead>
-<tr>
-<th>PServer Count  </th>
-<th>10</th>
-<th>20</th>
-<th>40 </th>
-<th>60</th>
-</tr>
-</thead>
-<tbody>
-<tr>
-<td> PaddlePaddle Fluid</td>
-<td>-</td>
-<td>- </td>
-<td>-  </td>
-<td>- </td>
-</tr>
-<tr>
-<td>PaddlePaddle v2  </td>
-<td>-  </td>
-<td>- </td>
-<td>-  </td>
-<td>- </td>
-</tr>
-<tr>
-<td>TensorFlow </td>
-<td>-  </td>
-<td>- </td>
-<td>-  </td>
-<td>- </td>
-</tr>
-</tbody>
-</table>
-
-### Measure Parallel Efficiency By Increasing Trainer Count
-
-- PServer Count: 20
-- Batch Size: 64
-- Metrics:
-
-$S = \div(T1, TN)$
-
-which S is the ratio of T1 over TN, training time of 1 and N trainers.
-The parallel efficiency is:
-
-$E = \div(S, N)$
-
-<table>
-<thead>
-<tr>
-<th>Trainer Counter  </th>
-<th>1</th>
-<th>10</th>
-<th>20 </th>
-<th>30</th>
-<th>40</th>
-<th>50</th>
-<th>60 </th>
-<th>70</th>
-<th>80</th>
-<th>90</th>
-<th>100 </th>
-</tr>
-</thead>
-<tbody>
-<tr>
-<td> PaddlePaddle Fluid</td>
-<td>-</td>
-<td>- </td>
-<td>- </td>
-<td>- </td>
-<td>-</td>
-<td>- </td>
-<td>- </td>
-<td>- </td>
-<td>-</td>
-<td>- </td>
-<td>- </td>
-</tr>
-<tr>
-<td>PaddlePaddle v2  </td>
-<td>-  </td>
-<td>- </td>
-<td>-  </td>
-<td>- </td>
-<td>-</td>
-<td>- </td>
-<td>- </td>
-<td>- </td>
-<td>-</td>
-<td>- </td>
-<td>- </td>
-</tr>
-<tr>
-<td>TensorFlow </td>
-<td>-  </td>
-<td>- </td>
-<td>-  </td>
-<td>- </td>
-<td>-</td>
-<td>- </td>
-<td>- </td>
-<td>- </td>
-<td>-</td>
-<td>- </td>
-<td>- </td>
-</tr>
-</tbody>
-</table>
-
-
-## Reproduce the benchmark
-
-TODO
diff --git a/benchmark/cluster/vgg16/Dockerfile b/benchmark/cluster/vgg16/Dockerfile
deleted file mode 100644
index 13ad8e1b6237e6f41a076c4fb54311728832ae33..0000000000000000000000000000000000000000
--- a/benchmark/cluster/vgg16/Dockerfile
+++ /dev/null
@@ -1,35 +0,0 @@
-FROM nvidia/cuda:8.0-cudnn5-runtime-ubuntu16.04
-
-# you can get mirror list here:
-# https://launchpad.net/ubuntu/+archivemirrors
-ARG UBUNTU_MIRROR
-RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com/ubuntu#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi'
-
-RUN apt-get update && apt-get install -y python python-dev python-pip iputils-ping libgtk2.0-dev
-RUN pip install -U kubernetes opencv-python
-
-RUN pip install paddlepaddle
-# if network is slowly, you may need to add proxy here.
-# ENV https_proxy=
-RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.cifar.train10()" | python'
-RUN pip uninstall -y paddlepaddle
-# unset proxy if it is setted.
-# ENV https_proxy=""
-
-# NOTE: By default CI built wheel packages turn WITH_DISTRIBUTE=OFF,
-#       so we must build one with distribute support to install in this image.
-ADD *.whl /
-RUN pip install /*.whl && rm -f /*.whl
-ENV LD_LIBRARY_PATH=/usr/local/lib
-
-# tf k8s
-RUN pip install tensorflow==1.4.0
-ADD tf_k8s /usr/bin
-RUN chmod +x /usr/bin/tf_k8s
-ADD vgg16_tf.py /workspace/
-
-# below lines may change a lot for debugging
-ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin
-ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/k8s_tools.py /root
-RUN chmod +x /usr/bin/paddle_k8s
-ADD vgg16_fluid.py vgg16_v2.py /workspace/
diff --git a/benchmark/cluster/vgg16/README.md b/benchmark/cluster/vgg16/README.md
deleted file mode 100644
index d56a912b9b03986e32693363f82df05a34b779e9..0000000000000000000000000000000000000000
--- a/benchmark/cluster/vgg16/README.md
+++ /dev/null
@@ -1,195 +0,0 @@
-# Performance for Distributed vgg16
-
-## Test Result
-
-### Hardware Infomation
-
-- CPU: Intel(R) Xeon(R) CPU E5-2620 v4 @ 2.10GHz
-- cpu MHz		: 2101.000
-- cache size	: 20480 KB
-
-### Blas settings
-
-Setting environment variable: `MKL_NUM_THREADS=1`.
-
-### Single Node Single Thread
-
-- Metrics: samples / sec
-
-<table>
-<thead>
-<tr>
-<th>Batch Size </th>
-<th> 32</th>
-<th>64</th>
-<th>128 </th>
-<th>256</th>
-</tr>
-</thead>
-<tbody>
-<tr>
-<td> PaddlePaddle Fluid</td>
-<td> 15.44 </td>
-<td> 16.32 </td>
-<td> 16.74 </td>
-<td> 16.79 </td>
-</tr>
-<tr>
-<td>PaddlePaddle v2  </td>
-<td> 15.97 </td>
-<td> 17.04 </td>
-<td> 17.60 </td>
-<td> 17.83 </td>
-</tr>
-<tr>
-<td>TensorFlow </td>
-<td> 9.09 </td>
-<td> 9.10 </td>
-<td> 9.24 </td>
-<td> 8.66 </td>
-</tr>
-</tbody>
-</table>
-
-
-### Different Batch Size
-
-- PServer Count: 10
-- Trainer Count: 20
-- Metrics: samples / sec
-
-<table>
-<thead>
-<tr>
-<th>Batch Size </th>
-<th> 32</th>
-<th>64</th>
-<th>128 </th>
-<th>256</th>
-</tr>
-</thead>
-<tbody>
-<tr>
-<td> PaddlePaddle Fluid</td>
-<td> 190.20 </td>
-<td> 222.15 </td>
-<td> 247.40 </td>
-<td> 258.18 </td>
-</tr>
-<tr>
-<td>PaddlePaddle v2  </td>
-<td> 170.96 </td>
-<td> 233.71 </td>
-<td> 256.14 </td>
-<td> 329.23 </td>
-</tr>
-<tr>
-<td>TensorFlow </td>
-<td> - </td>
-<td> - </td>
-<td> - </td>
-<td> - </td>
-</tr>
-</tbody>
-</table>
-
-### Accelerate Rate
-
-- Pserver Count: 20
-- Batch Size: 128
-- Metrics: samples / sec
-
-<table>
-<thead>
-<tr>
-<th>Trainer Count </th>
-<th>20</th>
-<th>40</th>
-<th>80</th>
-<th>100</th>
-</tr>
-</thead>
-<tbody>
-<tr>
-<td> PaddlePaddle Fluid</td>
-<td> 263.29 (78.64%) </td>
-<td> 518.80 (77.47%) </td>
-<td> 836.26 (62.44%) </td>
-<td> 1019.29 (60.89%) </td>
-</tr>
-<tr>
-<td>PaddlePaddle v2 (need more tests)   </td>
-<td> 326.85 (92.85%) </td>
-<td> 534.58 (75.93%) </td>
-<td> 853.30 (60.60%) </td>
-<td> 1041.99 (59.20%) </td>
-</tr>
-<tr>
-<td>TensorFlow </td>
-<td> - </td>
-<td> - </td>
-<td> - </td>
-<td> - </td>
-</tr>
-</tbody>
-</table>
-
-
-### Different Pserver Count
-
-- Trainer Count: 60
-- Batch Size: 128
-- Metrics: samples/ sec
-
-<table>
-<thead>
-<tr>
-<th>PServer Count </th>
-<th>3</th>
-<th>6</th>
-<th>10</th>
-<th>20</th>
-</tr>
-</thead>
-<tbody>
-<tr>
-<td> PaddlePaddle Fluid(should fix in next PR) </td>
-<td> 589.1 </td>
-<td> 592.6 </td>
-<td> 656.4 </td>
-<td> 655.8 </td>
-</tr>
-<tr>
-<td>PaddlePaddle v2 (need more tests)   </td>
-<td> 593.4 </td>
-<td> 791.3 </td>
-<td> 729.7 </td>
-<td> 821.7 </td>
-</tr>
-<tr>
-<td>TensorFlow </td>
-<td> - </td>
-<td> - </td>
-<td> - </td>
-<td> - </td>
-</tr>
-</tbody>
-</table>
-
-
-*The performance gap between Fuild and v2 comes from the network interference.*
-
-
-## Steps to Run the Performance Test
-
-1. You must re-compile PaddlePaddle and enable `-DWITH_DISTRIBUTE` to build PaddlePaddle with distributed support.
-1. When the build finishes, copy the output `whl` package located under `build/python/dist` to current directory.
-1. Run `docker build -t [image:tag] .` to build the docker image and run `docker push [image:tag]` to push the image to reponsitory so kubernetes can find it.
-1. Run `kubectl create -f pserver.yaml && kubectl create -f trainer.yaml` to start the job on your kubernetes cluster (you must configure the `kubectl` client before this step).
-1. Run `kubectl get po` to get running pods, and run `kubectl logs [podID]` to fetch the pod log of pservers and trainers.
-
-Check the logs for the distributed training progress and analyze the performance.
-
-## Enable Verbos Logs
-
-Edit `pserver.yaml` and `trainer.yaml` and add an environment variable `GLOG_v=3` and `GLOG_logtostderr=1` to see what happend in detail.
diff --git a/benchmark/cluster/vgg16/fluid_pserver.yaml b/benchmark/cluster/vgg16/fluid_pserver.yaml
deleted file mode 100644
index ee8b0763b62fc011f40f6197e929a68b48a93e47..0000000000000000000000000000000000000000
--- a/benchmark/cluster/vgg16/fluid_pserver.yaml
+++ /dev/null
@@ -1,72 +0,0 @@
-apiVersion: extensions/v1beta1
-kind: ReplicaSet
-metadata:
-  name: vgg16job-pserver
-spec:
-  replicas: 10
-  template:
-    metadata:
-      labels:
-        paddle-job-pserver: vgg16job
-    spec:
-      hostNetwork: true
-      imagePullSecrets:
-      - name: job-registry-secret
-      containers:
-      - name: pserver
-        image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
-        imagePullPolicy: Always
-        ports:
-        - name: jobport-30236
-          containerPort: 30236
-        env:
-        - name: PADDLE_JOB_NAME
-          value: vgg16job
-        - name: MKL_NUM_THREADS
-          value: "1"
-        - name: TRAINING_ROLE
-          value: "PSERVER"
-        - name: TRAINERS
-          value: "20"
-        - name: PSERVERS
-          value: "10"
-        - name: TOPOLOGY
-          value: ""
-        - name: ENTRY
-          value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0"
-        - name: TRAINER_PACKAGE
-          value: "/workspace"
-        - name: PADDLE_INIT_PORT
-          value: "30236"
-        - name: PADDLE_INIT_NICS
-          value: "xgbe0"
-        - name: PADDLE_INIT_TRAINER_COUNT
-          value: "1"
-        - name: PADDLE_INIT_PORTS_NUM
-          value: "1"
-        - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
-          value: "1"
-        - name: PADDLE_INIT_NUM_GRADIENT_SERVERS
-          value: "20"
-        - name: PADDLE_INIT_NUM_PASSES
-          value: "1"
-        - name: PADDLE_INIT_USE_GPU
-          value: "0"
-        - name: LD_LIBRARY_PATH
-          value: "/usr/local/lib:/usr/local/nvidia/lib64"
-        - name: NAMESPACE
-          valueFrom:
-            fieldRef:
-              fieldPath: "metadata.namespace"
-        - name: POD_IP
-          valueFrom:
-            fieldRef:
-              fieldPath: "status.podIP"
-        command: ["paddle_k8s", "start_fluid"]
-        resources:
-          requests:
-            memory: 10Gi
-            cpu: 4
-          limits:
-            memory: 10Gi
-            cpu: 4
diff --git a/benchmark/cluster/vgg16/fluid_trainer.yaml b/benchmark/cluster/vgg16/fluid_trainer.yaml
deleted file mode 100644
index 3d56caac009464d1073423bb63abff1f8b0cf28f..0000000000000000000000000000000000000000
--- a/benchmark/cluster/vgg16/fluid_trainer.yaml
+++ /dev/null
@@ -1,69 +0,0 @@
-apiVersion: batch/v1
-kind: Job
-metadata:
-  name: vgg16job-trainer
-spec:
-  parallelism: 20
-  completions: 20
-  template:
-    metadata:
-      labels:
-        paddle-job: vgg16job
-    spec:
-      imagePullSecrets:
-      - name: job-registry-secret
-      hostNetwork: true
-      containers:
-      - name: trainer
-        image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
-        imagePullPolicy: Always
-        command: ["paddle_k8s", "start_fluid"]
-        env:
-        - name: PADDLE_JOB_NAME
-          value: vgg16job
-        - name: TRAINING_ROLE
-          value: "TRAINER"
-        - name: TRAINERS
-          value: "20"
-        - name: PSERVERS
-          value: "10"
-        - name: TOPOLOGY
-          value: ""
-        - name: ENTRY
-          value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0 --batch_size 128"
-        - name: TRAINER_PACKAGE
-          value: "/workspace"
-        - name: PADDLE_INIT_PORT
-          value: "30236"
-        - name: PADDLE_INIT_NICS
-          value: "xgbe0"
-        - name: PADDLE_INIT_TRAINER_COUNT
-          value: "1"
-        - name: PADDLE_INIT_PORTS_NUM
-          value: "1"
-        - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
-          value: "1"
-        - name: PADDLE_INIT_NUM_GRADIENT_SERVERS
-          value: "20"
-        - name: PADDLE_INIT_NUM_PASSES
-          value: "1"
-        - name: PADDLE_INIT_USE_GPU
-          value: "0"
-        - name: LD_LIBRARY_PATH
-          value: "/usr/local/lib:/usr/local/nvidia/lib64"
-        - name: NAMESPACE
-          valueFrom:
-            fieldRef:
-              fieldPath: "metadata.namespace"
-        - name: POD_IP
-          valueFrom:
-            fieldRef:
-              fieldPath: "status.podIP"
-        resources:
-          requests:
-            memory: 40Gi
-            cpu: 2
-          limits:
-            memory: 40Gi
-            cpu: 2
-      restartPolicy: Never
diff --git a/benchmark/cluster/vgg16/run_vgg_dist.sh b/benchmark/cluster/vgg16/run_vgg_dist.sh
deleted file mode 100644
index 8c0501439e9d5fa175f5aa9b62d286e690a10904..0000000000000000000000000000000000000000
--- a/benchmark/cluster/vgg16/run_vgg_dist.sh
+++ /dev/null
@@ -1,21 +0,0 @@
-#!/bin/bash
-
-# Update to point to the source file.
-VGG_SRC="vgg16_fluid.py"
-
-export TRAINING_ROLE=PSERVER
-export TRAINERS=2
-export POD_IP=127.0.0.1
-export PADDLE_INIT_PORT=6174
-MKL_NUM_THREADS=1 python -u ${VGG_SRC} --local 0 --ps_host=127.0.0.1:6174 --trainer_hosts=127.0.0.1:6174 &
-
-# Need to wait for the ps to start first.
-sleep 10
-echo "done start ps"
-
-export TRAINING_ROLE=TRAINER
-export TRAINERS=2
-export POD_IP=127.0.0.1
-export PADDLE_INIT_PORT=6174
-CUDA_VISIBLE_DEVICES=4 MKL_NUM_THREADS=1 python -u ${VGG_SRC} --local 0 --ps_host=127.0.0.1:6174 --trainer_hosts=127.0.0.1:6174 --device=GPU --task_index=0 &
-CUDA_VISIBLE_DEVICES=5 MKL_NUM_THREADS=1 python -u ${VGG_SRC} --local 0 --ps_host=127.0.0.1:6174 --trainer_hosts=127.0.0.1:6174 --device=GPU --task_index=1 &
diff --git a/benchmark/cluster/vgg16/tf_k8s b/benchmark/cluster/vgg16/tf_k8s
deleted file mode 100644
index 4fc263d5f681aeabfa71f1758714d269d987b272..0000000000000000000000000000000000000000
--- a/benchmark/cluster/vgg16/tf_k8s
+++ /dev/null
@@ -1,82 +0,0 @@
-#!/bin/bash
-check_trainer_ret() {
-  ret=$1
-  stdbuf -oL echo "job returned $ret...setting pod return message..."
-  stdbuf -oL echo "==============================="
-
-  if [ $ret -eq 136 ] ; then
-    echo "Error Arithmetic Operation(Floating Point Exception)" > /dev/termination-log
-  elif [ $ret -eq 139 ] ; then
-    echo "Segmentation Fault" > /dev/termination-log
-  elif [ $ret -eq 1 ] ; then
-    echo "General Error" > /dev/termination-log
-  elif [ $ret -eq 134 ] ; then
-    echo "Program Abort" > /dev/termination-log
-  fi
-  stdbuf -oL echo "termination log wroted..."
-  exit $ret
-}
-
-g_pservers=""
-g_trainers=""
-
-wait_running_pods(){
-  pserver_label="tf-job-pserver=${JOB_NAME}"
-  trainer_label="tf-job-trainer=${JOB_NAME}"
-
-  stdbuf -oL python /root/k8s_tools.py wait_pods_running ${pserver_label} ${PSERVERS_NUM}
-  stdbuf -oL python /root/k8s_tools.py wait_pods_running ${trainer_label} ${TRAINERS_NUM}
-
-  g_pservers=$(python /root/k8s_tools.py fetch_endpoints ${pserver_label} ${PORT})
-  g_trainers=$(python /root/k8s_tools.py fetch_endpoints ${trainer_label} ${PORT})
-}
-
-start_tf_pserver(){
-  wait_running_pods
-
-  label="tf-job-pserver=${JOB_NAME}"
-  pserver_id=$(python /root/k8s_tools.py fetch_id ${label})
-
-  cmd="${ENTRY} --ps_hosts=${g_pservers} --worker_hosts=${g_trainers} \
-  --job_name=${TF_JOB_NAME} --task_index=${pserver_id}"
-
-  stdbuf -oL sh -c "cd ${TRAINER_PACKAGE} && ${cmd}"
-}
-
-start_tf_trainer(){
-  wait_running_pods
-
-  label="tf-job-trainer=${JOB_NAME}"
-  trainer_id=$(python /root/k8s_tools.py fetch_id ${label})
-
-  cmd="${ENTRY} --ps_hosts=${g_pservers} --worker_hosts=${g_trainers} \
-  --job_name=${TF_JOB_NAME} --task_index=${trainer_id} --batch_size=${BATCH_SIZE}"
-
-  stdbuf -oL sh -c "cd ${TRAINER_PACKAGE} && ${cmd}"
-  check_trainer_ret $?
-}
-
-start_tf(){
-    if [[ "${TF_JOB_NAME}" == "worker" ]]; then
-        start_tf_trainer
-    else
-        start_tf_pserver
-    fi
-}
-
-usage() {
-    echo "usage: tf_k8s [<args>]:"
-    echo "  start_tf         Start tensorflow jobs"
-}
-
-case "$1" in
-    start_tf)
-        start_tf
-        ;;
-    --help)
-        usage
-        ;;
-    *)
-        usage
-        ;;
-esac
diff --git a/benchmark/cluster/vgg16/tf_pserver.yaml b/benchmark/cluster/vgg16/tf_pserver.yaml
deleted file mode 100644
index 5e37c700819119c8af05c40fe4b8d13911efc3e1..0000000000000000000000000000000000000000
--- a/benchmark/cluster/vgg16/tf_pserver.yaml
+++ /dev/null
@@ -1,56 +0,0 @@
-apiVersion: extensions/v1beta1
-kind: ReplicaSet
-metadata:
-  name: vgg16job-tf-pserver
-spec:
-  replicas: 10
-  template:
-    metadata:
-      labels:
-        tf-job-pserver: vgg16job-tf
-    spec:
-      hostNetwork: true
-      imagePullSecrets:
-      - name: job-registry-secret
-      containers:
-      - name: pserver
-        image: "registry.baidu.com/paddlepaddle/fluid_benchmark_tf:vgg16"
-        imagePullPolicy: Always
-        command: ["tf_k8s", "start_tf"]
-        ports:
-        - name: jobport-30236
-          containerPort: 30236
-        env:
-        - name: PORT
-          value: "32036"
-        - name: ENTRY
-          value: "python vgg16_tf.py"
-        - name: JOB_NAME
-          value: vgg16job-tf
-        - name: PSERVERS_NUM
-          value: "10"
-        - name: TF_JOB_NAME 
-          value: "ps"
-        - name: TRAINERS_NUM
-          value: "20"
-        - name: BATCH_SIZE
-          value: "128"
-        - name: TRAINER_PACKAGE
-          value: "/workspace"
-        - name: NUM_PASSES
-          value: "1"
-        - name: NAMESPACE
-          valueFrom:
-            fieldRef:
-              fieldPath: "metadata.namespace"
-        - name: POD_IP
-          valueFrom:
-            fieldRef:
-              fieldPath: "status.podIP"
-        resources:
-          requests:
-            memory: 10Gi
-            cpu: 4
-          limits:
-            memory: 10Gi
-            cpu: 4
diff --git a/benchmark/cluster/vgg16/tf_trainer.yaml b/benchmark/cluster/vgg16/tf_trainer.yaml
deleted file mode 100644
index 08795df3addfa7b618db24a65e57be190e268f06..0000000000000000000000000000000000000000
--- a/benchmark/cluster/vgg16/tf_trainer.yaml
+++ /dev/null
@@ -1,58 +0,0 @@
-apiVersion: batch/v1
-kind: Job
-metadata:
-  name: vgg16job-tf-trainer
-spec:
-  parallelism: 20
-  completions: 20
-  template:
-    metadata:
-      labels:
-        tf-job-trainer: vgg16job-tf
-    spec:
-      imagePullSecrets:
-      - name: job-registry-secret
-      hostNetwork: true
-      containers:
-      - name: trainer
-        image: "registry.baidu.com/paddlepaddle/fluid_benchmark_tf:vgg16"
-        imagePullPolicy: Always
-        command: ["tf_k8s", "start_tf"]
-        ports:
-        - name: jobport-30236
-          containerPort: 30236
-        env:
-        - name: PORT
-          value: "32036"
-        - name: JOB_NAME
-          value: vgg16job-tf
-        - name: TF_JOB_NAME 
-          value: "worker"
-        - name: ENTRY
-          value: "python vgg16_tf.py"
-        - name: PSERVERS_NUM
-          value: "10"
-        - name: BATCH_SIZE
-          value: "128"
-        - name: TRAINERS_NUM
-          value: "20"
-        - name: TRAINER_PACKAGE
-          value: "/workspace"
-        - name: NUM_PASSES
-          value: "1"
-        - name: NAMESPACE
-          valueFrom:
-            fieldRef:
-              fieldPath: "metadata.namespace"
-        - name: POD_IP
-          valueFrom:
-            fieldRef:
-              fieldPath: "status.podIP"
-        resources:
-          requests:
-            memory: 40Gi
-            cpu: 2
-          limits:
-            memory: 40Gi
-            cpu: 2
-      restartPolicy: Never
diff --git a/benchmark/cluster/vgg16/v2_pserver.yaml b/benchmark/cluster/vgg16/v2_pserver.yaml
deleted file mode 100644
index dd1271e0cf399184134c06b3200ee1202c65cef0..0000000000000000000000000000000000000000
--- a/benchmark/cluster/vgg16/v2_pserver.yaml
+++ /dev/null
@@ -1,64 +0,0 @@
-apiVersion: extensions/v1beta1
-kind: ReplicaSet
-metadata:
-  name: vgg16v2job-pserver
-spec:
-  replicas: 10
-  template:
-    metadata:
-      labels:
-        paddle-job-pserver: vgg16v2job
-    spec:
-      hostNetwork: true
-      imagePullSecrets:
-      - name: job-registry-secret
-      containers:
-      - name: pserver
-        image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
-        imagePullPolicy: Always
-        ports:
-        - name: jobport-30236
-          containerPort: 30236
-        env:
-        - name: PADDLE_JOB_NAME
-          value: vgg16v2job
-        - name: TRAINERS
-          value: "20"
-        - name: PSERVERS
-          value: "10"
-        - name: TOPOLOGY
-          value: ""
-        - name: ENTRY
-          value: "python train.py"
-        - name: TRAINER_PACKAGE
-          value: "/workspace"
-        - name: PADDLE_INIT_PORT
-          value: "30236"
-        - name: PADDLE_INIT_NICS
-          value: "xgbe0"
-        - name: PADDLE_INIT_TRAINER_COUNT
-          value: "1"
-        - name: PADDLE_INIT_PORTS_NUM
-          value: "1"
-        - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
-          value: "1"
-        - name: PADDLE_INIT_NUM_GRADIENT_SERVERS
-          value: "20"
-        - name: PADDLE_INIT_NUM_PASSES
-          value: "1"
-        - name: PADDLE_INIT_USE_GPU
-          value: "0"
-        - name: LD_LIBRARY_PATH
-          value: "/usr/local/lib:/usr/local/nvidia/lib64"
-        - name: NAMESPACE
-          valueFrom:
-            fieldRef:
-              fieldPath: "metadata.namespace"
-        command: ["paddle_k8s", "start_pserver"]
-        resources:
-          requests:
-            memory: 10Gi
-            cpu: 4
-          limits:
-            memory: 10Gi
-            cpu: 4
diff --git a/benchmark/cluster/vgg16/v2_trainer.yaml b/benchmark/cluster/vgg16/v2_trainer.yaml
deleted file mode 100644
index 12c8964066cbcfe8d2a44de2f51a3d12ea422fe2..0000000000000000000000000000000000000000
--- a/benchmark/cluster/vgg16/v2_trainer.yaml
+++ /dev/null
@@ -1,65 +0,0 @@
-apiVersion: batch/v1
-kind: Job
-metadata:
-  name: vgg16v2job-trainer
-spec:
-  parallelism: 20
-  completions: 20
-  template:
-    metadata:
-      labels:
-        paddle-job: vgg16v2job
-    spec:
-      imagePullSecrets:
-        - name: job-registry-secret
-      hostNetwork: true
-      containers:
-      - name: trainer
-        image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
-        imagePullPolicy: Always
-        command: ["paddle_k8s", "start_trainer", "v2"]
-        env:
-        - name: PADDLE_JOB_NAME
-          value: vgg16v2job
-        - name: BATCH_SIZE
-          value: "256"
-        - name: TRAINERS
-          value: "20"
-        - name: PSERVERS
-          value: "10"
-        - name: TOPOLOGY
-          value: ""
-        - name: ENTRY
-          value: "cd /workspace && MKL_NUM_THREADS=1 python /workspace/vgg16_v2.py"
-        - name: TRAINER_PACKAGE
-          value: "/workspace"
-        - name: PADDLE_INIT_PORT
-          value: "30236"
-        - name: PADDLE_INIT_NICS
-          value: "xgbe0"
-        - name: PADDLE_INIT_TRAINER_COUNT
-          value: "1"
-        - name: PADDLE_INIT_PORTS_NUM
-          value: "1"
-        - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
-          value: "1"
-        - name: PADDLE_INIT_NUM_GRADIENT_SERVERS
-          value: "20"
-        - name: PADDLE_INIT_NUM_PASSES
-          value: "2"
-        - name: PADDLE_INIT_USE_GPU
-          value: "0"
-        - name: LD_LIBRARY_PATH
-          value: "/usr/local/lib:/usr/local/nvidia/lib64"
-        - name: NAMESPACE
-          valueFrom:
-            fieldRef:
-              fieldPath: "metadata.namespace"
-        resources:
-          requests:
-            memory: 40Gi
-            cpu: 2
-          limits:
-            memory: 40Gi
-            cpu: 2
-      restartPolicy: Never
diff --git a/benchmark/cluster/vgg16/vgg16_fluid.py b/benchmark/cluster/vgg16/vgg16_fluid.py
deleted file mode 100644
index 05b5f3977cbed2f08df73c6d8ba2fff687db3313..0000000000000000000000000000000000000000
--- a/benchmark/cluster/vgg16/vgg16_fluid.py
+++ /dev/null
@@ -1,308 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""VGG16 benchmark in Fluid"""
-from __future__ import print_function
-
-import sys
-import time
-import numpy as np
-import paddle.v2 as paddle
-import paddle.fluid as fluid
-import paddle.fluid.core as core
-import paddle.fluid.profiler as profiler
-import argparse
-import functools
-import os
-from paddle.fluid import debuger
-
-
-def str2bool(v):
-    if v.lower() in ('yes', 'true', 't', 'y', '1'):
-        return True
-    elif v.lower() in ('no', 'false', 'f', 'n', '0'):
-        return False
-    else:
-        raise argparse.ArgumentTypeError('Boolean value expected.')
-
-
-parser = argparse.ArgumentParser(description=__doc__)
-parser.add_argument(
-    '--batch_size', type=int, default=128, help="Batch size for training.")
-parser.add_argument(
-    '--learning_rate',
-    type=float,
-    default=1e-3,
-    help="Learning rate for training.")
-parser.add_argument('--num_passes', type=int, default=50, help="No. of passes.")
-parser.add_argument(
-    '--device',
-    type=str,
-    default='CPU',
-    choices=['CPU', 'GPU'],
-    help="The device type.")
-parser.add_argument('--device_id', type=int, default=0, help="The device id.")
-parser.add_argument(
-    '--data_format',
-    type=str,
-    default='NCHW',
-    choices=['NCHW', 'NHWC'],
-    help='The data order, now only support NCHW.')
-parser.add_argument(
-    '--data_set',
-    type=str,
-    default='cifar10',
-    choices=['cifar10', 'flowers'],
-    help='Optional dataset for benchmark.')
-parser.add_argument(
-    '--local',
-    type=str2bool,
-    default=True,
-    help='Whether to run as local mode.')
-
-parser.add_argument(
-    "--ps_hosts",
-    type=str,
-    default="",
-    help="Comma-separated list of hostname:port pairs")
-parser.add_argument(
-    "--trainer_hosts",
-    type=str,
-    default="",
-    help="Comma-separated list of hostname:port pairs")
-parser.add_argument(
-    "--profile", action='store_true', help="If set, profile a few steps.")
-
-# Flags for defining the tf.train.Server
-parser.add_argument(
-    "--task_index", type=int, default=0, help="Index of task within the job")
-args = parser.parse_args()
-
-
-def vgg16_bn_drop(input):
-    def conv_block(input, num_filter, groups, dropouts):
-        return fluid.nets.img_conv_group(
-            input=input,
-            pool_size=2,
-            pool_stride=2,
-            conv_num_filter=[num_filter] * groups,
-            conv_filter_size=3,
-            conv_act='relu',
-            conv_with_batchnorm=True,
-            conv_batchnorm_drop_rate=dropouts,
-            pool_type='max')
-
-    conv1 = conv_block(input, 64, 2, [0.3, 0])
-    conv2 = conv_block(conv1, 128, 2, [0.4, 0])
-    conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0])
-    conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0])
-    conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0])
-
-    drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5)
-    fc1 = fluid.layers.fc(input=drop, size=4096, act=None)
-    bn = fluid.layers.batch_norm(input=fc1, act='relu')
-    drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5)
-    fc2 = fluid.layers.fc(input=drop2, size=4096, act=None)
-    return fc2
-
-
-def main():
-    if args.data_set == "cifar10":
-        classdim = 10
-        if args.data_format == 'NCHW':
-            data_shape = [3, 32, 32]
-        else:
-            data_shape = [32, 32, 3]
-    else:
-        classdim = 102
-        if args.data_format == 'NCHW':
-            data_shape = [3, 224, 224]
-        else:
-            data_shape = [224, 224, 3]
-
-    # Input data
-    images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
-    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-
-    # Train program
-    net = vgg16_bn_drop(images)
-    predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
-    cost = fluid.layers.cross_entropy(input=predict, label=label)
-    avg_cost = fluid.layers.mean(x=cost)
-
-    # Evaluator
-    batch_size = fluid.layers.create_tensor(dtype='int64')
-    batch_acc = fluid.layers.accuracy(
-        input=predict, label=label, total=batch_size)
-
-    # inference program
-    inference_program = fluid.default_main_program().clone()
-    with fluid.program_guard(inference_program):
-        inference_program = fluid.io.get_inference_program(batch_acc)
-
-    # Optimization
-    optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
-    optimize_ops, params_grads = optimizer.minimize(avg_cost)
-
-    # Initialize executor
-    place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(
-        args.device_id)
-    exe = fluid.Executor(place)
-
-    # test
-    def test(exe):
-        test_pass_acc = fluid.average.WeightedAverage()
-        for batch_id, data in enumerate(test_reader()):
-            img_data = np.array(map(lambda x: x[0].reshape(data_shape),
-                                    data)).astype("float32")
-            y_data = np.array(map(lambda x: x[1], data)).astype("int64")
-            y_data = y_data.reshape([-1, 1])
-
-            outs = exe.run(inference_program,
-                           feed={"pixel": img_data,
-                                 "label": y_data},
-                           fetch_list=[batch_acc, batch_size])
-            test_pass_acc.add(value=np.array(outs[0]), weight=np.array(outs[1]))
-
-        return test_pass_acc.eval()
-
-    def train_loop(exe, trainer_prog):
-        iters = 0
-        ts = time.time()
-        train_pass_acc = fluid.average.WeightedAverage()
-        for pass_id in range(args.num_passes):
-            # train
-            start_time = time.time()
-            num_samples = 0
-            train_pass_acc.reset()
-
-            def run_step(batch_id, data):
-                img_data = np.array(
-                    map(lambda x: x[0].reshape(data_shape), data)).astype(
-                        "float32")
-                y_data = np.array(map(lambda x: x[1], data)).astype("int64")
-                y_data = y_data.reshape([-1, 1])
-
-                loss, acc, b_size = exe.run(
-                    trainer_prog,
-                    feed={"pixel": img_data,
-                          "label": y_data},
-                    fetch_list=[avg_cost, batch_acc, batch_size])
-                return loss, acc, b_size
-
-            if args.profile and args.task_index == 0:
-                # warmup.
-                for batch_id, data in enumerate(train_reader()):
-                    if batch_id > 5: break
-                    run_step(batch_id, data)
-                with profiler.profiler('All', 'total', '/tmp/profile_vgg'):
-                    for batch_id, data in enumerate(train_reader()):
-                        if batch_id > 5: break
-                        run_step(batch_id, data)
-
-            for batch_id, data in enumerate(train_reader()):
-                ts = time.time()
-                loss, acc, b_size = run_step(batch_id, data)
-                iters += 1
-                num_samples += len(data)
-                train_pass_acc.add(value=acc, weight=b_size)
-                print(
-                    "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, "
-                    "Speed = %.2f img/s" % (pass_id, iters, loss, acc,
-                                            len(data) / (time.time() - ts))
-                )  # The accuracy is the accumulation of batches, but not the current batch.
-
-            pass_elapsed = time.time() - start_time
-            pass_train_acc = train_pass_acc.eval()
-            pass_test_acc = test(exe)
-            print("Task:%d Pass = %d, Training performance = %f imgs/s, "
-                  "Train accuracy = %f, Test accuracy = %f\n" %
-                  (args.task_index, pass_id, num_samples / pass_elapsed,
-                   pass_train_acc, pass_test_acc))
-
-    if args.local:
-        # Parameter initialization
-        exe.run(fluid.default_startup_program())
-
-        # data reader
-        train_reader = paddle.batch(
-            paddle.reader.shuffle(
-                paddle.dataset.cifar.train10() if args.data_set == 'cifar10'
-                else paddle.dataset.flowers.train(),
-                buf_size=5120),
-            batch_size=args.batch_size)
-        test_reader = paddle.batch(
-            paddle.dataset.cifar.test10()
-            if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
-            batch_size=args.batch_size)
-        train_loop(exe, fluid.default_main_program())
-    else:
-        trainers = int(os.getenv("TRAINERS"))  # total trainer count
-        print("trainers total: ", trainers)
-
-        training_role = os.getenv(
-            "TRAINING_ROLE",
-            "TRAINER")  # get the training role: trainer/pserver
-
-        t = fluid.DistributeTranspiler()
-        t.transpile(
-            trainer_id=args.task_index,
-            pservers=args.ps_hosts,
-            trainers=trainers)
-
-        if training_role == "PSERVER":
-            current_endpoint = os.getenv("POD_IP") + ":" + os.getenv(
-                "PADDLE_INIT_PORT")
-            if not current_endpoint:
-                print("need env SERVER_ENDPOINT")
-                exit(1)
-            pserver_prog = t.get_pserver_program(current_endpoint)
-            pserver_startup = t.get_startup_program(current_endpoint,
-                                                    pserver_prog)
-            exe.run(pserver_startup)
-            exe.run(pserver_prog)
-        elif training_role == "TRAINER":
-            # Parameter initialization
-            exe.run(fluid.default_startup_program())
-
-            # data reader
-            train_reader = paddle.batch(
-                paddle.reader.shuffle(
-                    paddle.dataset.cifar.train10() if args.data_set == 'cifar10'
-                    else paddle.dataset.flowers.train(),
-                    buf_size=5120),
-                batch_size=args.batch_size)
-            test_reader = paddle.batch(
-                paddle.dataset.cifar.test10() if args.data_set == 'cifar10' else
-                paddle.dataset.flowers.test(),
-                batch_size=args.batch_size)
-
-            trainer_prog = t.get_trainer_program()
-            feeder = fluid.DataFeeder(feed_list=[images, label], place=place)
-            # TODO(typhoonzero): change trainer startup program to fetch parameters from pserver
-            exe.run(fluid.default_startup_program())
-            train_loop(exe, trainer_prog)
-        else:
-            print("environment var TRAINER_ROLE should be TRAINER os PSERVER")
-
-
-def print_arguments():
-    print('-----------  Configuration Arguments -----------')
-    for arg, value in sorted(vars(args).iteritems()):
-        print('%s: %s' % (arg, value))
-    print('------------------------------------------------')
-
-
-if __name__ == "__main__":
-    print_arguments()
-    main()
diff --git a/benchmark/cluster/vgg16/vgg16_tf.py b/benchmark/cluster/vgg16/vgg16_tf.py
deleted file mode 100644
index 2d220478acae46566760209dbc012cff316946aa..0000000000000000000000000000000000000000
--- a/benchmark/cluster/vgg16/vgg16_tf.py
+++ /dev/null
@@ -1,366 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""VGG16 benchmark in TensorFlow
-You can get distribution example template structure here:
-https://medium.com/clusterone/how-to-write-distributed-tensorflow-code-with-an-example-on-tensorport-70bf3306adcb
-https://www.tensorflow.org/deploy/distributed
-"""
-
-import tensorflow as tf
-import paddle.v2 as paddle
-import numpy as np
-import argparse
-import time
-
-parser = argparse.ArgumentParser(description=__doc__)
-parser.add_argument(
-    '--batch_size', type=int, default=128, help="Batch size for training.")
-parser.add_argument(
-    '--learning_rate',
-    type=float,
-    default=1e-3,
-    help="Learning rate for training.")
-parser.add_argument('--num_passes', type=int, default=50, help="No. of passes.")
-parser.add_argument(
-    '--device',
-    type=str,
-    default='CPU',
-    choices=['CPU', 'GPU'],
-    help="The device type.")
-parser.add_argument(
-    '--data_format',
-    type=str,
-    default='NHWC',
-    choices=['NCHW', 'NHWC'],
-    help='The data order, NCHW=[batch, channels, height, width].'
-    'Only support NHWC right now.')
-parser.add_argument(
-    '--data_set',
-    type=str,
-    default='cifar10',
-    choices=['cifar10', 'flowers'],
-    help='Optional dataset for benchmark.')
-
-parser.add_argument(
-    "--ps_hosts",
-    type=str,
-    default="",
-    help="Comma-separated list of hostname:port pairs")
-parser.add_argument(
-    "--worker_hosts",
-    type=str,
-    default="",
-    help="Comma-separated list of hostname:port pairs")
-parser.add_argument(
-    "--job_name", type=str, default="", help="One of 'worker', 'ps'")
-# Flags for defining the tf.train.Server
-parser.add_argument(
-    "--task_index", type=int, default=0, help="Index of task within the job")
-
-args = parser.parse_args()
-
-
-class VGG16Model(object):
-    def __init__(self):
-        self.parameters = []
-
-    def batch_norm_relu(self, inputs, is_training):
-        """Performs a batch normalization followed by a ReLU."""
-        # We set fused=True for a significant speed boost. See
-        # https://www.tensorflow.org/speed/speed_guide#common_fused_ops
-        inputs = tf.layers.batch_normalization(
-            inputs=inputs,
-            axis=1 if args.data_format == 'NCHW' else -1,
-            momentum=0.9,
-            epsilon=1e-05,
-            center=True,
-            scale=True,
-            training=is_training,
-            fused=True)
-        inputs = tf.nn.relu(inputs)
-        return inputs
-
-    def conv_bn_layer(self,
-                      name,
-                      images,
-                      kernel_shape,
-                      is_training,
-                      drop_rate=0.0):
-        with tf.name_scope(name) as scope:
-            kernel = tf.Variable(
-                tf.truncated_normal(
-                    kernel_shape, dtype=tf.float32, stddev=1e-1),
-                name='weights')
-            conv = tf.nn.conv2d(
-                images,
-                kernel, [1, 1, 1, 1],
-                data_format=args.data_format,
-                padding='SAME')
-            biases = tf.Variable(
-                tf.constant(
-                    0.0, shape=[kernel_shape[-1]], dtype=tf.float32),
-                trainable=True,
-                name='biases')
-            out = tf.nn.bias_add(conv, biases)
-            out = self.batch_norm_relu(out, is_training)
-            out = tf.layers.dropout(out, rate=drop_rate, training=is_training)
-            return out
-
-    def fc_layer(self, name, inputs, shape):
-        with tf.name_scope(name) as scope:
-            fc_w = tf.Variable(
-                tf.truncated_normal(
-                    shape, dtype=tf.float32, stddev=1e-1),
-                name='weights')
-            fc_b = tf.Variable(
-                tf.constant(
-                    0.0, shape=[shape[-1]], dtype=tf.float32),
-                trainable=True,
-                name='biases')
-            out = tf.nn.bias_add(tf.matmul(inputs, fc_w), fc_b)
-            return out
-
-    def network(self, images, class_dim, is_training):
-        """ VGG16 model structure.
-
-            TODO(kuke): enable this network to support the 'NCHW' data format
-        """
-
-        # conv1
-        conv1_1 = self.conv_bn_layer(
-            'conv1_1', images, [3, 3, 3, 64], is_training, drop_rate=0.3)
-        conv1_2 = self.conv_bn_layer(
-            'conv1_2', conv1_1, [3, 3, 64, 64], is_training, drop_rate=0.0)
-        # pool1
-        pool1 = tf.nn.max_pool(
-            conv1_2,
-            ksize=[1, 2, 2, 1],
-            strides=[1, 2, 2, 1],
-            padding='SAME',
-            name='pool1')
-        # conv2
-        conv2_1 = self.conv_bn_layer(
-            'conv2_1', pool1, [3, 3, 64, 128], is_training, drop_rate=0.4)
-        conv2_2 = self.conv_bn_layer(
-            'conv2_2', conv2_1, [3, 3, 128, 128], is_training, drop_rate=0.0)
-        # pool2
-        pool2 = tf.nn.max_pool(
-            conv2_2,
-            ksize=[1, 2, 2, 1],
-            strides=[1, 2, 2, 1],
-            padding='SAME',
-            name='pool2')
-        # conv3
-        conv3_1 = self.conv_bn_layer(
-            'conv3_1', pool2, [3, 3, 128, 256], is_training, drop_rate=0.4)
-        conv3_2 = self.conv_bn_layer(
-            'conv3_2', conv3_1, [3, 3, 256, 256], is_training, drop_rate=0.4)
-        conv3_3 = self.conv_bn_layer(
-            'conv3_3', conv3_2, [3, 3, 256, 256], is_training, drop_rate=0.0)
-        # pool3
-        pool3 = tf.nn.max_pool(
-            conv3_3,
-            ksize=[1, 2, 2, 1],
-            strides=[1, 2, 2, 1],
-            padding='SAME',
-            name='pool3')
-        # conv4
-        conv4_1 = self.conv_bn_layer(
-            'conv4_1', pool3, [3, 3, 256, 512], is_training, drop_rate=0.4)
-        conv4_2 = self.conv_bn_layer(
-            'conv4_2', conv4_1, [3, 3, 512, 512], is_training, drop_rate=0.4)
-        conv4_3 = self.conv_bn_layer(
-            'conv4_3', conv4_2, [3, 3, 512, 512], is_training, drop_rate=0.0)
-        # pool4
-        pool4 = tf.nn.max_pool(
-            conv4_3,
-            ksize=[1, 2, 2, 1],
-            strides=[1, 2, 2, 1],
-            padding='SAME',
-            name='pool4')
-        # conv5
-        conv5_1 = self.conv_bn_layer(
-            'conv5_1', pool4, [3, 3, 512, 512], is_training, drop_rate=0.4)
-        conv5_2 = self.conv_bn_layer(
-            'conv5_2', conv5_1, [3, 3, 512, 512], is_training, drop_rate=0.4)
-        conv5_3 = self.conv_bn_layer(
-            'conv5_3', conv5_2, [3, 3, 512, 512], is_training, drop_rate=0.0)
-        # pool5
-        pool5 = tf.nn.max_pool(
-            conv5_3,
-            ksize=[1, 2, 2, 1],
-            strides=[1, 2, 2, 1],
-            padding='SAME',
-            name='pool4')
-        # flatten
-        shape = int(np.prod(pool5.get_shape()[1:]))
-        pool5_flat = tf.reshape(pool5, [-1, shape])
-        # fc1
-        drop = tf.layers.dropout(pool5_flat, rate=0.5, training=is_training)
-        fc1 = self.fc_layer('fc1', drop, [shape, 512])
-        # fc2
-        bn = self.batch_norm_relu(fc1, is_training)
-        drop = tf.layers.dropout(bn, rate=0.5, training=is_training)
-        fc2 = self.fc_layer('fc2', drop, [512, 512])
-
-        fc3 = self.fc_layer('fc3', fc2, [512, class_dim])
-
-        return fc3
-
-
-def run_benchmark(cluster_spec, server):
-    """Run benchmark on cifar10 or flowers."""
-
-    if args.data_set == "cifar10":
-        class_dim = 10
-        raw_shape = (3, 32, 32)
-        dat_shape = (None, 32, 32, 3) if args.data_format == 'NHWC' else (
-            None, 3, 32, 32)
-    else:
-        class_dim = 102
-        raw_shape = (3, 224, 224)
-        dat_shape = (None, 224, 224, 3) if args.data_format == 'NHWC' else (
-            None, 3, 224, 224)
-
-    device = tf.train.replica_device_setter(
-        worker_device="/job:worker/task:{}".format(args.task_index),
-        cluster=cluster_spec)
-
-    with tf.device(device):
-        images = tf.placeholder(tf.float32, shape=dat_shape)
-        labels = tf.placeholder(tf.int64, shape=(None, ))
-        is_training = tf.placeholder('bool')
-        onehot_labels = tf.one_hot(labels, depth=class_dim)
-
-        vgg16 = VGG16Model()
-        logits = vgg16.network(images, class_dim, is_training)
-        loss = tf.losses.softmax_cross_entropy(
-            onehot_labels=onehot_labels, logits=logits)
-        avg_loss = tf.reduce_mean(loss)
-
-        correct = tf.equal(tf.argmax(logits, 1), labels)
-        accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
-
-        optimizer = tf.train.AdamOptimizer(learning_rate=args.learning_rate)
-        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
-        global_step = tf.Variable(0, name='global_step', trainable=False)
-        with tf.control_dependencies(update_ops):
-            train_op = optimizer.minimize(avg_loss, global_step=global_step)
-
-        summary_op = tf.summary.merge_all()
-        init_op = tf.global_variables_initializer()
-
-    # data reader
-    train_reader = paddle.batch(
-        paddle.reader.shuffle(
-            paddle.dataset.cifar.train10()
-            if args.data_set == 'cifar10' else paddle.dataset.flowers.train(),
-            buf_size=5120),
-        batch_size=args.batch_size)
-    test_reader = paddle.batch(
-        paddle.reader.shuffle(
-            paddle.dataset.cifar.test10()
-            if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
-            buf_size=5120),
-        batch_size=args.batch_size)
-
-    # test
-    def test():
-        test_accs = []
-        for batch_id, data in enumerate(test_reader()):
-            test_images = np.array(
-         map(lambda x: np.transpose(x[0].reshape(raw_shape),
-         axes=[1, 2, 0]) if args.data_format == 'NHWC' else x[0], data)).astype("float32")
-            test_labels = np.array(map(lambda x: x[1], data)).astype('int64')
-            test_accs.append(
-                accuracy.eval(feed_dict={
-                    images: test_images,
-                    labels: test_labels,
-                    is_training: False
-                }))
-        return np.mean(test_accs)
-
-    config = tf.ConfigProto(
-        intra_op_parallelism_threads=1,
-        inter_op_parallelism_threads=1,
-        log_device_placement=True)
-    config.gpu_options.allow_growth = True
-
-    hooks = [tf.train.StopAtStepHook(last_step=1000000)]
-
-    with tf.train.MonitoredTrainingSession(
-            master=server.target,
-            is_chief=(args.task_index == 0),
-            hooks=hooks,
-            config=config) as sess:
-        iters, num_samples, start_time = 0, 0, 0.0
-        for pass_id in range(args.num_passes):
-            # train
-            num_samples = 0
-            start_time = time.time()
-            for batch_id, data in enumerate(train_reader()):
-                train_images = np.array(
-                    map(lambda x: np.transpose(x[0].reshape(raw_shape),
-                    axes=[1, 2, 0]) if args.data_format == 'NHWC' else x[0], data)).astype("float32")
-                train_labels = np.array(map(lambda x: x[1], data)).astype(
-                    'int64')
-                iter_begin_time = time.time()
-                _, loss, acc = sess.run([train_op, avg_loss, accuracy],
-                                        feed_dict={
-                                            images: train_images,
-                                            labels: train_labels,
-                                            is_training: True
-                                        })
-                iters += 1
-                print(
-                    "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, Speed=%.2f imgs/sec"
-                    % (pass_id, iters, loss, acc,
-                       len(data) / (time.time() - iter_begin_time)))
-                num_samples += len(data)
-            train_elapsed = time.time() - start_time
-            # test
-            pass_test_acc = test()
-            print("Pass = %d, Train speed = %f imgs/s, Test accuracy = %f\n" %
-                  (pass_id, num_samples / train_elapsed, pass_test_acc))
-
-
-def print_arguments():
-    print('-----------  Configuration Arguments -----------')
-    for arg, value in sorted(vars(args).iteritems()):
-        print('%s: %s' % (arg, value))
-    print('------------------------------------------------')
-
-
-if __name__ == '__main__':
-    print_arguments()
-
-    ps_hosts = args.ps_hosts.split(",")
-    worker_hosts = args.worker_hosts.split(",")
-
-    # Create a cluster from the parameter server and worker hosts.
-    cluster_spec = tf.train.ClusterSpec({
-        "ps": ps_hosts,
-        "worker": worker_hosts
-    })
-
-    # Create and start a server for the local task.
-    server = tf.train.Server(
-        cluster_spec, job_name=args.job_name, task_index=args.task_index)
-
-    if args.job_name == "ps":
-        print("start pserver")
-        server.join()
-    elif args.job_name == "worker":
-        print("start worker")
-        run_benchmark(cluster_spec, server)
diff --git a/benchmark/cluster/vgg16/vgg16_v2.py b/benchmark/cluster/vgg16/vgg16_v2.py
deleted file mode 100644
index 1a66af32d7131997c63bd3c3042875f33a467084..0000000000000000000000000000000000000000
--- a/benchmark/cluster/vgg16/vgg16_v2.py
+++ /dev/null
@@ -1,154 +0,0 @@
-#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-#Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-#Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
-#limitations under the License.
-
-import gzip
-
-import paddle.v2.dataset.cifar as cifar
-import paddle.v2 as paddle
-import time
-import os
-
-DATA_DIM = 3 * 32 * 32
-CLASS_DIM = 10
-BATCH_SIZE = os.getenv("BATCH_SIZE")
-if BATCH_SIZE:
-    BATCH_SIZE = int(BATCH_SIZE)
-else:
-    BATCH_SIZE = 128
-print "batch_size", BATCH_SIZE
-NODE_COUNT = int(os.getenv("TRAINERS"))
-ts = 0
-
-
-def vgg(input, nums, class_dim):
-    def conv_block(input, num_filter, groups, num_channels=None):
-        return paddle.networks.img_conv_group(
-            input=input,
-            num_channels=num_channels,
-            pool_size=2,
-            pool_stride=2,
-            conv_num_filter=[num_filter] * groups,
-            conv_filter_size=3,
-            conv_act=paddle.activation.Relu(),
-            pool_type=paddle.pooling.Max())
-
-    assert len(nums) == 5
-    # the channel of input feature is 3
-    conv1 = conv_block(input, 64, nums[0], 3)
-    conv2 = conv_block(conv1, 128, nums[1])
-    conv3 = conv_block(conv2, 256, nums[2])
-    conv4 = conv_block(conv3, 512, nums[3])
-    conv5 = conv_block(conv4, 512, nums[4])
-
-    fc_dim = 512
-    fc1 = paddle.layer.fc(input=conv5,
-                          size=fc_dim,
-                          act=paddle.activation.Relu(),
-                          layer_attr=paddle.attr.Extra(drop_rate=0.5))
-    fc2 = paddle.layer.fc(input=fc1,
-                          size=fc_dim,
-                          act=paddle.activation.Relu(),
-                          layer_attr=paddle.attr.Extra(drop_rate=0.5))
-    out = paddle.layer.fc(input=fc2,
-                          size=class_dim,
-                          act=paddle.activation.Softmax())
-    return out
-
-
-def vgg13(input, class_dim):
-    nums = [2, 2, 2, 2, 2]
-    return vgg(input, nums, class_dim)
-
-
-def vgg16(input, class_dim):
-    nums = [2, 2, 3, 3, 3]
-    return vgg(input, nums, class_dim)
-
-
-def vgg19(input, class_dim):
-    nums = [2, 2, 4, 4, 4]
-    return vgg(input, nums, class_dim)
-
-
-def main():
-    global ts
-    paddle.init(use_gpu=False)
-    image = paddle.layer.data(
-        name="image", type=paddle.data_type.dense_vector(DATA_DIM))
-    lbl = paddle.layer.data(
-        name="label", type=paddle.data_type.integer_value(CLASS_DIM))
-
-    extra_layers = None
-    # NOTE: for v2 distributed training need averaging updates.
-    learning_rate = 1e-3 / NODE_COUNT
-    out = vgg16(image, class_dim=CLASS_DIM)
-    cost = paddle.layer.classification_cost(input=out, label=lbl)
-
-    # Create parameters
-    parameters = paddle.parameters.create(cost)
-
-    # Create optimizer
-    optimizer = paddle.optimizer.Momentum(
-        momentum=0.9,
-        regularization=paddle.optimizer.L2Regularization(rate=0.0005 *
-                                                         BATCH_SIZE),
-        learning_rate=learning_rate / BATCH_SIZE,
-        learning_rate_decay_a=0.1,
-        learning_rate_decay_b=128000 * 35,
-        learning_rate_schedule="discexp", )
-
-    train_reader = paddle.batch(
-        paddle.reader.shuffle(
-            cifar.train10(),
-            # To use other data, replace the above line with:
-            # reader.train_reader('train.list'),
-            buf_size=1000),
-        batch_size=BATCH_SIZE)
-    test_reader = paddle.batch(
-        cifar.test10(),
-        # To use other data, replace the above line with:
-        # reader.test_reader('val.list'),
-        batch_size=BATCH_SIZE)
-
-    # Create trainer
-    trainer = paddle.trainer.SGD(cost=cost,
-                                 parameters=parameters,
-                                 update_equation=optimizer,
-                                 extra_layers=extra_layers,
-                                 is_local=False)
-
-    # End batch and end pass event handler
-    def event_handler(event):
-        global ts, ts_pass
-        if isinstance(event, paddle.event.BeginPass):
-            ts_pass = time.time()
-        if isinstance(event, paddle.event.BeginIteration):
-            ts = time.time()
-        if isinstance(event, paddle.event.EndIteration):
-            if event.batch_id % 1 == 0:
-                print "\nPass %d, Batch %d, Cost %f, %s, spent: %f" % (
-                    event.pass_id, event.batch_id, event.cost, event.metrics,
-                    time.time() - ts)
-        if isinstance(event, paddle.event.EndPass):
-            print "Pass %d end, spent: %f" % (event.pass_id,
-                                              time.time() - ts_pass)
-            result = trainer.test(reader=test_reader)
-            print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)
-
-    trainer.train(
-        reader=train_reader, num_passes=200, event_handler=event_handler)
-
-
-if __name__ == '__main__':
-    main()
diff --git a/benchmark/fluid/Dockerfile b/benchmark/fluid/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..b9eaca5ee6b487bb37bb954b3c606c3096d37aeb
--- /dev/null
+++ b/benchmark/fluid/Dockerfile
@@ -0,0 +1,22 @@
+FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04
+RUN apt-get update && apt-get install -y python python-pip iputils-ping libgtk2.0-dev wget vim net-tools iftop python-opencv
+RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.7 /usr/lib/libcudnn.so && ln -s /usr/lib/x86_64-linux-gnu/libnccl.so.2 /usr/lib/libnccl.so
+RUN pip install -U pip
+RUN pip install -U kubernetes paddlepaddle
+
+# IMPORTANT:
+# Add "ENV http_proxy=http://ip:port" if your download is slow, and don't forget to unset it at runtime.
+
+RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.cifar.train10()\npaddle.dataset.flowers.fetch()" | python'
+RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.mnist.train()\npaddle.dataset.mnist.test()\npaddle.dataset.imdb.fetch()" | python'
+RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.imikolov.fetch()" | python'
+RUN pip uninstall -y paddlepaddle && mkdir /workspace
+
+ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin
+ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/k8s_tools.py /root
+
+ADD *.whl /
+RUN pip install /*.whl && rm -f /*.whl && chmod +x /usr/bin/paddle_k8s
+
+ENV LD_LIBRARY_PATH=/usr/local/lib
+ADD fluid_benchmark.py recordio_converter.py models/ /workspace/
diff --git a/benchmark/fluid/README.md b/benchmark/fluid/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..f40f3c129741f9b6e3654399a9110b065fec7d6c
--- /dev/null
+++ b/benchmark/fluid/README.md
@@ -0,0 +1,97 @@
+# Fluid Benchmark
+
+This directory contains several models configurations and tools that used to run
+Fluid benchmarks for local and distributed training.
+
+
+## Run the Benchmark
+
+To start, run the following command to get the full help message:
+
+```bash
+python fluid_benchmark.py --help
+```
+
+Currently supported `--model` argument include:
+
+* mnist
+* resnet
+    * you can chose to use different dataset using `--data_set cifar10` or
+      `--data_set flowers`.
+* vgg
+* stacked_dynamic_lstm
+* machine_translation
+
+* Run the following command to start a benchmark job locally:
+    ```bash
+      python fluid_benchmark.py --model mnist  --device GPU
+    ```
+    You can choose to use GPU/CPU training. With GPU training, you can specify
+    `--gpus <gpu_num>` to run multi GPU training.
+* Run distributed training with parameter servers:
+    * see [run_fluid_benchmark.sh](https://github.com/PaddlePaddle/Paddle/blob/develop/benchmark/fluid/run_fluid_benchmark.sh) as an example.
+    * start parameter servers:
+        ```bash
+        PADDLE_TRAINING_ROLE=PSERVER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=1 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist  --device GPU --update_method pserver
+        sleep 15
+        ```
+    * start trainers:
+        ```bash
+        PADDLE_TRAINING_ROLE=TRAINER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=1 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist  --device GPU --update_method pserver
+        ```
+* Run distributed training using NCCL2
+    ```bash
+    PADDLE_PSERVER_PORT=7164 PADDLE_TRAINER_IPS=192.168.0.2,192.168.0.3  PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist --device GPU --update_method nccl2
+    ```
+
+## Prepare the RecordIO file to Achieve Better Performance
+
+Run the following command will generate RecordIO files like "mnist.recordio" under the path
+and batch_size you choose, you can use batch_size=1 so that later reader can change the batch_size
+at any time using `fluid.batch`.
+
+```bash
+python -c 'from recordio_converter import *; prepare_mnist("data", 1)'
+```
+
+## Run Distributed Benchmark on Kubernetes Cluster
+
+You may need to build a Docker image before submitting a cluster job onto Kubernetes, or you will
+have to start all those processes mannually on each node, which is not recommended.
+
+To build the Docker image, you need to choose a paddle "whl" package to run with, you may either
+download it from
+http://www.paddlepaddle.org/docs/develop/documentation/zh/build_and_install/pip_install_en.html or
+build it by your own. Once you've got the "whl" package, put it under the current directory and run:
+
+```bash
+docker build -t [your docker image name]:[your docker image tag] .
+```
+
+Then push the image to a Docker registry that your Kubernetes cluster can reach.
+
+We provide a script `kube_gen_job.py` to generate Kubernetes yaml files to submit
+distributed benchmark jobs to your cluster. To generate a job yaml, just run:
+
+```bash
+python kube_gen_job.py --jobname myjob --pscpu 4 --cpu 8 --gpu 8 --psmemory 20 --memory 40 --pservers 4 --trainers 4 --entry "python fluid_benchmark.py --model mnist --gpus 8 --device GPU --update_method pserver " --disttype pserver
+```
+
+Then the yaml files are generated under directory `myjob`, you can run:
+
+```bash
+kubectl create -f myjob/
+```
+
+The job shall start.
+
+
+## Notes for Run Fluid Distributed with NCCL2 and RDMA
+
+Before running NCCL2 distributed jobs, please check that whether your node has multiple network
+interfaces, try to add the environment variable `export NCCL_SOCKET_IFNAME=eth0` to use your actual
+network device.
+
+To run high-performance distributed training, you must prepare your hardware environment to be
+able to run RDMA enabled network communication, please check out [this](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/howto/cluster/nccl2_rdma_training.md)
+note for details.
diff --git a/benchmark/fluid/fluid_benchmark.py b/benchmark/fluid/fluid_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..62a05234c45ee4fe1dc21f5a74efc269227154db
--- /dev/null
+++ b/benchmark/fluid/fluid_benchmark.py
@@ -0,0 +1,452 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import cProfile
+import time
+import os
+
+import numpy as np
+
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+import paddle.fluid.profiler as profiler
+import paddle.fluid.transpiler.distribute_transpiler as distribute_transpiler
+
+BENCHMARK_MODELS = [
+    "machine_translation", "resnet", "vgg", "mnist", "stacked_dynamic_lstm"
+]
+
+
+def parse_args():
+    parser = argparse.ArgumentParser('Fluid model benchmarks.')
+    parser.add_argument(
+        '--model',
+        type=str,
+        choices=BENCHMARK_MODELS,
+        default='resnet',
+        help='The model to run benchmark with.')
+    parser.add_argument(
+        '--batch_size',
+        type=int,
+        default=32,
+        help='The batch size on each gpu.')
+    parser.add_argument(
+        '--learning_rate', type=float, default=0.001, help='The learning rate.')
+    parser.add_argument(
+        '--skip_batch_num',
+        type=int,
+        default=5,
+        help='The first num of minibatch num to skip, for better performance test'
+    )
+    parser.add_argument(
+        '--iterations',
+        type=int,
+        default=80,
+        help='The number of minibatches, set to -1 to run all batches.')
+    parser.add_argument(
+        '--pass_num', type=int, default=100, help='The number of passes.')
+    parser.add_argument(
+        '--data_format',
+        type=str,
+        default='NCHW',
+        choices=['NCHW', 'NHWC'],
+        help='The data data_format, now only support NCHW.')
+    parser.add_argument(
+        '--device',
+        type=str,
+        default='GPU',
+        choices=['CPU', 'GPU'],
+        help='The device type.')
+    parser.add_argument(
+        '--gpus',
+        type=int,
+        default=1,
+        help='If gpus > 1, will use ParallelExecutor to run, else use Executor.')
+    # this option is available only for vgg and resnet.
+    parser.add_argument(
+        '--cpus',
+        type=int,
+        default=1,
+        help='If cpus > 1, will use ParallelDo to run, else use Executor.')
+    parser.add_argument(
+        '--data_set',
+        type=str,
+        default='flowers',
+        choices=['cifar10', 'flowers', 'imagenet'],
+        help='Optional dataset for benchmark.')
+    parser.add_argument(
+        '--infer_only', action='store_true', help='If set, run forward only.')
+    parser.add_argument(
+        '--use_cprof', action='store_true', help='If set, use cProfile.')
+    parser.add_argument(
+        '--use_nvprof',
+        action='store_true',
+        help='If set, use nvprof for CUDA.')
+    parser.add_argument(
+        '--no_test',
+        action='store_true',
+        help='If set, do not test the testset during training.')
+    parser.add_argument(
+        '--memory_optimize',
+        action='store_true',
+        help='If set, optimize runtime memory before start.')
+    parser.add_argument(
+        '--use_fake_data',
+        action='store_true',
+        help='If set ommit the actual read data operators.')
+    parser.add_argument(
+        '--profile', action='store_true', help='If set, profile a few steps.')
+    parser.add_argument(
+        '--update_method',
+        type=str,
+        default='local',
+        choices=['local', 'pserver', 'nccl2'],
+        help='Choose parameter update method, can be local, pserver, nccl2.')
+    parser.add_argument(
+        '--use_reader_op',
+        action='store_true',
+        help='Whether to use reader op, and must specify the data path if set this to true.'
+    )
+    parser.add_argument(
+        '--data_path',
+        type=str,
+        default="",
+        help='Directory that contains all the training recordio files.')
+    args = parser.parse_args()
+    return args
+
+
+def append_nccl2_prepare(trainer_id):
+    if trainer_id >= 0:
+        # append gen_nccl_id at the end of startup program
+        trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
+        port = os.getenv("PADDLE_PSERVER_PORT")
+        worker_ips = os.getenv("PADDLE_TRAINER_IPS")
+        worker_endpoints = []
+        for ip in worker_ips.split(","):
+            worker_endpoints.append(':'.join([ip, port]))
+        num_trainers = len(worker_endpoints)
+        current_endpoint = os.getenv("PADDLE_CURRENT_IP") + ":" + port
+        worker_endpoints.remove(current_endpoint)
+
+        nccl_id_var = fluid.default_startup_program().global_block().create_var(
+            name="NCCLID",
+            persistable=True,
+            type=fluid.core.VarDesc.VarType.RAW)
+        fluid.default_startup_program().global_block().append_op(
+            type="gen_nccl_id",
+            inputs={},
+            outputs={"NCCLID": nccl_id_var},
+            attrs={
+                "endpoint": current_endpoint,
+                "endpoint_list": worker_endpoints,
+                "trainer_id": trainer_id
+            })
+        return nccl_id_var, num_trainers, trainer_id
+    else:
+        raise Exception("must set positive PADDLE_TRAINER_ID env variables for "
+                        "nccl-based dist train.")
+
+
+def dist_transpile(trainer_id):
+    if trainer_id < 0:
+        return None, None
+
+    # the port of all pservers, needed by both trainer and pserver
+    port = os.getenv("PADDLE_PSERVER_PORT", "6174")
+    # comma separated ips of all pservers, needed by trainer and
+    # pserver
+    pserver_ips = os.getenv("PADDLE_PSERVER_IPS", "")
+    eplist = []
+    for ip in pserver_ips.split(","):
+        eplist.append(':'.join([ip, port]))
+    pserver_endpoints = ",".join(eplist)
+    # total number of workers/trainers in the job, needed by
+    # trainer and pserver
+    trainers = int(os.getenv("PADDLE_TRAINERS"))
+    # the IP of the local machine, needed by pserver only
+    current_endpoint = os.getenv("PADDLE_CURRENT_IP", "") + ":" + port
+    # the role, should be either PSERVER or TRAINER
+    training_role = os.getenv("PADDLE_TRAINING_ROLE")
+
+    t = distribute_transpiler.DistributeTranspiler()
+    t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
+    if training_role == "PSERVER":
+        pserver_program = t.get_pserver_program(current_endpoint)
+        pserver_startup_program = t.get_startup_program(current_endpoint,
+                                                        pserver_program)
+        return pserver_program, pserver_startup_program
+    elif training_role == "TRAINER":
+        train_program = t.get_trainer_program()
+        return train_program, fluid.default_startup_program()
+    else:
+        raise ValueError(
+            'TRAINING_ROLE environment variable must be either TRAINER or PSERVER'
+        )
+
+
+def test(exe, inference_program, test_reader, feeder, batch_acc):
+    accuracy_evaluator = fluid.metrics.Accuracy()
+    for batch_id, data in enumerate(test_reader()):
+        acc = exe.run(inference_program,
+                      feed=feeder.feed(data),
+                      fetch_list=[batch_acc])
+        accuracy_evaluator.update(value=np.array(acc), weight=len(data))
+
+    return accuracy_evaluator.eval()
+
+
+# TODO(wuyi): replace train, train_parallel, test functions with new trainer
+# API once it is ready.
+def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc,
+          args, train_prog, startup_prog):
+    if os.getenv("PADDLE_TRAINING_ROLE") == "PSERVER":
+        place = core.CPUPlace()
+        exe = fluid.Executor(place)
+        exe.run(startup_prog)
+        exe.run(train_prog)
+        return
+
+    if args.use_fake_data:
+        raise Exception(
+            "fake data is not supported in single GPU test for now.")
+
+    place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
+    exe = fluid.Executor(place)
+    exe.run(startup_prog)
+
+    if not args.use_reader_op:
+        feed_var_list = [
+            var for var in train_prog.global_block().vars.itervalues()
+            if var.is_data
+        ]
+        feeder = fluid.DataFeeder(feed_var_list, place)
+
+    iters, num_samples, start_time = 0, 0, time.time()
+    for pass_id in range(args.pass_num):
+        train_losses = []
+        if not args.use_reader_op:
+            reader_generator = train_reader()
+        batch_id = 0
+        data = None
+        while True:
+            if not args.use_reader_op:
+                data = next(reader_generator, None)
+                if data == None:
+                    break
+            if iters == args.iterations:
+                break
+            if iters == args.skip_batch_num:
+                start_time = time.time()
+                num_samples = 0
+
+            if args.use_reader_op:
+                try:
+                    loss = exe.run(train_prog, fetch_list=[avg_loss])
+                except fluid.core.EnforceNotMet as ex:
+                    break
+            else:
+                loss = exe.run(train_prog,
+                               feed=feeder.feed(data),
+                               fetch_list=[avg_loss])
+            iters += 1
+            batch_id += 1
+            # FIXME(wuyi): For use_reader_op, if the current
+            # pass is not the last, the last batch of this pass
+            # is also equal to args.batch_size.
+            if args.use_reader_op:
+                num_samples += args.batch_size * args.gpus
+            else:
+                num_samples += len(data)
+            train_losses.append(loss)
+            print("Pass: %d, Iter: %d, Loss: %f\n" %
+                  (pass_id, iters, np.mean(train_losses)))
+        print_train_time(start_time, time.time(), num_samples)
+        print("Pass: %d, Loss: %f" % (pass_id, np.mean(train_losses))),
+        # evaluation
+        if not args.no_test and batch_acc:
+            pass_test_acc = test(exe, infer_prog, test_reader, feeder,
+                                 batch_acc)
+            print(", Test Accuracy: %f" % pass_test_acc)
+        print("\n")
+        # TODO(wuyi): add warmup passes to get better perf data.
+        exit(0)
+
+
+# TODO(wuyi): replace train, train_parallel, test functions with new trainer
+# API once it is ready.
+def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
+                   batch_acc, args, train_prog, startup_prog, nccl_id_var,
+                   num_trainers, trainer_id):
+    place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
+    if not args.use_reader_op:
+        feed_var_list = [
+            var for var in train_prog.global_block().vars.itervalues()
+            if var.is_data
+        ]
+        feeder = fluid.DataFeeder(feed_var_list, place)
+
+    # generate fake:
+    if args.use_fake_data:
+        for var in feed_var_list:
+            v = startup_prog.global_block().clone_variable(var)
+            var.persistable = True
+            v.persistable = True
+
+            real_shape = list(var.shape)
+            real_shape[0] = args.batch_size / args.gpus
+            startup_prog.global_block().append_op(
+                outputs={"Out": v},
+                type="fill_constant",
+                attrs={"shape": real_shape,
+                       "value": 1.0,
+                       "dtype": var.dtype})
+
+    if nccl_id_var and trainer_id == 0:
+        #FIXME(wuyi): wait other trainer to start listening
+        time.sleep(30)
+
+    startup_exe = fluid.Executor(place)
+    startup_exe.run(startup_prog)
+    strategy = fluid.ExecutionStrategy()
+    strategy.num_threads = 1
+    strategy.allow_op_delay = False
+    exe = fluid.ParallelExecutor(
+        True,
+        avg_loss.name,
+        exec_strategy=strategy,
+        num_trainers=num_trainers,
+        trainer_id=trainer_id)
+
+    for pass_id in range(args.pass_num):
+        num_samples = 0
+        iters = 0
+        start_time = time.time()
+        if not args.use_reader_op:
+            reader_generator = train_reader()
+        batch_id = 0
+        data = None
+        while True:
+            if not args.use_reader_op:
+                data = next(reader_generator, None)
+                if data == None:
+                    break
+            if iters == args.iterations:
+                break
+            if args.profile and pass_id == 0 and batch_id == 5:
+                profiler.start_profiler("All")
+            elif args.profile and pass_id == 0 and batch_id == 10:
+                profiler.stop_profiler("total", "/tmp/profile_%d" % trainer_id)
+
+            if iters == args.skip_batch_num:
+                start_time = time.time()
+                num_samples = 0
+            if args.use_fake_data or args.use_reader_op:
+                try:
+                    loss, = exe.run([avg_loss.name])
+                except fluid.core.EnforceNotMet as ex:
+                    break
+            else:
+                loss, = exe.run([avg_loss.name], feed=feeder.feed(data))
+            if args.update_method == "pserver":
+                exe.bcast_params()
+            if args.use_reader_op:
+                num_samples += args.batch_size * args.gpus
+            else:
+                num_samples += len(data)
+            iters += 1
+            if batch_id % 1 == 0:
+                print("Pass %d, batch %d, loss %s" %
+                      (pass_id, batch_id, np.array(loss)))
+            batch_id += 1
+
+        print_train_time(start_time, time.time(), num_samples)
+        if not args.no_test and batch_acc:
+            test_acc = test(startup_exe, infer_prog, test_reader, feeder,
+                            batch_acc)
+            print("Pass: %d, Test Accuracy: %f\n" % (pass_id, test_acc))
+        exit(0)
+
+
+def print_arguments(args):
+    vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and
+                                vars(args)['device'] == 'GPU')
+    print('----------- Configuration Arguments -----------')
+    for arg, value in sorted(vars(args).iteritems()):
+        print('%s: %s' % (arg, value))
+    print('------------------------------------------------')
+
+
+def print_train_time(start_time, end_time, num_samples):
+    train_elapsed = end_time - start_time
+    examples_per_sec = num_samples / train_elapsed
+    print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' %
+          (num_samples, train_elapsed, examples_per_sec))
+
+
+def main():
+    args = parse_args()
+    print_arguments(args)
+
+    # the unique trainer id, starting from 0, needed by trainer
+    # only
+    nccl_id_var, num_trainers, trainer_id = (
+        None, 1, int(os.getenv("PADDLE_TRAINER_ID", "0")))
+
+    if args.use_cprof:
+        pr = cProfile.Profile()
+        pr.enable()
+    model_def = __import__("models.%s" % args.model, fromlist=["models"])
+    train_args = list(model_def.get_model(args))
+    train_args.append(args)
+    # Run optimizer.minimize(avg_loss)
+    train_args[2].minimize(train_args[0])
+    if args.memory_optimize:
+        fluid.memory_optimize(fluid.default_main_program())
+
+    if args.update_method == "pserver":
+        train_prog, startup_prog = dist_transpile(trainer_id)
+        if not train_prog:
+            raise Exception(
+                "Must configure correct environments to run dist train.")
+        train_args.extend([train_prog, startup_prog])
+        if args.gpus > 1 and os.getenv("PADDLE_TRAINING_ROLE") == "TRAINER":
+            train_args.extend([nccl_id_var, num_trainers, trainer_id])
+            train_parallel(*train_args)
+        train(*train_args)
+        exit(0)
+
+    # for other update methods, use default programs
+    train_args.append(fluid.default_main_program())
+    train_args.append(fluid.default_startup_program())
+
+    if args.update_method == "nccl2":
+        nccl_id_var, num_trainers, trainer_id = append_nccl2_prepare(trainer_id)
+    if args.gpus == 1:
+        # NOTE: parallel executor use profiler interanlly
+        if args.use_nvprof and args.device == 'GPU':
+            with profiler.cuda_profiler("cuda_profiler.txt", 'csv') as nvprof:
+                train(*train_args)
+        else:
+            train(*train_args)
+    else:
+        if args.device == "CPU":
+            raise Exception("Only support GPU perf with parallel exe")
+        train_args.extend([nccl_id_var, num_trainers, trainer_id])
+        train_parallel(*train_args)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/benchmark/fluid/kube_gen_job.py b/benchmark/fluid/kube_gen_job.py
new file mode 100644
index 0000000000000000000000000000000000000000..9da8a69af1d7b671b2648b1b3702776c1c0650b0
--- /dev/null
+++ b/benchmark/fluid/kube_gen_job.py
@@ -0,0 +1,191 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import yaml
+import copy
+import argparse
+import random
+import os
+from kube_templates import pserver, trainer, envs
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='Generate dist job yamls.')
+
+    parser.add_argument(
+        '--jobname', default="paddlejob", help='unique job name')
+    parser.add_argument(
+        '--cpu', default=1, type=int, help='CPU cores per trainer node')
+    parser.add_argument(
+        '--pscpu', default=1, type=int, help='CPU cores per pserver node')
+    parser.add_argument(
+        '--gpu', default=0, type=int, help='num of GPUs per node')
+    parser.add_argument(
+        '--image',
+        default="bootstrapper:5000/fluid_benchmark:gpu",
+        help='num of GPUs per node')
+    parser.add_argument(
+        '--pservers', default=1, type=int, help='num of pservers')
+    parser.add_argument(
+        '--trainers', default=1, type=int, help='num of trainers')
+    parser.add_argument('--memory', default=1, type=int, help='trainer memory')
+    parser.add_argument(
+        '--psmemory', default=1, type=int, help='pserver memory')
+    parser.add_argument(
+        '--port', default=30236, type=int, help='num of trainers')
+    parser.add_argument(
+        '--entry', default="python train.py", help='command to run')
+    parser.add_argument(
+        '--fluid', default=1, type=int, help='whether is fluid job')
+    parser.add_argument(
+        '--rdma', action='store_true', help='whether mount rdma libs')
+    parser.add_argument(
+        '--disttype',
+        default="pserver",
+        type=str,
+        choices=['pserver', 'nccl2', 'local'],
+        help='pserver or nccl2 or local')
+
+    args = parser.parse_args()
+    return args
+
+
+def gen_job():
+    ps = pserver
+    tn = trainer
+    args = parse_args()
+
+    ps_container = ps["spec"]["template"]["spec"]["containers"][0]
+    tn_container = tn["spec"]["template"]["spec"]["containers"][0]
+
+    if args.fluid == 1:
+        ps_container["command"] = \
+            ["paddle_k8s", "start_fluid"]
+        tn_container["command"] = \
+            ["paddle_k8s", "start_fluid"]
+    ps["metadata"]["name"] = args.jobname + "-pserver"
+    ps["spec"]["template"]["metadata"]["labels"][
+        "paddle-job-pserver"] = args.jobname
+    tn["metadata"]["name"] = args.jobname + "-trainer"
+    tn["spec"]["template"]["metadata"]["labels"]["paddle-job"] = args.jobname
+
+    ps_container["image"] = args.image
+    tn_container["image"] = args.image
+
+    ps_container["resources"]["requests"]["cpu"] = str(args.pscpu)
+    ps_container["resources"]["requests"]["memory"] = str(args.psmemory) + "Gi"
+    ps_container["resources"]["limits"]["cpu"] = str(args.pscpu)
+    ps_container["resources"]["limits"]["memory"] = str(args.psmemory) + "Gi"
+
+    tn_container["resources"]["requests"]["cpu"] = str(args.cpu)
+    tn_container["resources"]["requests"]["memory"] = str(args.memory) + "Gi"
+    tn_container["resources"]["limits"]["cpu"] = str(args.cpu)
+    tn_container["resources"]["limits"]["memory"] = str(args.memory) + "Gi"
+    if args.gpu > 0:
+        tn_container["resources"]["requests"][
+            "alpha.kubernetes.io/nvidia-gpu"] = str(args.gpu)
+        tn_container["resources"]["limits"][
+            "alpha.kubernetes.io/nvidia-gpu"] = str(args.gpu)
+
+    ps["spec"]["replicas"] = int(args.pservers)
+    tn["spec"]["parallelism"] = int(args.trainers)
+    tn["spec"]["completions"] = int(args.trainers)
+    ps_container["ports"][0]["name"] = "jobport-" + str(args.port)
+    ps_container["ports"][0]["containerPort"] = args.port
+    spreadport = random.randint(40000, 60000)
+    tn_container["ports"][0]["name"] = "spr-" + str(spreadport)
+    tn_container["ports"][0]["containerPort"] = spreadport
+
+    envs.append({"name": "PADDLE_JOB_NAME", "value": args.jobname})
+    envs.append({"name": "TRAINERS", "value": str(args.trainers)})
+    envs.append({"name": "PSERVERS", "value": str(args.pservers)})
+    envs.append({"name": "ENTRY", "value": args.entry})
+    envs.append({"name": "PADDLE_INIT_PORT", "value": str(args.port)})
+    envs.append({"name": "PADDLE_PSERVER_PORT", "value": str(args.port)})
+    # NOTE: these directories below are cluster specific, please modify
+    # this settings before you run on your own cluster.
+    envs.append({
+        "name": "LD_LIBRARY_PATH",
+        "value":
+        "/usr/local/lib:/usr/local/nvidia/lib64:/usr/local/rdma/lib64:/usr/lib64/mlnx_ofed/valgrind"
+    })
+
+    volumes = [{
+        "name": "nvidia-driver",
+        "hostPath": {
+            "path": "/usr/local/nvidia/lib64"
+        }
+    }]
+    volumeMounts = [{
+        "mountPath": "/usr/local/nvidia/lib64",
+        "name": "nvidia-driver"
+    }]
+
+    if args.rdma:
+        volumes.extend([{
+            "name": "ibetc",
+            "hostPath": {
+                "path": "/etc/libibverbs.d"
+            }
+        }, {
+            "name": "iblibs",
+            "hostPath": {
+                "path": "/usr/local/rdma"
+            }
+        }, {
+            "name": "valgrind",
+            "hostPath": {
+                "path": "/usr/lib64/mlnx_ofed/valgrind"
+            }
+        }])
+        volumeMounts.extend([{
+            "mountPath": "/etc/libibverbs.d",
+            "name": "ibetc"
+        }, {
+            "mountPath": "/usr/local/rdma",
+            "name": "iblibs"
+        }, {
+            "mountPath": "/usr/lib64/mlnx_ofed/valgrind",
+            "name": "valgrind"
+        }])
+        # append shm for NCCL2
+        volumes.append({"name": "dshm", "emptyDir": {"medium": "Memory"}})
+        volumeMounts.append({"mountPath": "/dev/shm", "name": "dshm"})
+
+    tn["spec"]["template"]["spec"]["volumes"] = volumes
+    tn_container["volumeMounts"] = volumeMounts
+
+    ps_container["env"] = envs
+    ps_container["env"].append({"name": "TRAINING_ROLE", "value": "PSERVER"})
+    tn_container["env"] = envs
+    if args.disttype == "pserver":
+        tn_container["env"].append({
+            "name": "TRAINING_ROLE",
+            "value": "TRAINER"
+        })
+    elif args.disttype == "nccl2" or args.disttype == "local":
+        # NCCL2 have no training role, set to plain WORKER
+        tn_container["env"].append({"name": "TRAINING_ROLE", "value": "WORKER"})
+
+    os.mkdir(args.jobname)
+    if args.disttype == "pserver":
+        with open("%s/pserver.yaml" % args.jobname, "w") as fn:
+            yaml.dump(ps, fn)
+
+    with open("%s/trainer.yaml" % args.jobname, "w") as fn:
+        yaml.dump(tn, fn)
+
+
+if __name__ == "__main__":
+    gen_job()
diff --git a/benchmark/fluid/kube_templates/__init__.py b/benchmark/fluid/kube_templates/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..2d09d940a5ee638e4b55405d05924e2d76006cfc
--- /dev/null
+++ b/benchmark/fluid/kube_templates/__init__.py
@@ -0,0 +1,66 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from pserver import pserver
+from trainer import trainer
+
+__all__ = ["pserver", "trainer", "envs"]
+
+envs = [
+    # envs that don't need to change
+    {
+        "name": "GLOG_v",
+        "value": "0"
+    },
+    {
+        "name": "GLOG_logtostderr",
+        "value": "1"
+    },
+    {
+        "name": "TOPOLOGY",
+        "value": ""
+    },
+    {
+        "name": "TRAINER_PACKAGE",
+        "value": "/workspace"
+    },
+    {
+        "name": "PADDLE_INIT_NICS",
+        "value": "eth2"
+    },
+    {
+        "name": "NAMESPACE",
+        "valueFrom": {
+            "fieldRef": {
+                "fieldPath": "metadata.namespace"
+            }
+        }
+    },
+    {
+        "name": "POD_IP",
+        "valueFrom": {
+            "fieldRef": {
+                "fieldPath": "status.podIP"
+            }
+        }
+    },
+    {
+        "name": "PADDLE_CURRENT_IP",
+        "valueFrom": {
+            "fieldRef": {
+                "fieldPath": "status.podIP"
+            }
+        }
+    }
+]
diff --git a/benchmark/fluid/kube_templates/pserver.py b/benchmark/fluid/kube_templates/pserver.py
new file mode 100644
index 0000000000000000000000000000000000000000..b54982c806ad4229fbd4bd7edf82a4e7eb4c5ad1
--- /dev/null
+++ b/benchmark/fluid/kube_templates/pserver.py
@@ -0,0 +1,58 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+pserver = {
+    "apiVersion": "extensions/v1beta1",
+    "kind": "ReplicaSet",
+    "metadata": {
+        "name": "jobname-pserver"
+    },
+    "spec": {
+        "replicas": 1,
+        "template": {
+            "metadata": {
+                "labels": {
+                    "paddle-job-pserver": "jobname"
+                }
+            },
+            "spec": {
+                "hostNetwork": True,
+                "imagePullSecrets": [{
+                    "name": "job-registry-secret"
+                }],
+                "containers": [{
+                    "name": "pserver",
+                    "image": "",
+                    "imagePullPolicy": "Always",
+                    "ports": [{
+                        "name": "jobport-1",
+                        "containerPort": 1
+                    }],
+                    "env": [],
+                    "command": ["paddle_k8s", "start_pserver"],
+                    "resources": {
+                        "requests": {
+                            "memory": "10Gi",
+                            "cpu": "4"
+                        },
+                        "limits": {
+                            "memory": "10Gi",
+                            "cpu": "4"
+                        }
+                    }
+                }]
+            }
+        }
+    }
+}
diff --git a/benchmark/fluid/kube_templates/trainer.py b/benchmark/fluid/kube_templates/trainer.py
new file mode 100644
index 0000000000000000000000000000000000000000..b915d31e371d9d787ff64d705e32baf301e16abe
--- /dev/null
+++ b/benchmark/fluid/kube_templates/trainer.py
@@ -0,0 +1,70 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+trainer = {
+    "apiVersion": "batch/v1",
+    "kind": "Job",
+    "metadata": {
+        "name": "jobname-pserver"
+    },
+    "spec": {
+        "parallelism": 4,
+        "completions": 4,
+        "template": {
+            "metadata": {
+                "labels": {
+                    "paddle-job": "jobname"
+                }
+            },
+            "spec": {
+                "hostNetwork": True,
+                "imagePullSecrets": [{
+                    "name": "job-registry-secret"
+                }],
+                "restartPolicy": "Never",
+                "containers": [{
+                    "name": "trainer",
+                    "image": "",
+                    "imagePullPolicy": "Always",
+                    # to let container set rlimit
+                    "securityContext": {
+                        "privileged": True
+                        # TODO(wuyi): use below specific cap instead of privileged,
+                        # using privileged will cause all GPU device are visible
+                        # in the container.
+                        # "capabilities": {
+                        #     "add": ["SYS_RESOURCE"]
+                        # }
+                    },
+                    "ports": [{
+                        "name": "jobport-1",
+                        "containerPort": 1
+                    }],
+                    "env": [],
+                    "command": ["paddle_k8s", "start_trainer", "v2"],
+                    "resources": {
+                        "requests": {
+                            "memory": "10Gi",
+                            "cpu": "4",
+                        },
+                        "limits": {
+                            "memory": "10Gi",
+                            "cpu": "4",
+                        }
+                    }
+                }]
+            }
+        }
+    }
+}
diff --git a/benchmark/fluid/mnist.py b/benchmark/fluid/mnist.py
deleted file mode 100644
index 1e2185dfac1072d1f1046f4616a9d53a8fc76061..0000000000000000000000000000000000000000
--- a/benchmark/fluid/mnist.py
+++ /dev/null
@@ -1,224 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-import argparse
-import time
-
-import paddle
-import paddle.fluid as fluid
-import paddle.fluid.profiler as profiler
-
-SEED = 1
-DTYPE = "float32"
-
-# random seed must set before configuring the network.
-# fluid.default_startup_program().random_seed = SEED
-
-
-def parse_args():
-    parser = argparse.ArgumentParser("mnist model benchmark.")
-    parser.add_argument(
-        '--batch_size', type=int, default=128, help='The minibatch size.')
-    parser.add_argument(
-        '--skip_batch_num',
-        type=int,
-        default=5,
-        help='The first num of minibatch num to skip, for better performance test'
-    )
-    parser.add_argument(
-        '--iterations', type=int, default=35, help='The number of minibatches.')
-    parser.add_argument(
-        '--pass_num', type=int, default=5, help='The number of passes.')
-    parser.add_argument(
-        '--device',
-        type=str,
-        default='GPU',
-        choices=['CPU', 'GPU'],
-        help='The device type.')
-    parser.add_argument(
-        '--infer_only', action='store_true', help='If set, run forward only.')
-    parser.add_argument(
-        '--use_cprof', action='store_true', help='If set, use cProfile.')
-    parser.add_argument(
-        '--use_nvprof',
-        action='store_true',
-        help='If set, use nvprof for CUDA.')
-    parser.add_argument(
-        '--with_test',
-        action='store_true',
-        help='If set, test the testset during training.')
-    args = parser.parse_args()
-    return args
-
-
-def cnn_model(data):
-    conv_pool_1 = fluid.nets.simple_img_conv_pool(
-        input=data,
-        filter_size=5,
-        num_filters=20,
-        pool_size=2,
-        pool_stride=2,
-        act="relu")
-    conv_pool_2 = fluid.nets.simple_img_conv_pool(
-        input=conv_pool_1,
-        filter_size=5,
-        num_filters=50,
-        pool_size=2,
-        pool_stride=2,
-        act="relu")
-
-    # TODO(dzhwinter) : refine the initializer and random seed settting
-    SIZE = 10
-    input_shape = conv_pool_2.shape
-    param_shape = [reduce(lambda a, b: a * b, input_shape[1:], 1)] + [SIZE]
-    scale = (2.0 / (param_shape[0]**2 * SIZE))**0.5
-
-    predict = fluid.layers.fc(
-        input=conv_pool_2,
-        size=SIZE,
-        act="softmax",
-        param_attr=fluid.param_attr.ParamAttr(
-            initializer=fluid.initializer.NormalInitializer(
-                loc=0.0, scale=scale)))
-    return predict
-
-
-def eval_test(exe, batch_acc, batch_size_tensor, inference_program):
-    test_reader = paddle.batch(
-        paddle.dataset.mnist.test(), batch_size=args.batch_size)
-    test_pass_acc = fluid.average.WeightedAverage()
-    for batch_id, data in enumerate(test_reader()):
-        img_data = np.array(map(lambda x: x[0].reshape([1, 28, 28]),
-                                data)).astype(DTYPE)
-        y_data = np.array(map(lambda x: x[1], data)).astype("int64")
-        y_data = y_data.reshape([len(y_data), 1])
-
-        acc, weight = exe.run(inference_program,
-                              feed={"pixel": img_data,
-                                    "label": y_data},
-                              fetch_list=[batch_acc, batch_size_tensor])
-        test_pass_acc.add(value=acc, weight=weight)
-        pass_acc = test_pass_acc.eval()
-    return pass_acc
-
-
-def run_benchmark(model, args):
-    if args.use_cprof:
-        pr = cProfile.Profile()
-        pr.enable()
-    start_time = time.time()
-    # Input data
-    images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE)
-    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-
-    # Train program
-    predict = model(images)
-    cost = fluid.layers.cross_entropy(input=predict, label=label)
-    avg_cost = fluid.layers.mean(x=cost)
-
-    # Evaluator
-    batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
-    batch_acc = fluid.layers.accuracy(
-        input=predict, label=label, total=batch_size_tensor)
-
-    # inference program
-    inference_program = fluid.default_main_program().clone()
-
-    # Optimization
-    opt = fluid.optimizer.AdamOptimizer(
-        learning_rate=0.001, beta1=0.9, beta2=0.999)
-    opt.minimize(avg_cost)
-
-    fluid.memory_optimize(fluid.default_main_program())
-
-    # Initialize executor
-    place = fluid.CPUPlace() if args.device == 'CPU' else fluid.CUDAPlace(0)
-    exe = fluid.Executor(place)
-
-    # Parameter initialization
-    exe.run(fluid.default_startup_program())
-
-    # Reader
-    train_reader = paddle.batch(
-        paddle.dataset.mnist.train(), batch_size=args.batch_size)
-
-    accuracy = fluid.metrics.Accuracy()
-    iters, num_samples, start_time = 0, 0, time.time()
-    for pass_id in range(args.pass_num):
-        accuracy.reset()
-        train_accs = []
-        train_losses = []
-        for batch_id, data in enumerate(train_reader()):
-            if iters == args.skip_batch_num:
-                start_time = time.time()
-                num_samples = 0
-            if iters == args.iterations:
-                break
-            img_data = np.array(
-                map(lambda x: x[0].reshape([1, 28, 28]), data)).astype(DTYPE)
-            y_data = np.array(map(lambda x: x[1], data)).astype("int64")
-            y_data = y_data.reshape([len(y_data), 1])
-
-            outs = exe.run(
-                fluid.default_main_program(),
-                feed={"pixel": img_data,
-                      "label": y_data},
-                fetch_list=[avg_cost, batch_acc, batch_size_tensor]
-            )  # The accuracy is the accumulation of batches, but not the current batch.
-            accuracy.update(value=outs[1], weight=outs[2])
-            iters += 1
-            num_samples += len(y_data)
-            loss = np.array(outs[0])
-            acc = np.array(outs[1])
-            train_losses.append(loss)
-            train_accs.append(acc)
-            print("Pass: %d, Iter: %d, Loss: %f, Accuracy: %f" %
-                  (pass_id, iters, loss, acc))
-
-        print("Pass: %d, Loss: %f, Train Accuray: %f\n" %
-              (pass_id, np.mean(train_losses), np.mean(train_accs)))
-        train_elapsed = time.time() - start_time
-        examples_per_sec = num_samples / train_elapsed
-
-        print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' %
-              (num_samples, train_elapsed, examples_per_sec))
-        # evaluation
-        if args.with_test:
-            test_avg_acc = eval_test(exe, batch_acc, batch_size_tensor,
-                                     inference_program)
-        exit(0)
-
-
-def print_arguments(args):
-    vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and
-                                vars(args)['device'] == 'GPU')
-    print('----------- mnist Configuration Arguments -----------')
-    for arg, value in sorted(vars(args).iteritems()):
-        print('%s: %s' % (arg, value))
-    print('------------------------------------------------')
-
-
-if __name__ == '__main__':
-    args = parse_args()
-    print_arguments(args)
-    if args.use_nvprof and args.device == 'GPU':
-        with profiler.cuda_profiler("cuda_profiler.txt", 'csv') as nvprof:
-            run_benchmark(cnn_model, args)
-    else:
-        run_benchmark(cnn_model, args)
diff --git a/benchmark/fluid/models/__init__.py b/benchmark/fluid/models/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..1c3fcac8dd4a1ba0496ef013bd4eb468a0075125
--- /dev/null
+++ b/benchmark/fluid/models/__init__.py
@@ -0,0 +1,17 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+__all__ = [
+    "machine_translation", "resnet", "vgg", "mnist", "stacked_dynamic_lstm"
+]
diff --git a/benchmark/fluid/machine_translation.py b/benchmark/fluid/models/machine_translation.py
similarity index 59%
rename from benchmark/fluid/machine_translation.py
rename to benchmark/fluid/models/machine_translation.py
index adde5f21acd4e77d58a453d6868abeccfca4bb5a..69541adf6b7e53fcc1ac9d3c82b5a60ca0a72879 100644
--- a/benchmark/fluid/machine_translation.py
+++ b/benchmark/fluid/models/machine_translation.py
@@ -27,74 +27,6 @@ import paddle.fluid.core as core
 import paddle.fluid.framework as framework
 from paddle.fluid.executor import Executor
 
-parser = argparse.ArgumentParser(description=__doc__)
-parser.add_argument(
-    "--embedding_dim",
-    type=int,
-    default=512,
-    help="The dimension of embedding table. (default: %(default)d)")
-parser.add_argument(
-    "--encoder_size",
-    type=int,
-    default=512,
-    help="The size of encoder bi-rnn unit. (default: %(default)d)")
-parser.add_argument(
-    "--decoder_size",
-    type=int,
-    default=512,
-    help="The size of decoder rnn unit. (default: %(default)d)")
-parser.add_argument(
-    "--batch_size",
-    type=int,
-    default=16,
-    help="The sequence number of a mini-batch data. (default: %(default)d)")
-parser.add_argument(
-    '--skip_batch_num',
-    type=int,
-    default=5,
-    help='The first num of minibatch num to skip, for better performance test')
-parser.add_argument(
-    '--iterations', type=int, default=80, help='The number of minibatches.')
-parser.add_argument(
-    "--dict_size",
-    type=int,
-    default=30000,
-    help="The dictionary capacity. Dictionaries of source sequence and "
-    "target dictionary have same capacity. (default: %(default)d)")
-parser.add_argument(
-    "--pass_num",
-    type=int,
-    default=2,
-    help="The pass number to train. (default: %(default)d)")
-parser.add_argument(
-    "--learning_rate",
-    type=float,
-    default=0.0002,
-    help="Learning rate used to train the model. (default: %(default)f)")
-parser.add_argument(
-    "--infer_only", action='store_true', help="If set, run forward only.")
-parser.add_argument(
-    "--beam_size",
-    type=int,
-    default=3,
-    help="The width for beam searching. (default: %(default)d)")
-parser.add_argument(
-    '--device',
-    type=str,
-    default='GPU',
-    choices=['CPU', 'GPU'],
-    help="The device type.")
-parser.add_argument(
-    "--max_length",
-    type=int,
-    default=250,
-    help="The maximum length of sequence when doing generation. "
-    "(default: %(default)d)")
-parser.add_argument(
-    '--with_test',
-    action='store_true',
-    help='If set, test the testset during training.')
-
 
 def lstm_step(x_t, hidden_t_prev, cell_t_prev, size):
     def linear(inputs):
@@ -264,116 +196,39 @@ def lodtensor_to_ndarray(lod_tensor):
     return ndarray
 
 
-def train():
+def get_model(args):
+    if args.use_reader_op:
+        raise Exception("machine_translation do not support reader op for now.")
+    embedding_dim = 512
+    encoder_size = 512
+    decoder_size = 512
+    dict_size = 30000
+    beam_size = 3
+    max_length = 250
     avg_cost, feeding_list = seq_to_seq_net(
-        args.embedding_dim,
-        args.encoder_size,
-        args.decoder_size,
-        args.dict_size,
-        args.dict_size,
+        embedding_dim,
+        encoder_size,
+        decoder_size,
+        dict_size,
+        dict_size,
         False,
-        beam_size=args.beam_size,
-        max_length=args.max_length)
+        beam_size=beam_size,
+        max_length=max_length)
 
     # clone from default main program
     inference_program = fluid.default_main_program().clone()
 
     optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
-    optimizer.minimize(avg_cost)
-
-    fluid.memory_optimize(fluid.default_main_program())
 
     train_batch_generator = paddle.batch(
         paddle.reader.shuffle(
-            paddle.dataset.wmt14.train(args.dict_size), buf_size=1000),
-        batch_size=args.batch_size)
+            paddle.dataset.wmt14.train(dict_size), buf_size=1000),
+        batch_size=args.batch_size * args.gpus)
 
     test_batch_generator = paddle.batch(
         paddle.reader.shuffle(
-            paddle.dataset.wmt14.test(args.dict_size), buf_size=1000),
+            paddle.dataset.wmt14.test(dict_size), buf_size=1000),
         batch_size=args.batch_size)
 
-    place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
-    exe = Executor(place)
-    exe.run(framework.default_startup_program())
-
-    def do_validation():
-        total_loss = 0.0
-        count = 0
-        for batch_id, data in enumerate(test_batch_generator()):
-            src_seq = to_lodtensor(map(lambda x: x[0], data), place)[0]
-            trg_seq = to_lodtensor(map(lambda x: x[1], data), place)[0]
-            lbl_seq = to_lodtensor(map(lambda x: x[2], data), place)[0]
-
-            fetch_outs = exe.run(inference_program,
-                                 feed={
-                                     feeding_list[0]: src_seq,
-                                     feeding_list[1]: trg_seq,
-                                     feeding_list[2]: lbl_seq
-                                 },
-                                 fetch_list=[avg_cost],
-                                 return_numpy=False)
-
-            total_loss += lodtensor_to_ndarray(fetch_outs[0])[0]
-            count += 1
-
-        return total_loss / count
-
-    iters, num_samples, start_time = 0, 0, time.time()
-    for pass_id in xrange(args.pass_num):
-        train_accs = []
-        train_losses = []
-        for batch_id, data in enumerate(train_batch_generator()):
-            if iters == args.skip_batch_num:
-                start_time = time.time()
-                num_samples = 0
-            if iters == args.iterations:
-                break
-            src_seq, word_num = to_lodtensor(map(lambda x: x[0], data), place)
-            num_samples += word_num
-            trg_seq, word_num = to_lodtensor(map(lambda x: x[1], data), place)
-            num_samples += word_num
-            lbl_seq, _ = to_lodtensor(map(lambda x: x[2], data), place)
-
-            fetch_outs = exe.run(framework.default_main_program(),
-                                 feed={
-                                     feeding_list[0]: src_seq,
-                                     feeding_list[1]: trg_seq,
-                                     feeding_list[2]: lbl_seq
-                                 },
-                                 fetch_list=[avg_cost])
-
-            iters += 1
-            loss = np.array(fetch_outs[0])
-            print(
-                "Pass = %d, Iter = %d, Loss = %f" % (pass_id, iters, loss)
-            )  # The accuracy is the accumulation of batches, but not the current batch.
-
-        train_elapsed = time.time() - start_time
-        examples_per_sec = num_samples / train_elapsed
-        print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' %
-              (num_samples, train_elapsed, examples_per_sec))
-        # evaluation
-        if args.with_test:
-            test_loss = do_validation()
-        exit(0)
-
-
-def infer():
-    pass
-
-
-def print_arguments(args):
-    print('----------- seq2seq Configuration Arguments -----------')
-    for arg, value in sorted(vars(args).iteritems()):
-        print('%s: %s' % (arg, value))
-    print('------------------------------------------------')
-
-
-if __name__ == '__main__':
-    args = parser.parse_args()
-    print_arguments(args)
-    if args.infer_only:
-        infer()
-    else:
-        train()
+    return avg_cost, inference_program, optimizer, train_batch_generator, \
+           test_batch_generator, None
diff --git a/benchmark/fluid/models/mnist.py b/benchmark/fluid/models/mnist.py
new file mode 100644
index 0000000000000000000000000000000000000000..8e740dc6896b7eeeb82170aa13d32987c4df5c48
--- /dev/null
+++ b/benchmark/fluid/models/mnist.py
@@ -0,0 +1,125 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import argparse
+import time
+import cProfile
+import os
+
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.profiler as profiler
+
+SEED = 1
+DTYPE = "float32"
+
+# random seed must set before configuring the network.
+# fluid.default_startup_program().random_seed = SEED
+
+
+def cnn_model(data):
+    conv_pool_1 = fluid.nets.simple_img_conv_pool(
+        input=data,
+        filter_size=5,
+        num_filters=20,
+        pool_size=2,
+        pool_stride=2,
+        act="relu")
+    conv_pool_2 = fluid.nets.simple_img_conv_pool(
+        input=conv_pool_1,
+        filter_size=5,
+        num_filters=50,
+        pool_size=2,
+        pool_stride=2,
+        act="relu")
+
+    # TODO(dzhwinter) : refine the initializer and random seed settting
+    SIZE = 10
+    input_shape = conv_pool_2.shape
+    param_shape = [reduce(lambda a, b: a * b, input_shape[1:], 1)] + [SIZE]
+    scale = (2.0 / (param_shape[0]**2 * SIZE))**0.5
+
+    predict = fluid.layers.fc(
+        input=conv_pool_2,
+        size=SIZE,
+        act="softmax",
+        param_attr=fluid.param_attr.ParamAttr(
+            initializer=fluid.initializer.NormalInitializer(
+                loc=0.0, scale=scale)))
+    return predict
+
+
+def get_model(args):
+    if args.use_reader_op:
+        filelist = [
+            os.path.join(args.data_path, f) for f in os.listdir(args.data_path)
+        ]
+        data_file = fluid.layers.open_files(
+            filenames=filelist,
+            shapes=[[-1, 1, 28, 28], (-1, 1)],
+            lod_levels=[0, 0],
+            dtypes=["float32", "int64"],
+            thread_num=args.gpus,
+            pass_num=args.pass_num)
+        data_file = fluid.layers.double_buffer(
+            fluid.layers.batch(
+                data_file, batch_size=args.batch_size))
+        images, label = fluid.layers.read_file(data_file)
+    else:
+        images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE)
+        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+
+    if args.device == 'CPU' and args.cpus > 1:
+        places = fluid.layers.get_places(args.cpus)
+        pd = fluid.layers.ParallelDo(places)
+        with pd.do():
+            predict = cnn_model(pd.read_input(images))
+            label = pd.read_input(label)
+            cost = fluid.layers.cross_entropy(input=predict, label=label)
+            avg_cost = fluid.layers.mean(x=cost)
+            batch_acc = fluid.layers.accuracy(input=predict, label=label)
+
+            pd.write_output(avg_cost)
+            pd.write_output(batch_acc)
+
+        avg_cost, batch_acc = pd()
+        avg_cost = fluid.layers.mean(avg_cost)
+        batch_acc = fluid.layers.mean(batch_acc)
+    else:
+        # Train program
+        predict = cnn_model(images)
+        cost = fluid.layers.cross_entropy(input=predict, label=label)
+        avg_cost = fluid.layers.mean(x=cost)
+
+        # Evaluator
+        batch_acc = fluid.layers.accuracy(input=predict, label=label)
+
+    # inference program
+    inference_program = fluid.default_main_program().clone()
+
+    # Optimization
+    opt = fluid.optimizer.AdamOptimizer(
+        learning_rate=0.001, beta1=0.9, beta2=0.999)
+
+    # Reader
+    train_reader = paddle.batch(
+        paddle.dataset.mnist.train(), batch_size=args.batch_size * args.gpus)
+    test_reader = paddle.batch(
+        paddle.dataset.mnist.test(), batch_size=args.batch_size)
+    return avg_cost, inference_program, opt, train_reader, test_reader, batch_acc
diff --git a/benchmark/fluid/models/resnet.py b/benchmark/fluid/models/resnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..2ee2b5be09bfcc2e7fcec7eb2f80e28e4e75ab3d
--- /dev/null
+++ b/benchmark/fluid/models/resnet.py
@@ -0,0 +1,205 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import functools
+import numpy as np
+import time
+import os
+
+import cProfile, pstats, StringIO
+
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+import paddle.fluid.profiler as profiler
+from recordio_converter import imagenet_train, imagenet_test
+
+
+def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'):
+    conv1 = fluid.layers.conv2d(
+        input=input,
+        filter_size=filter_size,
+        num_filters=ch_out,
+        stride=stride,
+        padding=padding,
+        act=None,
+        bias_attr=False)
+    return fluid.layers.batch_norm(input=conv1, act=act)
+
+
+def shortcut(input, ch_out, stride):
+    ch_in = input.shape[1]  # if args.data_format == 'NCHW' else input.shape[-1]
+    if ch_in != ch_out:
+        return conv_bn_layer(input, ch_out, 1, stride, 0, None)
+    else:
+        return input
+
+
+def basicblock(input, ch_out, stride):
+    short = shortcut(input, ch_out, stride)
+    conv1 = conv_bn_layer(input, ch_out, 3, stride, 1)
+    conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1, act=None)
+    return fluid.layers.elementwise_add(x=short, y=conv2, act='relu')
+
+
+def bottleneck(input, ch_out, stride):
+    short = shortcut(input, ch_out * 4, stride)
+    conv1 = conv_bn_layer(input, ch_out, 1, stride, 0)
+    conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1)
+    conv3 = conv_bn_layer(conv2, ch_out * 4, 1, 1, 0, act=None)
+    return fluid.layers.elementwise_add(x=short, y=conv3, act='relu')
+
+
+def layer_warp(block_func, input, ch_out, count, stride):
+    res_out = block_func(input, ch_out, stride)
+    for i in range(1, count):
+        res_out = block_func(res_out, ch_out, 1)
+    return res_out
+
+
+def resnet_imagenet(input, class_dim, depth=50, data_format='NCHW'):
+
+    cfg = {
+        18: ([2, 2, 2, 1], basicblock),
+        34: ([3, 4, 6, 3], basicblock),
+        50: ([3, 4, 6, 3], bottleneck),
+        101: ([3, 4, 23, 3], bottleneck),
+        152: ([3, 8, 36, 3], bottleneck)
+    }
+    stages, block_func = cfg[depth]
+    conv1 = conv_bn_layer(input, ch_out=64, filter_size=7, stride=2, padding=3)
+    pool1 = fluid.layers.pool2d(
+        input=conv1, pool_type='avg', pool_size=3, pool_stride=2)
+    res1 = layer_warp(block_func, pool1, 64, stages[0], 1)
+    res2 = layer_warp(block_func, res1, 128, stages[1], 2)
+    res3 = layer_warp(block_func, res2, 256, stages[2], 2)
+    res4 = layer_warp(block_func, res3, 512, stages[3], 2)
+    pool2 = fluid.layers.pool2d(
+        input=res4,
+        pool_size=7,
+        pool_type='avg',
+        pool_stride=1,
+        global_pooling=True)
+    out = fluid.layers.fc(input=pool2, size=class_dim, act='softmax')
+    return out
+
+
+def resnet_cifar10(input, class_dim, depth=32, data_format='NCHW'):
+    assert (depth - 2) % 6 == 0
+
+    n = (depth - 2) // 6
+
+    conv1 = conv_bn_layer(
+        input=input, ch_out=16, filter_size=3, stride=1, padding=1)
+    res1 = layer_warp(basicblock, conv1, 16, n, 1)
+    res2 = layer_warp(basicblock, res1, 32, n, 2)
+    res3 = layer_warp(basicblock, res2, 64, n, 2)
+    pool = fluid.layers.pool2d(
+        input=res3, pool_size=8, pool_type='avg', pool_stride=1)
+    out = fluid.layers.fc(input=pool, size=class_dim, act='softmax')
+    return out
+
+
+def get_model(args):
+    model = resnet_cifar10
+    if args.data_set == "cifar10":
+        class_dim = 10
+        if args.data_format == 'NCHW':
+            dshape = [3, 32, 32]
+        else:
+            dshape = [32, 32, 3]
+        model = resnet_cifar10
+        train_reader = paddle.dataset.cifar.train10()
+        test_reader = paddle.dataset.cifar.test10()
+    elif args.data_set == "flowers":
+        class_dim = 102
+        if args.data_format == 'NCHW':
+            dshape = [3, 224, 224]
+        else:
+            dshape = [224, 224, 3]
+        model = resnet_imagenet
+        train_reader = paddle.dataset.flowers.train()
+        test_reader = paddle.dataset.flowers.test()
+    elif args.data_set == "imagenet":
+        class_dim = 1000
+        if args.data_format == 'NCHW':
+            dshape = [3, 224, 224]
+        else:
+            dshape = [224, 224, 3]
+        model = resnet_imagenet
+        if not args.data_path:
+            raise Exception(
+                "Must specify --data_path when training with imagenet")
+        train_reader = imagenet_train(args.data_path)
+        test_reader = imagenet_test(args.data_path)
+
+    if args.use_reader_op:
+        filelist = [
+            os.path.join(args.data_path, f) for f in os.listdir(args.data_path)
+        ]
+        data_file = fluid.layers.open_files(
+            filenames=filelist,
+            shapes=[[-1] + dshape, (-1, 1)],
+            lod_levels=[0, 0],
+            dtypes=["float32", "int64"],
+            thread_num=args.gpus,
+            pass_num=args.pass_num)
+        data_file = fluid.layers.double_buffer(
+            fluid.layers.batch(
+                data_file, batch_size=args.batch_size))
+        input, label = fluid.layers.read_file(data_file)
+    else:
+        input = fluid.layers.data(name='data', shape=dshape, dtype='float32')
+        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+
+    if args.device == 'CPU' and args.cpus > 1:
+        places = fluid.layers.get_places(args.cpus)
+        pd = fluid.layers.ParallelDo(places)
+        with pd.do():
+            predict = model(pd.read_input(input), class_dim)
+            label = pd.read_input(label)
+            cost = fluid.layers.cross_entropy(input=predict, label=label)
+            avg_cost = fluid.layers.mean(x=cost)
+            batch_acc = fluid.layers.accuracy(input=predict, label=label)
+
+            pd.write_output(avg_cost)
+            pd.write_output(batch_acc)
+
+        avg_cost, batch_acc = pd()
+        avg_cost = fluid.layers.mean(avg_cost)
+        batch_acc = fluid.layers.mean(batch_acc)
+    else:
+        predict = model(input, class_dim)
+        cost = fluid.layers.cross_entropy(input=predict, label=label)
+        avg_cost = fluid.layers.mean(x=cost)
+        batch_acc = fluid.layers.accuracy(input=predict, label=label)
+
+    inference_program = fluid.default_main_program().clone()
+    with fluid.program_guard(inference_program):
+        inference_program = fluid.io.get_inference_program(
+            target_vars=[batch_acc])
+
+    optimizer = fluid.optimizer.Momentum(learning_rate=0.01, momentum=0.9)
+
+    batched_train_reader = paddle.batch(
+        paddle.reader.shuffle(
+            train_reader, buf_size=5120),
+        batch_size=args.batch_size * args.gpus)
+    batched_test_reader = paddle.batch(train_reader, batch_size=args.batch_size)
+
+    return avg_cost, inference_program, optimizer, batched_train_reader, batched_test_reader, batch_acc
diff --git a/benchmark/fluid/stacked_dynamic_lstm.py b/benchmark/fluid/models/stacked_dynamic_lstm.py
similarity index 50%
rename from benchmark/fluid/stacked_dynamic_lstm.py
rename to benchmark/fluid/models/stacked_dynamic_lstm.py
index 73bcc47b4d404af2c01d61ca3dfb11971bbcfe9c..e1c4857f1a365f6480929ea57296a9801f5ea945 100644
--- a/benchmark/fluid/stacked_dynamic_lstm.py
+++ b/benchmark/fluid/models/stacked_dynamic_lstm.py
@@ -29,57 +29,6 @@ import paddle.fluid as fluid
 import paddle.batch as batch
 import paddle.fluid.profiler as profiler
 
-
-def parse_args():
-    parser = argparse.ArgumentParser("Understand Sentiment by Dynamic RNN.")
-    parser.add_argument(
-        '--batch_size',
-        type=int,
-        default=32,
-        help='The sequence number of a batch data. (default: %(default)d)')
-    parser.add_argument(
-        '--skip_batch_num',
-        type=int,
-        default=5,
-        help='The first num of minibatch num to skip, for better performance test'
-    )
-    parser.add_argument(
-        '--iterations', type=int, default=80, help='The number of minibatches.')
-    parser.add_argument(
-        '--emb_dim',
-        type=int,
-        default=512,
-        help='Dimension of embedding table. (default: %(default)d)')
-    parser.add_argument(
-        '--hidden_dim',
-        type=int,
-        default=512,
-        help='Hidden size of lstm unit. (default: %(default)d)')
-    parser.add_argument(
-        '--pass_num',
-        type=int,
-        default=100,
-        help='Epoch number to train. (default: %(default)d)')
-    parser.add_argument(
-        '--device',
-        type=str,
-        default='CPU',
-        choices=['CPU', 'GPU'],
-        help='The device type.')
-    parser.add_argument(
-        '--crop_size',
-        type=int,
-        default=int(os.environ.get('CROP_SIZE', '1500')),
-        help='The max sentence length of input. Since this model use plain RNN,'
-        ' Gradient could be explored if sentence is too long')
-    parser.add_argument(
-        '--with_test',
-        action='store_true',
-        help='If set, test the testset during training.')
-    args = parser.parse_args()
-    return args
-
-
 word_dict = imdb.word_dict()
 
 
@@ -94,14 +43,18 @@ def crop_sentence(reader, crop_size):
     return __impl__
 
 
-def main():
-    args = parse_args()
-    lstm_size = args.hidden_dim
+def get_model(args):
+    if args.use_reader_op:
+        raise Exception(
+            "stacked_dynamic_lstm do not support reader op for now.")
+    lstm_size = 512
+    emb_dim = 512
+    crop_size = 1500
 
     data = fluid.layers.data(
         name="words", shape=[1], lod_level=1, dtype='int64')
     sentence = fluid.layers.embedding(
-        input=data, size=[len(word_dict), args.emb_dim])
+        input=data, size=[len(word_dict), emb_dim])
 
     sentence = fluid.layers.fc(input=sentence, size=lstm_size, act='tanh')
 
@@ -151,9 +104,8 @@ def main():
     loss = fluid.layers.mean(x=loss)
 
     # add acc
-    batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
     batch_acc = fluid.layers.accuracy(input=logit, label=fluid.layers.data(name='label', \
-                shape=[1], dtype='int64'), total=batch_size_tensor)
+                shape=[1], dtype='int64'))
 
     inference_program = fluid.default_main_program().clone()
     with fluid.program_guard(inference_program):
@@ -161,51 +113,17 @@ def main():
             target_vars=[batch_acc, batch_size_tensor])
 
     adam = fluid.optimizer.Adam()
-    adam.minimize(loss)
-
-    fluid.memory_optimize(fluid.default_main_program())
-
-    place = fluid.CPUPlace() if args.device == 'CPU' else fluid.CUDAPlace(0)
-    exe = fluid.Executor(place)
-    exe.run(fluid.default_startup_program())
 
     train_reader = batch(
         paddle.reader.shuffle(
-            crop_sentence(imdb.train(word_dict), args.crop_size),
-            buf_size=25000),
+            crop_sentence(imdb.train(word_dict), crop_size), buf_size=25000),
+        batch_size=args.batch_size * args.gpus)
+    test_reader = batch(
+        paddle.reader.shuffle(
+            crop_sentence(imdb.test(word_dict), crop_size), buf_size=25000),
         batch_size=args.batch_size)
 
-    iters, num_samples, start_time = 0, 0, time.time()
-    for pass_id in range(args.pass_num):
-        train_accs = []
-        train_losses = []
-        for batch_id, data in enumerate(train_reader()):
-            if iters == args.skip_batch_num:
-                start_time = time.time()
-                num_samples = 0
-            if iters == args.iterations:
-                break
-            tensor_words = to_lodtensor([x[0] for x in data], place)
-            label = numpy.array([x[1] for x in data]).astype("int64")
-            label = label.reshape((-1, 1))
-            loss_np, acc, weight = exe.run(
-                fluid.default_main_program(),
-                feed={"words": tensor_words,
-                      "label": label},
-                fetch_list=[loss, batch_acc, batch_size_tensor])
-            iters += 1
-            for x in data:
-                num_samples += len(x[0])
-            print(
-                "Pass = %d, Iter = %d, Loss = %f, Accuracy = %f" %
-                (pass_id, iters, loss_np, acc)
-            )  # The accuracy is the accumulation of batches, but not the current batch.
-
-        train_elapsed = time.time() - start_time
-        examples_per_sec = num_samples / train_elapsed
-        print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' %
-              (num_samples, train_elapsed, examples_per_sec))
-        exit(0)
+    return loss, inference_program, adam, train_reader, test_reader, batch_acc
 
 
 def to_lodtensor(data, place):
@@ -221,16 +139,3 @@ def to_lodtensor(data, place):
     res.set(flattened_data, place)
     res.set_lod([lod])
     return res
-
-
-def print_arguments(args):
-    print('----------- lstm Configuration Arguments -----------')
-    for arg, value in sorted(vars(args).iteritems()):
-        print('%s: %s' % (arg, value))
-    print('------------------------------------------------')
-
-
-if __name__ == '__main__':
-    args = parse_args()
-    print_arguments(args)
-    main()
diff --git a/benchmark/fluid/models/vgg.py b/benchmark/fluid/models/vgg.py
new file mode 100644
index 0000000000000000000000000000000000000000..6092cdeb884b3a9b60a3bcf20b022f2b0685e6aa
--- /dev/null
+++ b/benchmark/fluid/models/vgg.py
@@ -0,0 +1,120 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""VGG16 benchmark in Fluid"""
+from __future__ import print_function
+
+import sys
+import time
+import numpy as np
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+import argparse
+import functools
+import os
+
+
+def vgg16_bn_drop(input):
+    def conv_block(input, num_filter, groups, dropouts):
+        return fluid.nets.img_conv_group(
+            input=input,
+            pool_size=2,
+            pool_stride=2,
+            conv_num_filter=[num_filter] * groups,
+            conv_filter_size=3,
+            conv_act='relu',
+            conv_with_batchnorm=True,
+            conv_batchnorm_drop_rate=dropouts,
+            pool_type='max')
+
+    conv1 = conv_block(input, 64, 2, [0.3, 0])
+    conv2 = conv_block(conv1, 128, 2, [0.4, 0])
+    conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0])
+    conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0])
+    conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0])
+
+    drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5)
+    fc1 = fluid.layers.fc(input=drop, size=512, act=None)
+    bn = fluid.layers.batch_norm(input=fc1, act='relu')
+    drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5)
+    fc2 = fluid.layers.fc(input=drop2, size=512, act=None)
+    return fc2
+
+
+def get_model(args):
+    if args.data_set == "cifar10":
+        classdim = 10
+        if args.data_format == 'NCHW':
+            data_shape = [3, 32, 32]
+        else:
+            data_shape = [32, 32, 3]
+    else:
+        classdim = 102
+        if args.data_format == 'NCHW':
+            data_shape = [3, 224, 224]
+        else:
+            data_shape = [224, 224, 3]
+
+    if args.use_reader_op:
+        filelist = [
+            os.path.join(args.data_path, f) for f in os.listdir(args.data_path)
+        ]
+        data_file = fluid.layers.open_files(
+            filenames=filelist,
+            shapes=[[-1] + data_shape, (-1, 1)],
+            lod_levels=[0, 0],
+            dtypes=["float32", "int64"],
+            thread_num=args.gpus,
+            pass_num=args.pass_num)
+        data_file = fluid.layers.double_buffer(
+            fluid.layers.batch(
+                data_file, batch_size=args.batch_size))
+        images, label = fluid.layers.read_file(data_file)
+    else:
+        images = fluid.layers.data(name='data', shape=dshape, dtype='float32')
+        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+
+    # Train program
+    net = vgg16_bn_drop(images)
+    predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
+    cost = fluid.layers.cross_entropy(input=predict, label=label)
+    avg_cost = fluid.layers.mean(x=cost)
+
+    # Evaluator
+    batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
+    batch_acc = fluid.layers.accuracy(
+        input=predict, label=label, total=batch_size_tensor)
+
+    # inference program
+    inference_program = fluid.default_main_program().clone()
+    with fluid.program_guard(inference_program):
+        inference_program = fluid.io.get_inference_program(
+            target_vars=[batch_acc, batch_size_tensor])
+
+    # Optimization
+    optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
+
+    # data reader
+    train_reader = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.cifar.train10()
+            if args.data_set == 'cifar10' else paddle.dataset.flowers.train(),
+            buf_size=5120),
+        batch_size=args.batch_size * args.gpus)
+    test_reader = paddle.batch(
+        paddle.dataset.cifar.test10()
+        if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
+        batch_size=args.batch_size)
+
+    return avg_cost, inference_program, optimizer, train_reader, test_reader, batch_acc
diff --git a/benchmark/fluid/recordio_converter.py b/benchmark/fluid/recordio_converter.py
new file mode 100644
index 0000000000000000000000000000000000000000..f2dc39109bf1beaf147b046560c92fbd2416d8e6
--- /dev/null
+++ b/benchmark/fluid/recordio_converter.py
@@ -0,0 +1,164 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import random
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from paddle.dataset import mnist, cifar, flowers, image
+
+
+def convert_2_recordio(py_reader, outfilepath, batch_size, shape_data,
+                       shape_label):
+    num_batches = 0
+    with fluid.program_guard(fluid.Program(), fluid.Program()):
+        reader = paddle.batch(py_reader(), batch_size=batch_size)
+        feeder = fluid.DataFeeder(
+            feed_list=[  # order is image and label
+                fluid.layers.data(
+                    name='image', shape=shape_data),
+                fluid.layers.data(
+                    name='label', shape=shape_label, dtype='int64'),
+            ],
+            place=fluid.CPUPlace())
+        num_batches = fluid.recordio_writer.convert_reader_to_recordio_file(
+            outfilepath, reader, feeder)
+    return num_batches
+
+
+def prepare_mnist(outpath, batch_size):
+    outfilepath = os.path.join(outpath, "mnist.recordio")
+    convert_2_recordio(mnist.train, outfilepath, batch_size, [784], [1])
+
+
+def prepare_cifar10(outpath, batch_size):
+    outfilepath = os.path.join(outpath, "cifar.recordio")
+    convert_2_recordio(cifar.train10, outfilepath, batch_size, [3, 32, 32], [1])
+
+
+def prepare_flowers(outpath, batch_size):
+    outfilepath = os.path.join(outpath, "flowers.recordio")
+    convert_2_recordio(flowers.train, outfilepath, batch_size, [3, 224, 224],
+                       [1])
+
+
+def default_mapper(sample):
+    img, label = sample
+    img = image.simple_transform(
+        img, 256, 224, True, mean=[103.94, 116.78, 123.68])
+    return img.flatten().astype('float32'), label
+
+
+def imagenet_train(data_dir):
+    contents = os.listdir(data_dir)
+    if set(contents) != set(
+        ["train", "train.txt", "val", "val_set", "val.txt", "unzip.sh"]):
+        raise Exception("Imagenet data contents error!")
+    img2label = dict()
+    imgfilelist = []
+    with open(os.path.join(data_dir, "train.txt")) as fn:
+        while 1:
+            l = fn.readline()
+            if not l:
+                break
+            img, lbl = l[:-1].split(" ")
+            img2label[img] = int(lbl)
+            imgfilelist.append(img)
+    # shuffle all, this is slow
+    random.shuffle(imgfilelist)
+
+    def train_reader():
+        for idx, imgfile in enumerate(imgfilelist):
+            data = image.load_image(
+                os.path.join(data_dir, "train", imgfile.lower()))
+            label = [img2label[imgfile], ]
+            yield [data, label]
+
+    return paddle.reader.map_readers(default_mapper, train_reader)
+
+
+def imagenet_test(data_dir):
+    contents = os.listdir(data_dir)
+    if set(contents) != set(
+        ["train", "train.txt", "val", "val_set", "val.txt", "unzip.sh"]):
+        raise Exception("Imagenet data contents error!")
+    img2label = dict()
+    imgfilelist = []
+    with open(os.path.join(data_dir, "val.txt")) as fn:
+        while 1:
+            l = fn.readline()
+            if not l:
+                break
+            img, lbl = l[:-1].split(" ")
+            img2label[img] = int(lbl)
+            imgfilelist.append(img)
+
+    def test_reader():
+        for idx, imgfile in enumerate(imgfilelist):
+            base_path = os.path.join(data_dir, "val", imgfile.split(".")[0])
+            image_path = ".".join([base_path, "jpeg"])
+            data = image.load_image(image_path)
+            label = [img2label[imgfile], ]
+            yield [data, label]
+
+    return paddle.reader.map_readers(default_mapper, test_reader)
+
+
+# FIXME(wuyi): delete this when https://github.com/PaddlePaddle/Paddle/pull/11066 is merged
+def convert_reader_to_recordio_files(
+        filename,
+        batch_per_file,
+        reader_creator,
+        feeder,
+        compressor=core.RecordIOWriter.Compressor.Snappy,
+        max_num_records=1000,
+        feed_order=None):
+    if feed_order is None:
+        feed_order = feeder.feed_names
+    f_name, f_ext = os.path.splitext(filename)
+    assert (f_ext == ".recordio")
+
+    lines = []
+    f_idx = 0
+    counter = 0
+    for idx, batch in enumerate(reader_creator()):
+        lines.append(batch)
+        if idx >= batch_per_file and idx % batch_per_file == 0:
+            filename = "%s-%05d%s" % (f_name, f_idx, f_ext)
+            with fluid.recordio_writer.create_recordio_writer(
+                    filename, compressor, max_num_records) as writer:
+                for l in lines:
+                    res = feeder.feed(l)
+                    for each in feed_order:
+                        writer.append_tensor(res[each])
+                    writer.complete_append_tensor()
+                    counter += 1
+                lines = []
+                f_idx += 1
+            print("written file: ", filename)
+    return counter
+
+
+def prepare_imagenet(inpath, outpath, batch_size):
+    r = paddle.batch(imagenet_train(inpath), batch_size=batch_size)
+    feeder = fluid.DataFeeder(
+        feed_list=[
+            fluid.layers.data(
+                name="image", shape=[3, 224, 224]), fluid.layers.data(
+                    name="label", shape=[1], dtype='int64')
+        ],
+        place=fluid.CPUPlace())
+    outpath = os.path.join(outpath, "imagenet.recordio")
+    convert_reader_to_recordio_files(outpath, 10000, r, feeder)
diff --git a/benchmark/fluid/resnet.py b/benchmark/fluid/resnet.py
deleted file mode 100644
index 831fa2c019fc2868cd85b1ca7b2c8c76a2f1628c..0000000000000000000000000000000000000000
--- a/benchmark/fluid/resnet.py
+++ /dev/null
@@ -1,313 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import argparse
-import functools
-import numpy as np
-import time
-
-import cProfile, pstats, StringIO
-
-import paddle
-import paddle.fluid as fluid
-import paddle.fluid.core as core
-import paddle.fluid.profiler as profiler
-
-
-def parse_args():
-    parser = argparse.ArgumentParser('Convolution model benchmark.')
-    parser.add_argument(
-        '--model',
-        type=str,
-        choices=['resnet_imagenet', 'resnet_cifar10'],
-        default='resnet_imagenet',
-        help='The model architecture.')
-    parser.add_argument(
-        '--batch_size', type=int, default=32, help='The minibatch size.')
-    parser.add_argument(
-        '--use_fake_data',
-        action='store_true',
-        help='use real data or fake data')
-    parser.add_argument(
-        '--skip_batch_num',
-        type=int,
-        default=5,
-        help='The first num of minibatch num to skip, for better performance test'
-    )
-    parser.add_argument(
-        '--iterations', type=int, default=80, help='The number of minibatches.')
-    parser.add_argument(
-        '--pass_num', type=int, default=100, help='The number of passes.')
-    parser.add_argument(
-        '--data_format',
-        type=str,
-        default='NCHW',
-        choices=['NCHW', 'NHWC'],
-        help='The data data_format, now only support NCHW.')
-    parser.add_argument(
-        '--device',
-        type=str,
-        default='GPU',
-        choices=['CPU', 'GPU'],
-        help='The device type.')
-    parser.add_argument(
-        '--data_set',
-        type=str,
-        default='flowers',
-        choices=['cifar10', 'flowers'],
-        help='Optional dataset for benchmark.')
-    parser.add_argument(
-        '--infer_only', action='store_true', help='If set, run forward only.')
-    parser.add_argument(
-        '--use_cprof', action='store_true', help='If set, use cProfile.')
-    parser.add_argument(
-        '--use_nvprof',
-        action='store_true',
-        help='If set, use nvprof for CUDA.')
-    parser.add_argument(
-        '--with_test',
-        action='store_true',
-        help='If set, test the testset during training.')
-    args = parser.parse_args()
-    return args
-
-
-def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'):
-    conv1 = fluid.layers.conv2d(
-        input=input,
-        filter_size=filter_size,
-        num_filters=ch_out,
-        stride=stride,
-        padding=padding,
-        act=None,
-        bias_attr=False)
-    return fluid.layers.batch_norm(input=conv1, act=act)
-
-
-def shortcut(input, ch_out, stride):
-    ch_in = input.shape[1] if args.data_format == 'NCHW' else input.shape[-1]
-    if ch_in != ch_out:
-        return conv_bn_layer(input, ch_out, 1, stride, 0, None)
-    else:
-        return input
-
-
-def basicblock(input, ch_out, stride):
-    short = shortcut(input, ch_out, stride)
-    conv1 = conv_bn_layer(input, ch_out, 3, stride, 1)
-    conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1, act=None)
-    return fluid.layers.elementwise_add(x=short, y=conv2, act='relu')
-
-
-def bottleneck(input, ch_out, stride):
-    short = shortcut(input, ch_out * 4, stride)
-    conv1 = conv_bn_layer(input, ch_out, 1, stride, 0)
-    conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1)
-    conv3 = conv_bn_layer(conv2, ch_out * 4, 1, 1, 0, act=None)
-    return fluid.layers.elementwise_add(x=short, y=conv3, act='relu')
-
-
-def layer_warp(block_func, input, ch_out, count, stride):
-    res_out = block_func(input, ch_out, stride)
-    for i in range(1, count):
-        res_out = block_func(res_out, ch_out, 1)
-    return res_out
-
-
-def resnet_imagenet(input, class_dim, depth=50, data_format='NCHW'):
-
-    cfg = {
-        18: ([2, 2, 2, 1], basicblock),
-        34: ([3, 4, 6, 3], basicblock),
-        50: ([3, 4, 6, 3], bottleneck),
-        101: ([3, 4, 23, 3], bottleneck),
-        152: ([3, 8, 36, 3], bottleneck)
-    }
-    stages, block_func = cfg[depth]
-    conv1 = conv_bn_layer(input, ch_out=64, filter_size=7, stride=2, padding=3)
-    pool1 = fluid.layers.pool2d(
-        input=conv1, pool_type='avg', pool_size=3, pool_stride=2)
-    res1 = layer_warp(block_func, pool1, 64, stages[0], 1)
-    res2 = layer_warp(block_func, res1, 128, stages[1], 2)
-    res3 = layer_warp(block_func, res2, 256, stages[2], 2)
-    res4 = layer_warp(block_func, res3, 512, stages[3], 2)
-    pool2 = fluid.layers.pool2d(
-        input=res4,
-        pool_size=7,
-        pool_type='avg',
-        pool_stride=1,
-        global_pooling=True)
-    out = fluid.layers.fc(input=pool2, size=class_dim, act='softmax')
-    return out
-
-
-def resnet_cifar10(input, class_dim, depth=32, data_format='NCHW'):
-    assert (depth - 2) % 6 == 0
-
-    n = (depth - 2) // 6
-
-    conv1 = conv_bn_layer(
-        input=input, ch_out=16, filter_size=3, stride=1, padding=1)
-    res1 = layer_warp(basicblock, conv1, 16, n, 1)
-    res2 = layer_warp(basicblock, res1, 32, n, 2)
-    res3 = layer_warp(basicblock, res2, 64, n, 2)
-    pool = fluid.layers.pool2d(
-        input=res3, pool_size=8, pool_type='avg', pool_stride=1)
-    out = fluid.layers.fc(input=pool, size=class_dim, act='softmax')
-    return out
-
-
-def run_benchmark(model, args):
-    if args.use_cprof:
-        pr = cProfile.Profile()
-        pr.enable()
-
-    if args.data_set == "cifar10":
-        class_dim = 10
-        if args.data_format == 'NCHW':
-            dshape = [3, 32, 32]
-        else:
-            dshape = [32, 32, 3]
-    else:
-        class_dim = 102
-        if args.data_format == 'NCHW':
-            dshape = [3, 224, 224]
-        else:
-            dshape = [224, 224, 3]
-
-    input = fluid.layers.data(name='data', shape=dshape, dtype='float32')
-    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-    predict = model(input, class_dim)
-    cost = fluid.layers.cross_entropy(input=predict, label=label)
-    avg_cost = fluid.layers.mean(x=cost)
-
-    batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
-    batch_acc = fluid.layers.accuracy(
-        input=predict, label=label, total=batch_size_tensor)
-
-    inference_program = fluid.default_main_program().clone()
-    with fluid.program_guard(inference_program):
-        inference_program = fluid.io.get_inference_program(
-            target_vars=[batch_acc, batch_size_tensor])
-
-    optimizer = fluid.optimizer.Momentum(learning_rate=0.01, momentum=0.9)
-    opts = optimizer.minimize(avg_cost)
-
-    fluid.memory_optimize(fluid.default_main_program())
-
-    train_reader = paddle.batch(
-        paddle.reader.shuffle(
-            paddle.dataset.cifar.train10()
-            if args.data_set == 'cifar10' else paddle.dataset.flowers.train(),
-            buf_size=5120),
-        batch_size=args.batch_size)
-    test_reader = paddle.batch(
-        paddle.dataset.cifar.test10()
-        if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
-        batch_size=args.batch_size)
-
-    def test(exe):
-        test_accuracy = fluid.average.WeightedAverage()
-        for batch_id, data in enumerate(test_reader()):
-            img_data = np.array(map(lambda x: x[0].reshape(dshape),
-                                    data)).astype("float32")
-            y_data = np.array(map(lambda x: x[1], data)).astype("int64")
-            y_data = y_data.reshape([-1, 1])
-
-            acc, weight = exe.run(inference_program,
-                                  feed={"data": img_data,
-                                        "label": y_data},
-                                  fetch_list=[batch_acc, batch_size_tensor])
-            test_accuracy.add(value=acc, weight=weight)
-
-        return test_accuracy.eval()
-
-    place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
-    exe = fluid.Executor(place)
-    exe.run(fluid.default_startup_program())
-    accuracy = fluid.average.WeightedAverage()
-    if args.use_fake_data:
-        data = train_reader().next()
-        image = np.array(map(lambda x: x[0].reshape(dshape), data)).astype(
-            'float32')
-        label = np.array(map(lambda x: x[1], data)).astype('int64')
-        label = label.reshape([-1, 1])
-
-    iters, num_samples, start_time = 0, 0, time.time()
-    for pass_id in range(args.pass_num):
-        accuracy.reset()
-        train_accs = []
-        train_losses = []
-        for batch_id, data in enumerate(train_reader()):
-            if iters == args.skip_batch_num:
-                start_time = time.time()
-                num_samples = 0
-            if iters == args.iterations:
-                break
-            if not args.use_fake_data:
-                image = np.array(map(lambda x: x[0].reshape(dshape),
-                                     data)).astype('float32')
-                label = np.array(map(lambda x: x[1], data)).astype('int64')
-                label = label.reshape([-1, 1])
-            loss, acc, weight = exe.run(
-                fluid.default_main_program(),
-                feed={'data': image,
-                      'label': label},
-                fetch_list=[avg_cost, batch_acc, batch_size_tensor])
-            iters += 1
-            num_samples += len(label)
-            accuracy.add(value=acc, weight=weight)
-            train_losses.append(loss)
-            train_accs.append(acc)
-            print("Pass: %d, Iter: %d, Loss: %f, Accuracy: %f" %
-                  (pass_id, iters, loss, acc))
-        print("Pass: %d, Loss: %f, Train Accuray: %f\n" %
-              (pass_id, np.mean(train_losses), np.mean(train_accs)))
-        train_elapsed = time.time() - start_time
-        examples_per_sec = num_samples / train_elapsed
-        print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' %
-              (num_samples, train_elapsed, examples_per_sec))
-        # evaluation
-        if args.with_test:
-            pass_test_acc = test(exe)
-        exit(0)
-
-
-def print_arguments(args):
-    vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and
-                                vars(args)['device'] == 'GPU')
-    print('----------- resnet Configuration Arguments -----------')
-    for arg, value in sorted(vars(args).iteritems()):
-        print('%s: %s' % (arg, value))
-    print('------------------------------------------------')
-
-
-if __name__ == '__main__':
-    model_map = {
-        'resnet_imagenet': resnet_imagenet,
-        'resnet_cifar10': resnet_cifar10
-    }
-    args = parse_args()
-    print_arguments(args)
-    if args.data_format == 'NHWC':
-        raise ValueError('Only support NCHW data_format now.')
-    if args.use_nvprof and args.device == 'GPU':
-        with profiler.cuda_profiler("cuda_profiler.txt", 'csv') as nvprof:
-            run_benchmark(model_map[args.model], args)
-    else:
-        run_benchmark(model_map[args.model], args)
diff --git a/benchmark/fluid/run.sh b/benchmark/fluid/run.sh
index f6dfd20bf2ee0b668b6d4238d4511253b2233035..5d9b2db87135e53470b106dcd11a6bcfdc5dbda9 100644
--- a/benchmark/fluid/run.sh
+++ b/benchmark/fluid/run.sh
@@ -2,6 +2,7 @@
 # This script benchmarking the PaddlePaddle Fluid on
 # single thread single GPU.
 
+mkdir -p logs
 #export FLAGS_fraction_of_gpu_memory_to_use=0.0
 export CUDNN_PATH=/paddle/cudnn_v5
 
@@ -35,71 +36,74 @@ nohup stdbuf -oL nvidia-smi \
       --format=csv \
       --filename=mem.log  \
       -l 1 &
+
 # mnist
 # mnist gpu mnist 128
-FLAGS_benchmark=true stdbuf -oL python fluid/mnist.py \
+FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
+               --model=mnist \
                --device=GPU \
                --batch_size=128 \
                --skip_batch_num=5 \
                --iterations=500 \
-               2>&1 | tee -a mnist_gpu_128.log
+               2>&1 | tee -a logs/mnist_gpu_128.log
 
 # vgg16
 # gpu cifar10 128
-FLAGS_benchmark=true stdbuf -oL python fluid/vgg16.py \
+FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
+               --model=vgg16 \
                --device=GPU \
                --batch_size=128 \
                --skip_batch_num=5 \
                --iterations=30 \
-               2>&1 | tee -a vgg16_gpu_128.log
+               2>&1 | tee -a logs/vgg16_gpu_128.log
 
 # flowers gpu  128
-FLAGS_benchmark=true stdbuf -oL python fluid/vgg16.py \
+FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
+               --model=vgg16 \
                --device=GPU \
                --batch_size=32 \
                --data_set=flowers \
                --skip_batch_num=5 \
                --iterations=30 \
-               2>&1 | tee -a vgg16_gpu_flowers_32.log
+               2>&1 | tee -a logs/vgg16_gpu_flowers_32.log
 
 # resnet50
 # resnet50 gpu cifar10 128
-FLAGS_benchmark=true stdbuf -oL python fluid/resnet50.py \
+FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
+               --model=resnet \
                --device=GPU \
                --batch_size=128 \
                --data_set=cifar10 \
-               --model=resnet_cifar10 \
                --skip_batch_num=5 \
                --iterations=30 \
-               2>&1 | tee -a resnet50_gpu_128.log
+               2>&1 | tee -a logs/resnet50_gpu_128.log
 
 # resnet50 gpu flowers 64
-FLAGS_benchmark=true stdbuf -oL python fluid/resnet50.py \
+FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
+               --model=resnet \
                --device=GPU \
                --batch_size=64 \
                --data_set=flowers \
-               --model=resnet_imagenet \
                --skip_batch_num=5 \
                --iterations=30 \
-               2>&1 | tee -a resnet50_gpu_flowers_64.log
+               2>&1 | tee -a logs/resnet50_gpu_flowers_64.log
 
 # lstm
 # lstm gpu imdb 32 # tensorflow only support batch=32
-FLAGS_benchmark=true stdbuf -oL python fluid/stacked_dynamic_lstm.py \
+FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
+               --model=stacked_dynamic_lstm \
                --device=GPU \
                --batch_size=32 \
                --skip_batch_num=5 \
                --iterations=30 \
-               --hidden_dim=512 \
-               --emb_dim=512 \
-               --crop_size=1500 \
-               2>&1 | tee -a lstm_gpu_32.log
+               2>&1 | tee -a logs/lstm_gpu_32.log
 
 # seq2seq
 # seq2seq gpu wmb 128
-FLAGS_benchmark=true stdbuf -oL python fluid/machine_translation.py \
+FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
+               --model=machine_translation \
                --device=GPU \
                --batch_size=128 \
                --skip_batch_num=5 \
                --iterations=30 \
-               2>&1 | tee -a lstm_gpu_128.log
+               2>&1 | tee -a logs/lstm_gpu_128.log
diff --git a/benchmark/fluid/run_fluid_benchmark.sh b/benchmark/fluid/run_fluid_benchmark.sh
new file mode 100644
index 0000000000000000000000000000000000000000..4309a3126c1d72fe1eb2d5ec423075aea4d3ec88
--- /dev/null
+++ b/benchmark/fluid/run_fluid_benchmark.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+PADDLE_TRAINING_ROLE=PSERVER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=2 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model resnet --device CPU --update_method pserver --iterations=10000 &
+
+sleep 15
+
+CUDA_VISIBLE_DEVICES=0,1 PADDLE_TRAINING_ROLE=TRAINER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=2 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model resnet --device GPU --update_method pserver --iterations=10000 --gpus 2 &
+
+CUDA_VISIBLE_DEVICES=2,3 PADDLE_TRAINING_ROLE=TRAINER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=2 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=1 python fluid_benchmark.py --model resnet --device GPU --update_method pserver --iterations=10000 --gpus 2 &
diff --git a/benchmark/fluid/vgg.py b/benchmark/fluid/vgg.py
deleted file mode 100644
index 53e34e0cbd15914791c305db6797f826ebfae34e..0000000000000000000000000000000000000000
--- a/benchmark/fluid/vgg.py
+++ /dev/null
@@ -1,224 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""VGG16 benchmark in Fluid"""
-from __future__ import print_function
-
-import sys
-import time
-import numpy as np
-import paddle
-import paddle.fluid as fluid
-import paddle.fluid.core as core
-import argparse
-import functools
-
-parser = argparse.ArgumentParser(description=__doc__)
-parser.add_argument(
-    '--batch_size', type=int, default=128, help="Batch size for training.")
-parser.add_argument(
-    '--skip_batch_num',
-    type=int,
-    default=5,
-    help='The first num of minibatch num to skip, for better performance test')
-parser.add_argument(
-    '--iterations', type=int, default=80, help='The number of minibatches.')
-parser.add_argument(
-    '--learning_rate',
-    type=float,
-    default=1e-3,
-    help="Learning rate for training.")
-parser.add_argument('--pass_num', type=int, default=50, help="No. of passes.")
-parser.add_argument(
-    '--device',
-    type=str,
-    default='GPU',
-    choices=['CPU', 'GPU'],
-    help="The device type.")
-parser.add_argument(
-    '--data_format',
-    type=str,
-    default='NCHW',
-    choices=['NCHW', 'NHWC'],
-    help='The data order, now only support NCHW.')
-parser.add_argument(
-    '--data_set',
-    type=str,
-    default='cifar10',
-    choices=['cifar10', 'flowers'],
-    help='Optional dataset for benchmark.')
-parser.add_argument(
-    '--with_test',
-    action='store_true',
-    help='If set, test the testset during training.')
-args = parser.parse_args()
-
-
-def vgg16_bn_drop(input):
-    def conv_block(input, num_filter, groups, dropouts):
-        return fluid.nets.img_conv_group(
-            input=input,
-            pool_size=2,
-            pool_stride=2,
-            conv_num_filter=[num_filter] * groups,
-            conv_filter_size=3,
-            conv_act='relu',
-            conv_with_batchnorm=True,
-            conv_batchnorm_drop_rate=dropouts,
-            pool_type='max')
-
-    conv1 = conv_block(input, 64, 2, [0.3, 0])
-    conv2 = conv_block(conv1, 128, 2, [0.4, 0])
-    conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0])
-    conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0])
-    conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0])
-
-    drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5)
-    fc1 = fluid.layers.fc(input=drop, size=512, act=None)
-    bn = fluid.layers.batch_norm(input=fc1, act='relu')
-    drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5)
-    fc2 = fluid.layers.fc(input=drop2, size=512, act=None)
-    return fc2
-
-
-def main():
-    if args.data_set == "cifar10":
-        classdim = 10
-        if args.data_format == 'NCHW':
-            data_shape = [3, 32, 32]
-        else:
-            data_shape = [32, 32, 3]
-    else:
-        classdim = 102
-        if args.data_format == 'NCHW':
-            data_shape = [3, 224, 224]
-        else:
-            data_shape = [224, 224, 3]
-
-    # Input data
-    images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
-    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-
-    # Train program
-    net = vgg16_bn_drop(images)
-    predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
-    cost = fluid.layers.cross_entropy(input=predict, label=label)
-    avg_cost = fluid.layers.mean(x=cost)
-
-    # Evaluator
-    batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
-    batch_acc = fluid.layers.accuracy(
-        input=predict, label=label, total=batch_size_tensor)
-
-    # inference program
-    inference_program = fluid.default_main_program().clone()
-    with fluid.program_guard(inference_program):
-        inference_program = fluid.io.get_inference_program(
-            target_vars=[batch_acc, batch_size_tensor])
-
-    # Optimization
-    optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
-    opts = optimizer.minimize(avg_cost)
-
-    fluid.memory_optimize(fluid.default_main_program())
-
-    # Initialize executor
-    place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
-    exe = fluid.Executor(place)
-
-    # Parameter initialization
-    exe.run(fluid.default_startup_program())
-
-    # data reader
-    train_reader = paddle.batch(
-        paddle.reader.shuffle(
-            paddle.dataset.cifar.train10()
-            if args.data_set == 'cifar10' else paddle.dataset.flowers.train(),
-            buf_size=5120),
-        batch_size=args.batch_size)
-    test_reader = paddle.batch(
-        paddle.dataset.cifar.test10()
-        if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
-        batch_size=args.batch_size)
-
-    # test
-    def test(exe):
-        test_accuracy = fluid.average.WeightedAverage()
-        for batch_id, data in enumerate(test_reader()):
-            img_data = np.array(map(lambda x: x[0].reshape(data_shape),
-                                    data)).astype("float32")
-            y_data = np.array(map(lambda x: x[1], data)).astype("int64")
-            y_data = y_data.reshape([-1, 1])
-
-            acc, weight = exe.run(inference_program,
-                                  feed={"pixel": img_data,
-                                        "label": y_data},
-                                  fetch_list=[batch_acc, batch_size_tensor])
-            test_accuracy.add(value=acc, weight=weight)
-        return test_accuracy.eval()
-
-    iters, num_samples, start_time = 0, 0, time.time()
-    accuracy = fluid.average.WeightedAverage()
-    for pass_id in range(args.pass_num):
-        accuracy.reset()
-        train_accs = []
-        train_losses = []
-        for batch_id, data in enumerate(train_reader()):
-            if iters == args.skip_batch_num:
-                start_time = time.time()
-                num_samples = 0
-            if iters == args.iterations:
-                break
-            img_data = np.array(map(lambda x: x[0].reshape(data_shape),
-                                    data)).astype("float32")
-            y_data = np.array(map(lambda x: x[1], data)).astype("int64")
-            y_data = y_data.reshape([-1, 1])
-
-            loss, acc, weight = exe.run(
-                fluid.default_main_program(),
-                feed={"pixel": img_data,
-                      "label": y_data},
-                fetch_list=[avg_cost, batch_acc, batch_size_tensor])
-            accuracy.add(value=acc, weight=weight)
-            iters += 1
-            num_samples += len(y_data)
-            print(
-                "Pass = %d, Iter = %d, Loss = %f, Accuracy = %f" %
-                (pass_id, iters, loss, acc)
-            )  # The accuracy is the accumulation of batches, but not the current batch.
-
-        # pass_train_acc = accuracy.eval()
-        train_losses.append(loss)
-        train_accs.append(acc)
-        print("Pass: %d, Loss: %f, Train Accuray: %f\n" %
-              (pass_id, np.mean(train_losses), np.mean(train_accs)))
-        train_elapsed = time.time() - start_time
-        examples_per_sec = num_samples / train_elapsed
-        print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' %
-              (num_samples, train_elapsed, examples_per_sec))
-        # evaluation
-        if args.with_test:
-            pass_test_acc = test(exe)
-        exit(0)
-
-
-def print_arguments():
-    print('----------- vgg Configuration Arguments -----------')
-    for arg, value in sorted(vars(args).iteritems()):
-        print('%s: %s' % (arg, value))
-    print('------------------------------------------------')
-
-
-if __name__ == "__main__":
-    print_arguments()
-    main()
diff --git a/cmake/configure.cmake b/cmake/configure.cmake
index e490397cc0624c310949a4b571bd00cac6e8953b..4158d0528a1aea52c2a3f0880fe1000183a9df53 100644
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -41,6 +41,10 @@ if(USE_EIGEN_FOR_BLAS)
     add_definitions(-DPADDLE_USE_EIGEN_FOR_BLAS)
 endif(USE_EIGEN_FOR_BLAS)
 
+if(EIGEN_USE_THREADS)
+    add_definitions(-DEIGEN_USE_THREADS)
+endif(EIGEN_USE_THREADS)
+
 if(NOT WITH_PROFILER)
     add_definitions(-DPADDLE_DISABLE_PROFILER)
 endif(NOT WITH_PROFILER)
@@ -88,6 +92,9 @@ if(WITH_GPU)
         if(${CUDNN_MAJOR_VERSION} VERSION_LESS 7)
             message(FATAL_ERROR "TensorRT needs CUDNN >= 7.0 to compile")
         endif()
+        if(${TENSORRT_MAJOR_VERSION} VERSION_LESS 4)
+            message(FATAL_ERROR "Paddle needs TensorRT >= 4.0 to compile")
+        endif()
         include_directories(${TENSORRT_INCLUDE_DIR})
     endif()
 elseif(WITH_AMD_GPU)
diff --git a/cmake/cpplint.cmake b/cmake/cpplint.cmake
deleted file mode 100644
index 4823dc3e91390002aefac70f7931b4197db05789..0000000000000000000000000000000000000000
--- a/cmake/cpplint.cmake
+++ /dev/null
@@ -1,62 +0,0 @@
-# util to check C++ file style
-# * it basically use google cpplint.py.
-# * It provide "add_style_check_target" for cmake.
-#   Usage see add_style_check_target's document
-#
-# TODO(yuyang18): Add python style check.
-
-set(STYLE_FILTER)
-
-# diable unwanted filters
-
-# paddle do not indent public/potected/private in class
-set(STYLE_FILTER "${STYLE_FILTER}-whitespace/indent,")
-# paddle use mutable reference. BUT IT IS NOT RECOMMANDED
-set(STYLE_FILTER "${STYLE_FILTER}-runtime/references,")
-# paddle use relative path for include.
-set(STYLE_FILTER "${STYLE_FILTER}-build/include,")
-# paddle use <thread>, <mutex>, etc.
-set(STYLE_FILTER "${STYLE_FILTER}-build/c++11,")
-# paddle use c style casting. BUT IT IS NOT RECOMMANDED
-set(STYLE_FILTER "${STYLE_FILTER}-readability/casting")
-
-
-# IGNORE SOME FILES
-set(IGNORE_PATTERN
-    .*ImportanceSampler.*
-    .*cblas\\.h.*
-    .*\\.pb\\.txt
-    .*MultiDataProvider.*
-    .*pb.*
-    .*pybind.h)
-
-# add_style_check_target
-#
-# attach check code style step for target.
-#
-# first argument: target name to attach
-# rest arguments: source list to check code style.
-#
-# NOTE: If WITH_STYLE_CHECK is OFF, then this macro just do nothing.
-macro(add_style_check_target TARGET_NAME)
-    if(WITH_STYLE_CHECK)
-        set(SOURCES_LIST ${ARGN})
-        list(REMOVE_DUPLICATES SOURCES_LIST)
-        foreach(filename ${SOURCES_LIST})
-            foreach(pattern ${IGNORE_PATTERN})
-                if(filename MATCHES ${pattern})
-                    list(REMOVE_ITEM SOURCES_LIST ${filename})
-                endif()
-            endforeach()
-        endforeach()
-
-        if(SOURCES_LIST)
-            add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
-                COMMAND "${PYTHON_EXECUTABLE}" "${PADDLE_SOURCE_DIR}/paddle/scripts/cpplint.py"
-                        "--filter=${STYLE_FILTER}"
-                        ${SOURCES_LIST}
-                COMMENT "cpplint: Checking source code style"
-                WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})        
-        endif()
-    endif()
-endmacro()
diff --git a/cmake/external/boost.cmake b/cmake/external/boost.cmake
index 10662fc96704685f030a5d76c6857d4bc20a63d9..73713d93d5a52738651dda498fac5ea66e3589d2 100644
--- a/cmake/external/boost.cmake
+++ b/cmake/external/boost.cmake
@@ -23,8 +23,12 @@ set(BOOST_PROJECT       "extern_boost")
 # checked that the devtools package of CentOS 6 installs boost 1.41.0.
 # So we use 1.41.0 here.
 set(BOOST_VER           "1.41.0")
-set(BOOST_TAR           "boost_1_41_0")
-set(BOOST_URL           "http://paddlepaddledeps.bj.bcebos.com/${BOOST_TAR}.tar.gz")
+if((NOT DEFINED BOOST_TAR) OR (NOT DEFINED BOOST_URL))
+    message(STATUS "use pre defined download url")
+    set(BOOST_TAR "boost_1_41_0" CACHE STRING "" FORCE)
+    set(BOOST_URL "http://paddlepaddledeps.cdn.bcebos.com/${BOOST_TAR}.tar.gz" CACHE STRING "" FORCE)
+endif()
+MESSAGE(STATUS "BOOST_TAR: ${BOOST_TAR}, BOOST_URL: ${BOOST_URL}")
 set(BOOST_SOURCES_DIR ${THIRD_PARTY_PATH}/boost)
 set(BOOST_DOWNLOAD_DIR  "${BOOST_SOURCES_DIR}/src/${BOOST_PROJECT}")
 set(BOOST_INCLUDE_DIR "${BOOST_DOWNLOAD_DIR}/${BOOST_TAR}" CACHE PATH "boost include directory." FORCE)
diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake
index edc93c2773f46ec9e0bf898557c55c93274e6a01..e029300eee9b99582f085f6b650e03f7dacc091a 100644
--- a/cmake/external/eigen.cmake
+++ b/cmake/external/eigen.cmake
@@ -21,11 +21,12 @@ else()
     ExternalProject_Add(
         extern_eigen3
         ${EXTERNAL_PROJECT_LOG_ARGS}
-        GIT_REPOSITORY  "https://github.com/RLovelett/eigen.git"
+        GIT_REPOSITORY  "https://github.com/eigenteam/eigen-git-mirror"
         # eigen on cuda9.1 missing header of math_funtions.hpp
         # https://stackoverflow.com/questions/43113508/math-functions-hpp-not-found-when-using-cuda-with-eigen
         GIT_TAG         917060c364181f33a735dc023818d5a54f60e54c
         PREFIX          ${EIGEN_SOURCE_DIR}
+        DOWNLOAD_NAME   "eigen"
         UPDATE_COMMAND  ""
         CONFIGURE_COMMAND ""
         BUILD_COMMAND     ""
diff --git a/cmake/external/grpc.cmake b/cmake/external/grpc.cmake
index e90948782bb5e333bbdb47ef9d61c1e37e3cf9e4..ffdf91a354bd92bdaf3f88344f0a9256638b568c 100644
--- a/cmake/external/grpc.cmake
+++ b/cmake/external/grpc.cmake
@@ -23,17 +23,29 @@ SET(GRPC_SOURCES_DIR ${THIRD_PARTY_PATH}/grpc)
 SET(GRPC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/grpc)
 SET(GRPC_INCLUDE_DIR "${GRPC_INSTALL_DIR}/include/" CACHE PATH "grpc include directory." FORCE)
 SET(GRPC_CPP_PLUGIN "${GRPC_INSTALL_DIR}/bin/grpc_cpp_plugin" CACHE FILEPATH "GRPC_CPP_PLUGIN" FORCE)
+
+include(ProcessorCount)
+ProcessorCount(NUM_OF_PROCESSOR)
+
 IF(APPLE)
-  SET(BUILD_CMD make -n HAS_SYSTEM_PROTOBUF=false -s -j static grpc_cpp_plugin | sed "s/-Werror//g" | sh)
+  SET(BUILD_CMD make -n HAS_SYSTEM_PROTOBUF=false -s -j ${NUM_OF_PROCESSOR} static grpc_cpp_plugin | sed "s/-Werror//g" | sh)
 ELSE()
-  SET(BUILD_CMD make HAS_SYSTEM_PROTOBUF=false -s -j static grpc_cpp_plugin)
+  SET(BUILD_CMD make HAS_SYSTEM_PROTOBUF=false -s -j ${NUM_OF_PROCESSOR} static grpc_cpp_plugin)
 ENDIF()
 
+# FIXME(wuyi): do not build zlib cares protobuf twice, find a way to build grpc with them
 ExternalProject_Add(
     extern_grpc
     DEPENDS protobuf zlib
-    GIT_REPOSITORY "https://github.com/grpc/grpc.git"
-    GIT_TAG "v1.10.x"
+    # NOTE(wuyi):
+    # this package is generated by following steps:
+    # 1. git clone -b v1.8.x https://github.com/grpc/grpc.git
+    # 2. submodule update --init
+    # 3. keep only zlib, cares, protobuf, boringssl under "third_party",
+    #    checkout and clean other dirs under third_party
+    # 4. remove .git, and package the directory.
+    URL "http://paddlepaddledeps.bj.bcebos.com/grpc-v1.8.x.tar.gz"
+    URL_MD5  "c9c58ee7d0e8929a63155af6a2ecdbd0"
     PREFIX          ${GRPC_SOURCES_DIR}
     UPDATE_COMMAND  ""
     CONFIGURE_COMMAND ""
@@ -46,7 +58,6 @@ ExternalProject_Add(
     INSTALL_COMMAND make prefix=${GRPC_INSTALL_DIR} install
 )
 
-# FIXME(typhoonzero): hack to get static lib path, try a better way like merge them.
 ADD_LIBRARY(grpc++_unsecure STATIC IMPORTED GLOBAL)
 SET_PROPERTY(TARGET grpc++_unsecure PROPERTY IMPORTED_LOCATION
              "${GRPC_INSTALL_DIR}/lib/libgrpc++_unsecure.a")
diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake
index 5759e5c489724332793bf103b7aacf7ffb068611..25c07850dda7b2f69c2207c37b9d2368632104ec 100644
--- a/cmake/external/mkldnn.cmake
+++ b/cmake/external/mkldnn.cmake
@@ -45,15 +45,15 @@ IF(${CBLAS_PROVIDER} STREQUAL "MKLML")
 ELSE()
     MESSAGE(FATAL_ERROR "Should enable MKLML when build MKLDNN")
 ENDIF()
-
-SET(MKLDNN_CFLAG "${CMAKE_C_FLAGS} -Wno-error=strict-overflow")
-SET(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} -Wno-error=strict-overflow")
+SET(MKLDNN_FLAG "-Wno-error=strict-overflow -Wno-error=unused-result -Wno-unused-result")
+SET(MKLDNN_CFLAG "${CMAKE_C_FLAGS} ${MKLDNN_FLAG}")
+SET(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} ${MKLDNN_FLAG}")
 ExternalProject_Add(
     ${MKLDNN_PROJECT}
     ${EXTERNAL_PROJECT_LOG_ARGS}
     DEPENDS             ${MKLDNN_DEPENDS}
     GIT_REPOSITORY      "https://github.com/01org/mkl-dnn.git"
-    GIT_TAG             "v0.11"
+    GIT_TAG             "db3424ad44901513c03a1ea31ccaacdf633fbe9f"
     PREFIX              ${MKLDNN_SOURCES_DIR}
     UPDATE_COMMAND      ""
     CMAKE_ARGS          -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR}
@@ -61,6 +61,7 @@ ExternalProject_Add(
     CMAKE_ARGS          -DMKLROOT=${MKLML_ROOT}
     CMAKE_ARGS          -DCMAKE_C_FLAGS=${MKLDNN_CFLAG}
     CMAKE_ARGS          -DCMAKE_CXX_FLAGS=${MKLDNN_CXXFLAG}
+    CMAKE_ARGS          -DWITH_TEST=OFF -DWITH_EXAMPLE=OFF
     CMAKE_CACHE_ARGS    -DCMAKE_INSTALL_PREFIX:PATH=${MKLDNN_INSTALL_DIR}
                         -DMKLROOT:PATH=${MKLML_ROOT}
 )
diff --git a/cmake/external/mklml.cmake b/cmake/external/mklml.cmake
index 796bcf28a1dfb308ccb7a2f839742c5c2fcf2002..82c424fb79d5596c31891bc395699bf9ff4e7e7e 100644
--- a/cmake/external/mklml.cmake
+++ b/cmake/external/mklml.cmake
@@ -27,8 +27,12 @@ ENDIF()
 INCLUDE(ExternalProject)
 
 SET(MKLML_PROJECT       "extern_mklml")
-SET(MKLML_VER           "mklml_lnx_2018.0.1.20171007")
-SET(MKLML_URL           "http://paddlepaddledeps.bj.bcebos.com/${MKLML_VER}.tgz")
+IF((NOT DEFINED MKLML_VER) OR (NOT DEFINED MKLML_URL))
+  MESSAGE(STATUS "use pre defined download url")
+  SET(MKLML_VER "mklml_lnx_2018.0.3.20180406" CACHE STRING "" FORCE)
+  SET(MKLML_URL "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE)
+ENDIF()
+MESSAGE(STATUS "MKLML_VER: ${MKLML_VER}, MKLML_URL: ${MKLML_URL}")
 SET(MKLML_SOURCE_DIR    "${THIRD_PARTY_PATH}/mklml")
 SET(MKLML_DOWNLOAD_DIR  "${MKLML_SOURCE_DIR}/src/${MKLML_PROJECT}")
 SET(MKLML_DST_DIR       "mklml")
diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake
index 0fde4373a4be58e71ff1a305bd4991cc554d7a34..2665996432b1f6681927320a85d6835094abe4cd 100644
--- a/cmake/external/protobuf.cmake
+++ b/cmake/external/protobuf.cmake
@@ -212,6 +212,7 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
         ${CMAKE_COMMAND} ${PROTOBUF_SOURCES_DIR}/src/${TARGET_NAME}/cmake
             ${OPTIONAL_ARGS}
             -Dprotobuf_BUILD_TESTS=OFF
+            -DCMAKE_SKIP_RPATH=ON
             -DCMAKE_POSITION_INDEPENDENT_CODE=ON
             -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
             -DCMAKE_INSTALL_PREFIX=${PROTOBUF_INSTALL_DIR}
diff --git a/cmake/external/snappy.cmake b/cmake/external/snappy.cmake
index 80282329c6ac65fbd1493a6838efca4bd9cadaad..af09ed4d5d6e21cc50aba5198a7e9ea56f49451a 100644
--- a/cmake/external/snappy.cmake
+++ b/cmake/external/snappy.cmake
@@ -47,8 +47,6 @@ ExternalProject_Add(
                      -DCMAKE_INSTALL_LIBDIR:PATH=${SNAPPY_INSTALL_DIR}/lib
                      -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
                      -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
-    BUILD_COMMAND   make -j8
-    INSTALL_COMMAND make install
 )
 
 add_library(snappy STATIC IMPORTED GLOBAL)
diff --git a/cmake/external/snappystream.cmake b/cmake/external/snappystream.cmake
index 20a96430823d07a07d4bb4602e7fc0cfe55c3bf2..6df636d7fa8757ade73892bda03a80ba9767472b 100644
--- a/cmake/external/snappystream.cmake
+++ b/cmake/external/snappystream.cmake
@@ -46,8 +46,6 @@ ExternalProject_Add(
                         -DCMAKE_INSTALL_PREFIX:PATH=${SNAPPYSTREAM_INSTALL_DIR}
                         -DCMAKE_INSTALL_LIBDIR:PATH=${SNAPPYSTREAM_INSTALL_DIR}/lib
                         -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
-        BUILD_COMMAND   make -j8
-        INSTALL_COMMAND make install
         DEPENDS snappy
 )
 
diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index 1d3e2ade6d393c6e4c37eea0dc1064cdb18808a5..9ddd05b3d9404df29ca1bf634105314b7e6a5b70 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -206,8 +206,6 @@ function(cc_library TARGET_NAME)
         list(APPEND cc_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
       endif()
     endforeach()
-    add_style_check_target(${TARGET_NAME} ${cc_library_SRCS} ${cc_library_HEADERS})
-
   else(cc_library_SRCS)
     if(cc_library_DEPS)
       merge_static_libs(${TARGET_NAME} ${cc_library_DEPS})
@@ -231,7 +229,7 @@ endfunction(cc_binary)
 
 function(cc_test TARGET_NAME)
   if(WITH_TESTING)
-    set(options "")
+    set(options SERIAL)
     set(oneValueArgs "")
     set(multiValueArgs SRCS DEPS ARGS)
     cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
@@ -241,6 +239,9 @@ function(cc_test TARGET_NAME)
     add_test(NAME ${TARGET_NAME}
              COMMAND ${TARGET_NAME} ${cc_test_ARGS}
              WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+    if (${cc_test_SERIAL})
+        set_property(TEST ${TARGET_NAME} PROPERTY SERIAL 1)
+    endif()
   endif()
 endfunction(cc_test)
 
@@ -268,7 +269,6 @@ function(nv_library TARGET_NAME)
           list(APPEND nv_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
         endif()
       endforeach()
-      add_style_check_target(${TARGET_NAME} ${nv_library_SRCS} ${nv_library_HEADERS})
     else(nv_library_SRCS)
       if (nv_library_DEPS)
         merge_static_libs(${TARGET_NAME} ${nv_library_DEPS})
@@ -295,7 +295,7 @@ endfunction(nv_binary)
 
 function(nv_test TARGET_NAME)
   if (WITH_GPU AND WITH_TESTING)
-    set(options "")
+    set(options SERIAL)
     set(oneValueArgs "")
     set(multiValueArgs SRCS DEPS)
     cmake_parse_arguments(nv_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
@@ -303,6 +303,9 @@ function(nv_test TARGET_NAME)
     target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main memory gtest gflags glog)
     add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main memory gtest gflags glog)
     add_test(${TARGET_NAME} ${TARGET_NAME})
+    if (nv_test_SERIAL)
+        set_property(TEST ${TARGET_NAME} PROPERTY SERIAL 1)
+    endif()
   endif()
 endfunction(nv_test)
 
@@ -338,7 +341,6 @@ function(hip_library TARGET_NAME)
 	  list(APPEND hip_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
 	endif()
       endforeach()
-      add_style_check_target(${TARGET_NAME} ${hip_library_SRCS} ${hip_library_HEADERS})
     else(hip_library_SRCS)
       if (hip_library_DEPS)
 	merge_static_libs(${TARGET_NAME} ${hip_library_DEPS})
diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake
index cc758019827b9a5416a801e4da43d754d4492a73..236a55d332a91c88d1c5515e7aca4142930a079f 100644
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -52,66 +52,91 @@ function(copy TARGET)
 endfunction()
 
 # third party
-set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/eigen3")
+set(dst_dir "${FLUID_INSTALL_DIR}/third_party/eigen3")
 copy(eigen3_lib
   SRCS ${EIGEN_INCLUDE_DIR}/Eigen/Core ${EIGEN_INCLUDE_DIR}/Eigen/src ${EIGEN_INCLUDE_DIR}/unsupported/Eigen
   DSTS ${dst_dir}/Eigen ${dst_dir}/Eigen ${dst_dir}/unsupported
+  DEPS eigen3
 )
 
-set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/gflags")
+set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/gflags")
 copy(gflags_lib
   SRCS ${GFLAGS_INCLUDE_DIR} ${GFLAGS_LIBRARIES}
   DSTS ${dst_dir} ${dst_dir}/lib
+  DEPS gflags
 )
 
-set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/glog")
+set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/glog")
 copy(glog_lib
   SRCS ${GLOG_INCLUDE_DIR} ${GLOG_LIBRARIES}
   DSTS ${dst_dir} ${dst_dir}/lib
+  DEPS glog
+)
+
+set(dst_dir "${FLUID_INSTALL_DIR}/third_party/boost/")
+copy(boost_lib
+  SRCS ${BOOST_INCLUDE_DIR}/boost
+  DSTS ${dst_dir}
+  DEPS boost
 )
 
 if(NOT PROTOBUF_FOUND)
-    set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/protobuf")
+    set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/protobuf")
     copy(protobuf_lib
       SRCS ${PROTOBUF_INCLUDE_DIR} ${PROTOBUF_LIBRARY}
       DSTS ${dst_dir} ${dst_dir}/lib
+      DEPS extern_protobuf
     )
 endif()
 
 if(NOT CBLAS_FOUND)
-    set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/openblas")
+    set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/openblas")
     copy(openblas_lib
       SRCS ${CBLAS_INSTALL_DIR}/lib ${CBLAS_INSTALL_DIR}/include
       DSTS ${dst_dir} ${dst_dir}
+      DEPS extern_openblas
     )
 elseif (WITH_MKLML)
-    set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/mklml")
+    set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/mklml")
     copy(mklml_lib
       SRCS ${MKLML_LIB} ${MKLML_IOMP_LIB} ${MKLML_INC_DIR}
       DSTS ${dst_dir}/lib ${dst_dir}/lib ${dst_dir}
+      DEPS mklml
     )
 endif()
 
+if(WITH_MKLDNN)
+  set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/mkldnn")
+  copy(mkldnn_lib
+    SRCS ${MKLDNN_INC_DIR} ${MKLDNN_SHARED_LIB}
+    DSTS ${dst_dir} ${dst_dir}/lib
+    DEPS mkldnn
+  )
+endif()
+
 if(NOT MOBILE_INFERENCE AND NOT RPI)
-  set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/snappy")
+  set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappy")
   copy(snappy_lib
     SRCS ${SNAPPY_INCLUDE_DIR} ${SNAPPY_LIBRARIES}
-    DSTS ${dst_dir} ${dst_dir}/lib)
+    DSTS ${dst_dir} ${dst_dir}/lib
+    DEPS snappy)
 
-  set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/snappystream")
+  set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappystream")
   copy(snappystream_lib
     SRCS ${SNAPPYSTREAM_INCLUDE_DIR} ${SNAPPYSTREAM_LIBRARIES}
-    DSTS ${dst_dir} ${dst_dir}/lib)
+    DSTS ${dst_dir} ${dst_dir}/lib
+    DEPS snappystream)
 
-  set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/zlib")
+  set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/zlib")
   copy(zlib_lib
     SRCS ${ZLIB_INCLUDE_DIR} ${ZLIB_LIBRARIES}
-    DSTS ${dst_dir} ${dst_dir}/lib)
+    DSTS ${dst_dir} ${dst_dir}/lib
+    DEPS zlib)
 endif()
 
 # paddle fluid module
 set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid")
-set(dst_dir "${CMAKE_INSTALL_PREFIX}/paddle/fluid")
+set(dst_dir "${FLUID_INSTALL_DIR}/paddle/fluid")
 set(module "framework")
 copy(framework_lib DEPS framework_py_proto 
   SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h
@@ -142,4 +167,31 @@ copy(string_lib
   DSTS ${dst_dir}/${module} ${dst_dir}/${module}/tinyformat
 )
 
+set(module "pybind")
+copy(pybind_lib
+  SRCS ${CMAKE_CURRENT_BINARY_DIR}/paddle/fluid/${module}/pybind.h
+  DSTS ${dst_dir}/${module}
+)
+
+# CMakeCache Info
+copy(cmake_cache
+  SRCS ${CMAKE_CURRENT_BINARY_DIR}/CMakeCache.txt
+  DSTS ${FLUID_INSTALL_DIR})
+
 add_custom_target(inference_lib_dist DEPENDS ${inference_lib_dist_dep}) 
+
+# paddle fluid version
+execute_process(
+  COMMAND ${GIT_EXECUTABLE} log --pretty=format:%H -1
+  WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}
+  OUTPUT_VARIABLE PADDLE_GIT_COMMIT)
+set(version_file ${FLUID_INSTALL_DIR}/version.txt)
+file(WRITE ${version_file}
+  "GIT COMMIT ID: ${PADDLE_GIT_COMMIT}\n"
+  "WITH_MKL: ${WITH_MKL}\n"
+  "WITH_GPU: ${WITH_GPU}\n")
+if(WITH_GPU)
+  file(APPEND ${version_file}
+    "CUDA version: ${CUDA_VERSION}\n"
+    "CUDNN version: v${CUDNN_MAJOR_VERSION}\n")
+endif()
diff --git a/doc/fluid/CMakeLists.txt b/doc/fluid/CMakeLists.txt
index 8086507bb4b7e870ad6d6091945ed07a00b5100b..be92af3902769a65c77953c9f3cb1f3aa3738d79 100644
--- a/doc/fluid/CMakeLists.txt
+++ b/doc/fluid/CMakeLists.txt
@@ -15,6 +15,9 @@ set(SPHINX_CACHE_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_doctrees")
 # HTML output director
 set(SPHINX_HTML_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/html")
 
+set(IMPORT_PADDLE_STRING "")
+set(IMPORT_PADDLEV2_STRING "")
+
 configure_file(
     "${CMAKE_CURRENT_SOURCE_DIR}/../templates/conf.py.en.in"
     "${BINARY_BUILD_DIR_EN}/conf.py"
@@ -27,8 +30,6 @@ sphinx_add_target(paddle_fluid_docs
                   ${CMAKE_CURRENT_SOURCE_DIR}
                   ${SPHINX_HTML_DIR_EN})
 
-add_dependencies(paddle_fluid_docs gen_proto_py paddle_python)
-
 # configured documentation tools and intermediate build results
 set(BINARY_BUILD_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_build")
 
@@ -50,6 +51,4 @@ sphinx_add_target(paddle_fluid_docs_cn
                   ${CMAKE_CURRENT_SOURCE_DIR}
                   ${SPHINX_HTML_DIR_CN})
 
-add_dependencies(paddle_fluid_docs_cn gen_proto_py paddle_python)
-
 add_subdirectory(api)
diff --git a/doc/fluid/api/CMakeLists.txt b/doc/fluid/api/CMakeLists.txt
index 48b396f0786adad1ba6cd41f72497f853e54bc38..435d6e10fb02e9b2a8147f37da33e8848cc9b98a 100644
--- a/doc/fluid/api/CMakeLists.txt
+++ b/doc/fluid/api/CMakeLists.txt
@@ -7,6 +7,9 @@ set(SPHINX_CACHE_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_doctrees")
 # HTML output director
 set(SPHINX_HTML_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/html")
 
+set(IMPORT_PADDLE_STRING "import paddle")
+set(IMPORT_PADDLEV2_STRING "import paddle.v2")
+
 configure_file(
     "${CMAKE_CURRENT_SOURCE_DIR}/../../templates/conf.py.en.in"
     "${BINARY_BUILD_DIR_EN}/conf.py"
diff --git a/doc/fluid/api/clip.rst b/doc/fluid/api/clip.rst
new file mode 100644
index 0000000000000000000000000000000000000000..3ba096388fc87dda3096a9030fe5749e61112c06
--- /dev/null
+++ b/doc/fluid/api/clip.rst
@@ -0,0 +1,47 @@
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!
+
+====
+clip
+====
+
+ErrorClipByValue
+----------------
+
+..  autoclass:: paddle.fluid.clip.ErrorClipByValue
+    :members:
+    :noindex:
+
+GradientClipByValue
+-------------------
+
+..  autoclass:: paddle.fluid.clip.GradientClipByValue
+    :members:
+    :noindex:
+
+GradientClipByNorm
+------------------
+
+..  autoclass:: paddle.fluid.clip.GradientClipByNorm
+    :members:
+    :noindex:
+
+GradientClipByGlobalNorm
+------------------------
+
+..  autoclass:: paddle.fluid.clip.GradientClipByGlobalNorm
+    :members:
+    :noindex:
+
+append_gradient_clip_ops
+------------------------
+
+..  autofunction:: paddle.fluid.clip.append_gradient_clip_ops
+    :noindex:
+
+error_clip_callback
+-------------------
+
+..  autofunction:: paddle.fluid.clip.error_clip_callback
+    :noindex:
+
diff --git a/doc/fluid/api/evaluator.rst b/doc/fluid/api/evaluator.rst
index f80b87c7d2704a144c02028c4925530a67d11289..c0dc9a0d1d9f2f70948dc3c905dca25d7dd43742 100644
--- a/doc/fluid/api/evaluator.rst
+++ b/doc/fluid/api/evaluator.rst
@@ -5,24 +5,3 @@
 evaluator
 =========
 
-ChunkEvaluator
---------------
-
-..  autoclass:: paddle.fluid.evaluator.ChunkEvaluator
-    :members:
-    :noindex:
-
-EditDistance
---------------
-
-..  autoclass:: paddle.fluid.evaluator.EditDistance
-    :members:
-    :noindex:
-
-DetectionMAP
---------------
-
-..  autoclass:: paddle.fluid.evaluator.DetectionMAP
-    :members:
-    :noindex:
-  
diff --git a/doc/fluid/api/executor.rst b/doc/fluid/api/executor.rst
index a9cdf264e49691afc4b9425b7bfe54f8157ae6c2..f67a14c49f372e67d18ec8e6f87da01109376d22 100644
--- a/doc/fluid/api/executor.rst
+++ b/doc/fluid/api/executor.rst
@@ -30,3 +30,9 @@ switch_scope
 ..  autofunction:: paddle.fluid.executor.switch_scope
     :noindex:
 
+fetch_var
+---------
+
+..  autofunction:: paddle.fluid.executor.fetch_var
+    :noindex:
+
diff --git a/doc/fluid/api/gen_doc.sh b/doc/fluid/api/gen_doc.sh
index ba7b7ba8e51399deb852b0a7c8ddd3128f521e85..0f0539355559446fd91f659d61b636db214b5a40 100755
--- a/doc/fluid/api/gen_doc.sh
+++ b/doc/fluid/api/gen_doc.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 python gen_doc.py layers --submodules control_flow device io nn ops tensor > layers.rst
 
-for module in io data_feeder evaluator executor initializer io nets optimizer param_attr profiler regularizer
+for module in data_feeder clip metrics executor initializer io nets optimizer param_attr profiler regularizer
 do
   python gen_doc.py ${module} > ${module}.rst
 done
diff --git a/doc/fluid/api/index_en.rst b/doc/fluid/api/index_en.rst
index 06c686d9508635abd41571983e00be174e94743e..29cea9c68221b921939e8e09072d87f9f604e21b 100644
--- a/doc/fluid/api/index_en.rst
+++ b/doc/fluid/api/index_en.rst
@@ -9,8 +9,9 @@ Fluid
     data_feeder.rst
     executor.rst
     initializer.rst
-    evaluator.rst
+    metrics.rst
     nets.rst
+    clip.rst
     optimizer.rst
     param_attr.rst
     profiler.rst
diff --git a/doc/fluid/api/initializer.rst b/doc/fluid/api/initializer.rst
index 2f02c5de097945a45a3e053427104bd17bea1279..c49a98c744cdf907630ea8c74791ff2021d996e8 100644
--- a/doc/fluid/api/initializer.rst
+++ b/doc/fluid/api/initializer.rst
@@ -33,11 +33,16 @@ Xavier
     :members:
     :noindex:
 
-MSRA
-------
+force_init_on_cpu
+-----------------
 
-..  autoclass:: paddle.fluid.initializer.MSRA
-    :members:
+..  autofunction:: paddle.fluid.initializer.force_init_on_cpu
+    :noindex:
+
+init_on_cpu
+-----------
+
+..  autofunction:: paddle.fluid.initializer.init_on_cpu
     :noindex:
 
 ConstantInitializer
@@ -68,9 +73,3 @@ XavierInitializer
     :members:
     :noindex:
 
-
-MSRAInitializer
------------------
-..  autoclass:: paddle.fluid.initializer.MSRAInitializer
-    :members:
-    :noindex:
diff --git a/doc/fluid/api/io.rst b/doc/fluid/api/io.rst
index dd9d88b669957c22cd0a07fa4b7e219e2d6e5d61..3e956f8302d261b52f9f76ff8eb4a01f9c6381f8 100644
--- a/doc/fluid/api/io.rst
+++ b/doc/fluid/api/io.rst
@@ -59,3 +59,21 @@ get_inference_program
 ..  autofunction:: paddle.fluid.io.get_inference_program
     :noindex:
 
+save_checkpoint
+---------------
+
+..  autofunction:: paddle.fluid.io.save_checkpoint
+    :noindex:
+
+load_checkpoint
+---------------
+
+..  autofunction:: paddle.fluid.io.load_checkpoint
+    :noindex:
+
+clean_checkpoint
+----------------
+
+..  autofunction:: paddle.fluid.io.clean_checkpoint
+    :noindex:
+
diff --git a/doc/fluid/api/layers.rst b/doc/fluid/api/layers.rst
index ff3c9346a2cd777a5294d536911f39de9032fe52..f78e6db3268e44d5f30d83508f07c4ed68106e48 100644
--- a/doc/fluid/api/layers.rst
+++ b/doc/fluid/api/layers.rst
@@ -55,6 +55,13 @@ While
     :members:
     :noindex:
 
+Switch
+------
+
+..  autoclass:: paddle.fluid.layers.Switch
+    :members:
+    :noindex:
+
 lod_rank_table
 --------------
 
@@ -67,12 +74,6 @@ max_sequence_len
 ..  autofunction:: paddle.fluid.layers.max_sequence_len
     :noindex:
 
-topk
-----
-
-..  autofunction:: paddle.fluid.layers.topk
-    :noindex:
-
 lod_tensor_to_array
 -------------------
 
@@ -109,6 +110,12 @@ less_than
 ..  autofunction:: paddle.fluid.layers.less_than
     :noindex:
 
+equal
+-----
+
+..  autofunction:: paddle.fluid.layers.equal
+    :noindex:
+
 array_read
 ----------
 
@@ -174,6 +181,12 @@ Print
 ..  autofunction:: paddle.fluid.layers.Print
     :noindex:
 
+is_empty
+--------
+
+..  autofunction:: paddle.fluid.layers.is_empty
+    :noindex:
+
 device
 ======
 
@@ -212,6 +225,55 @@ Send
 ..  autofunction:: paddle.fluid.layers.Send
     :noindex:
 
+open_recordio_file
+------------------
+
+..  autofunction:: paddle.fluid.layers.open_recordio_file
+    :noindex:
+
+open_files
+----------
+
+..  autofunction:: paddle.fluid.layers.open_files
+    :noindex:
+
+read_file
+---------
+
+..  autofunction:: paddle.fluid.layers.read_file
+    :noindex:
+
+shuffle
+-------
+
+..  autofunction:: paddle.fluid.layers.shuffle
+    :noindex:
+
+batch
+-----
+
+..  autofunction:: paddle.fluid.layers.batch
+    :noindex:
+
+double_buffer
+-------------
+
+..  autofunction:: paddle.fluid.layers.double_buffer
+    :noindex:
+
+random_data_generator
+---------------------
+
+..  autofunction:: paddle.fluid.layers.random_data_generator
+    :noindex:
+
+Preprocessor
+------------
+
+..  autoclass:: paddle.fluid.layers.Preprocessor
+    :members:
+    :noindex:
+
 nn
 ==
 
@@ -281,12 +343,6 @@ square_error_cost
 ..  autofunction:: paddle.fluid.layers.square_error_cost
     :noindex:
 
-accuracy
---------
-
-..  autofunction:: paddle.fluid.layers.accuracy
-    :noindex:
-
 chunk_eval
 ----------
 
@@ -311,6 +367,18 @@ sequence_pool
 ..  autofunction:: paddle.fluid.layers.sequence_pool
     :noindex:
 
+sequence_softmax
+----------------
+
+..  autofunction:: paddle.fluid.layers.sequence_softmax
+    :noindex:
+
+softmax
+-------
+
+..  autofunction:: paddle.fluid.layers.softmax
+    :noindex:
+
 pool2d
 ------
 
@@ -323,12 +391,6 @@ batch_norm
 ..  autofunction:: paddle.fluid.layers.batch_norm
     :noindex:
 
-layer_norm
-----------
-
-..  autofunction:: paddle.fluid.layers.layer_norm
-    :noindex:
-
 beam_search_decode
 ------------------
 
@@ -377,6 +439,12 @@ reduce_min
 ..  autofunction:: paddle.fluid.layers.reduce_min
     :noindex:
 
+reduce_prod
+-----------
+
+..  autofunction:: paddle.fluid.layers.reduce_prod
+    :noindex:
+
 sequence_first_step
 -------------------
 
@@ -425,6 +493,12 @@ matmul
 ..  autofunction:: paddle.fluid.layers.matmul
     :noindex:
 
+topk
+----
+
+..  autofunction:: paddle.fluid.layers.topk
+    :noindex:
+
 warpctc
 -------
 
@@ -473,6 +547,60 @@ multiplex
 ..  autofunction:: paddle.fluid.layers.multiplex
     :noindex:
 
+layer_norm
+----------
+
+..  autofunction:: paddle.fluid.layers.layer_norm
+    :noindex:
+
+softmax_with_cross_entropy
+--------------------------
+
+..  autofunction:: paddle.fluid.layers.softmax_with_cross_entropy
+    :noindex:
+
+smooth_l1
+---------
+
+..  autofunction:: paddle.fluid.layers.smooth_l1
+    :noindex:
+
+one_hot
+-------
+
+..  autofunction:: paddle.fluid.layers.one_hot
+    :noindex:
+
+autoincreased_step_counter
+--------------------------
+
+..  autofunction:: paddle.fluid.layers.autoincreased_step_counter
+    :noindex:
+
+reshape
+-------
+
+..  autofunction:: paddle.fluid.layers.reshape
+    :noindex:
+
+lod_reset
+---------
+
+..  autofunction:: paddle.fluid.layers.lod_reset
+    :noindex:
+
+lrn
+---
+
+..  autofunction:: paddle.fluid.layers.lrn
+    :noindex:
+
+pad
+---
+
+..  autofunction:: paddle.fluid.layers.pad
+    :noindex:
+
 label_smooth
 ------------
 
@@ -480,12 +608,35 @@ label_smooth
     :noindex:
 
 roi_pool
----------
+--------
 
 ..  autofunction:: paddle.fluid.layers.roi_pool
     :noindex:
 
-    
+dice_loss
+---------
+
+..  autofunction:: paddle.fluid.layers.dice_loss
+    :noindex:
+
+resize_bilinear
+---------------
+
+..  autofunction:: paddle.fluid.layers.resize_bilinear
+    :noindex:
+
+gather
+------
+
+..  autofunction:: paddle.fluid.layers.gather
+    :noindex:
+
+random_crop
+-----------
+
+..  autofunction:: paddle.fluid.layers.random_crop
+    :noindex:
+
 ops
 ===
 
@@ -501,18 +652,6 @@ mul
 ..  autofunction:: paddle.fluid.layers.mul
     :noindex:
 
-reshape
--------
-
-..  autofunction:: paddle.fluid.layers.reshape
-    :noindex:
-
-pad
----
-
-..  autofunction:: paddle.fluid.layers.pad
-    :noindex:
-
 scale
 -----
 
@@ -579,10 +718,76 @@ clip_by_norm
 ..  autofunction:: paddle.fluid.layers.clip_by_norm
     :noindex:
 
-sequence_softmax
-----------------
+logical_and
+-----------
 
-..  autofunction:: paddle.fluid.layers.sequence_softmax
+..  autofunction:: paddle.fluid.layers.logical_and
+    :noindex:
+
+logical_or
+----------
+
+..  autofunction:: paddle.fluid.layers.logical_or
+    :noindex:
+
+logical_xor
+-----------
+
+..  autofunction:: paddle.fluid.layers.logical_xor
+    :noindex:
+
+logical_not
+-----------
+
+..  autofunction:: paddle.fluid.layers.logical_not
+    :noindex:
+
+uniform_random
+--------------
+
+..  autofunction:: paddle.fluid.layers.uniform_random
+    :noindex:
+
+uniform_random_batch_size_like
+------------------------------
+
+..  autofunction:: paddle.fluid.layers.uniform_random_batch_size_like
+    :noindex:
+
+gaussian_random
+---------------
+
+..  autofunction:: paddle.fluid.layers.gaussian_random
+    :noindex:
+
+gaussian_random_batch_size_like
+-------------------------------
+
+..  autofunction:: paddle.fluid.layers.gaussian_random_batch_size_like
+    :noindex:
+
+cumsum
+------
+
+..  autofunction:: paddle.fluid.layers.cumsum
+    :noindex:
+
+scatter
+-------
+
+..  autofunction:: paddle.fluid.layers.scatter
+    :noindex:
+
+sum
+---
+
+..  autofunction:: paddle.fluid.layers.sum
+    :noindex:
+
+shape
+-----
+
+..  autofunction:: paddle.fluid.layers.shape
     :noindex:
 
 sigmoid
@@ -651,6 +856,18 @@ floor
 ..  autofunction:: paddle.fluid.layers.floor
     :noindex:
 
+cos
+---
+
+..  autofunction:: paddle.fluid.layers.cos
+    :noindex:
+
+sin
+---
+
+..  autofunction:: paddle.fluid.layers.sin
+    :noindex:
+
 round
 -----
 
@@ -822,10 +1039,3 @@ zeros
 ..  autofunction:: paddle.fluid.layers.zeros
     :noindex:
 
-topk
-----
-
-..  autofunction:: paddle.fluid.layers.topk
-    :noindex:
-
-
diff --git a/doc/fluid/api/metrics.rst b/doc/fluid/api/metrics.rst
new file mode 100644
index 0000000000000000000000000000000000000000..ddf07775d7ea293acd421b8549d03b277ff0611d
--- /dev/null
+++ b/doc/fluid/api/metrics.rst
@@ -0,0 +1,56 @@
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!
+
+=======
+metrics
+=======
+
+MetricBase
+----------
+
+..  autoclass:: paddle.fluid.metrics.MetricBase
+    :members:
+    :noindex:
+
+CompositeMetric
+---------------
+
+..  autoclass:: paddle.fluid.metrics.CompositeMetric
+    :members:
+    :noindex:
+
+Accuracy
+--------
+
+..  autoclass:: paddle.fluid.metrics.Accuracy
+    :members:
+    :noindex:
+
+ChunkEvaluator
+--------------
+
+..  autoclass:: paddle.fluid.metrics.ChunkEvaluator
+    :members:
+    :noindex:
+
+EditDistance
+------------
+
+..  autoclass:: paddle.fluid.metrics.EditDistance
+    :members:
+    :noindex:
+
+DetectionMAP
+------------
+
+..  autoclass:: paddle.fluid.metrics.DetectionMAP
+    :members:
+    :noindex:
+
+Auc
+---
+
+..  autoclass:: paddle.fluid.metrics.Auc
+    :members:
+    :noindex:
+
diff --git a/doc/fluid/api/optimizer.rst b/doc/fluid/api/optimizer.rst
index 7a92caf9b7139cf091eff834dbed3586b23ac3af..6ad44bb6905b6e3f2b6e4aeb3701ced5d18e2005 100644
--- a/doc/fluid/api/optimizer.rst
+++ b/doc/fluid/api/optimizer.rst
@@ -89,9 +89,31 @@ DecayedAdagradOptimizer
     :members:
     :noindex:
 
+RMSPropOptimizer
+----------------
+
+..  autoclass:: paddle.fluid.optimizer.RMSPropOptimizer
+    :members:
+    :noindex:
+
 Adadelta
---------------
+--------
 
-..  autoclass:: paddle.fluid.optimizer.AdadeltaOptimizer
+..  autoclass:: paddle.fluid.optimizer.Adadelta
     :members:
     :noindex:
+
+ModelAverage
+------------
+
+..  autoclass:: paddle.fluid.optimizer.ModelAverage
+    :members:
+    :noindex:
+
+Optimizer
+---------
+
+..  autoclass:: paddle.fluid.optimizer.Optimizer
+    :members:
+    :noindex:
+
diff --git a/doc/fluid/api/profiler.rst b/doc/fluid/api/profiler.rst
index 74d102dcb0db35766c34e3d14939a8aa5861686b..39fda65863471a78895503184848a754828b71a1 100644
--- a/doc/fluid/api/profiler.rst
+++ b/doc/fluid/api/profiler.rst
@@ -23,3 +23,15 @@ profiler
 ..  autofunction:: paddle.fluid.profiler.profiler
     :noindex:
 
+start_profiler
+--------------
+
+..  autofunction:: paddle.fluid.profiler.start_profiler
+    :noindex:
+
+stop_profiler
+-------------
+
+..  autofunction:: paddle.fluid.profiler.stop_profiler
+    :noindex:
+
diff --git a/doc/fluid/api/regularizer.rst b/doc/fluid/api/regularizer.rst
index 837c67111c6e98e6a3859be802addc20a1c64f2b..756bc53baa0625aef48dad0c35e7ae57421a70d0 100644
--- a/doc/fluid/api/regularizer.rst
+++ b/doc/fluid/api/regularizer.rst
@@ -11,6 +11,13 @@ append_regularization_ops
 ..  autofunction:: paddle.fluid.regularizer.append_regularization_ops
     :noindex:
 
+WeightDecayRegularizer
+----------------------
+
+..  autoclass:: paddle.fluid.regularizer.WeightDecayRegularizer
+    :members:
+    :noindex:
+
 L1Decay
 -------
 
@@ -26,15 +33,16 @@ L2Decay
     :noindex:
 
 L1DecayRegularizer
----------------------
+------------------
 
 ..  autoclass:: paddle.fluid.regularizer.L1DecayRegularizer
     :members:
     :noindex:
 
 L2DecayRegularizer
----------------------
+------------------
 
 ..  autoclass:: paddle.fluid.regularizer.L2DecayRegularizer
     :members:
     :noindex:
+
diff --git a/doc/fluid/build_and_install/paddleci.png b/doc/fluid/build_and_install/paddleci.png
new file mode 120000
index 0000000000000000000000000000000000000000..c3eb1457acc77cab9360e654240d1e8f548035b4
--- /dev/null
+++ b/doc/fluid/build_and_install/paddleci.png
@@ -0,0 +1 @@
+../../v2/build_and_install/paddleci.png
\ No newline at end of file
diff --git a/doc/fluid/design/concepts/functions_operators_layers.md b/doc/fluid/design/concepts/functions_operators_layers.md
index 30bc488a18a28d349645d9d2502aae6691a69931..1f86b99e5197c3e0b85fd76fe704520ef21b06d3 100644
--- a/doc/fluid/design/concepts/functions_operators_layers.md
+++ b/doc/fluid/design/concepts/functions_operators_layers.md
@@ -40,7 +40,7 @@ template <typename T>
 class FCOp : public OperatorBase {
  public:
   void Run(...) {
-    add(mul(Input<T>("X"), Input<T>("W")), Input<T>("b");
+    add(mul(Input<T>("X"), Input<T>("W")), Input<T>("b"));
   }
 };
 REGISTER_OP(FCOp, "fc");
diff --git a/doc/fluid/design/concepts/lod_tensor.md b/doc/fluid/design/concepts/lod_tensor.md
index a88292e7888d0ebc64ee89ca315dfea38a12c71d..d606d7a790b4b0dc18553f2220d39cec8aa619ec 100644
--- a/doc/fluid/design/concepts/lod_tensor.md
+++ b/doc/fluid/design/concepts/lod_tensor.md
@@ -155,7 +155,7 @@ into offsets
    3  2+3 4+5 1+9 2+10 3+12
 ```
 
-so we know that the first sentence is from word 0 to word 3, and the second sentence from work 3 to word 5.
+so we know that the first sentence is from word 0 to word 3, and the second sentence from word 3 to word 5.
 
 Similarly, the lengths in the top level LoD
 
diff --git a/doc/fluid/design/concepts/var_desc.md b/doc/fluid/design/concepts/var_desc.md
index 6750323c0167bf1efbde6ef4fd670e88a5aa502a..8db67f6703d142da71cf06bd4f7e2cb13556f9b0 100644
--- a/doc/fluid/design/concepts/var_desc.md
+++ b/doc/fluid/design/concepts/var_desc.md
@@ -35,7 +35,7 @@ The computation `Program` consists of nested `Blocks`. Each `Block` will consist
 
 ## Definition of VarType
 
-A VarDesc should have a name, type and whether or not it is persistable. The are different kinds of variable types supported in PaddlePaddle, apart from the POD_Types like: `LOD_TENSOR`, `SELECTED_ROWS`, `FEED_MINIBATCH`, `FETCH_LIST`, `STEP_SCOPES`, `LOD_RANK_TABLE`, `LOD_TENSOR_ARRAY`, `PLACE_LIST`, `READER` and `CHANNEL`. These are declared inside `VarType`. A `VarDesc` then looks as the following:
+A VarDesc should have a name, type and whether or not it is persistable. There are different kinds of variable types supported in PaddlePaddle, apart from the POD_Types like: `LOD_TENSOR`, `SELECTED_ROWS`, `FEED_MINIBATCH`, `FETCH_LIST`, `STEP_SCOPES`, `LOD_RANK_TABLE`, `LOD_TENSOR_ARRAY`, `PLACE_LIST`, `READER` and `CHANNEL`. These are declared inside `VarType`. A `VarDesc` then looks as the following:
 
 ```proto
 message VarDesc {
diff --git a/doc/fluid/design/dist_train/async_update.md b/doc/fluid/design/dist_train/async_update.md
index 6a0835b761b69030ba30697e6e8863928efbf57f..248d2ec18dafdecac9184527638754b6ba4d85b8 100644
--- a/doc/fluid/design/dist_train/async_update.md
+++ b/doc/fluid/design/dist_train/async_update.md
@@ -4,34 +4,37 @@
 
 For the typical synchronous distributed training, some significant steps are as follows:
 
-1. A Trainer will compute the gradients and SEND them to the Parameter Server(PServer) nodes.
-1. After the PServer node received gradients came from all the Trainers, It will aggregate the
+1. A trainer process will compute the gradients and **send** them to the parameter server (PS) nodes.
+1. After the PS node received gradients came from all the Trainers, It will aggregate the
 gradient variables for the same parameter into one gradient variable and then apply the aggregated
 gradient to the respective parameter, finally using an optimize algorithms(SGD, Monument...)
 to update the parameters.
-1. The Trainer would wait for the PServers finished the optimize stage, and GET the parameters from PServer,
+1. The Trainer would wait for the PS finished the optimize stage, and GET the parameters from PS,
 so all the Trainers would get the same parameters.
 
-In the synchronously distributed training, there should be a `Barrier` to synchronise the
-parameters after the optimizing stage. The performance of a distributed training job would
-depend on the slowest node if there were hundreds or thousands of training nodes in a
-Job, the performance of synchronously distributed training might be very poor because of
-the slow node. So this design doc would introduce an approach to implement
-*asynchronously* distributed training in PaddlePaddle Fluid.
+In Synchronous Distributed Training, there is a **barrier** on each PS to wait until all trainers processes
+have completed running current mini-batch. After that, all trainers can continue to run the next
+mini-batch. So, we can find that the overall performance of Synchronous Distributed Training depends 
+on the slowest node.
+
+In Asynchronous Distributed Training, we don't need to wait for a global mini-bach, the optimizer on
+the PS will run immediately when the gradient is uploaded to the PS from one trainer. This mode would
+train such models that achieve scaling, better throughput. In this design doc, we will introduce how to 
+implement the Asynchronous Distributed Training base on PaddlePaddle Fluid.
 
 ## Design
 
 <img src="./src/async_update.png" width="600"/>
 
-As the figure above, we describe a global view of asynchronously update process and use
+As the figure above, we describe a global view of the asynchronous update process and use
 the parameter `w1` as an example to introduce the steps:
 1. For each gradient variables, they may distribute on different GPU card and aggregate
 them while they are all calculated.
-1. Split the gradient variable into multiple blocks according to the number of PServer
+1. Split the gradient variable into multiple blocks according to the number of PS
 instances and then send them.
-1. PServer would run an `Optimize Block` using a specified optimize algorithm to update
+1. PS would run an `Optimize Block` using a specified optimize algorithm to update
 the specified parameter.
-1. The trainer will fetch latest parameter from PServer before running forward Op which depends
+1. The trainer will fetch the latest parameter from PS before running forward Op which depends
 on the specified parameter.
 1. Broadcast the received variable into multiple GPU cards and continue to run the next
 mini-batch.
@@ -40,8 +43,8 @@ mini-batch.
 
 - For the multiple devices distributed training, we need to aggregate the gradient
 variables which placed on different devices firstly and then schedule a `SendVars` Operator to
-send the gradient variables to the multiple PServer instances.
-- Schedule `FetchVars` operator to fetch the latest parameter from PServer before running
+send the gradient variables to the multiple PS instances.
+- Schedule `FetchVars` operator to fetch the latest parameter from PS before running
 the forward ops.
 - There could be a large number of gradient variables to be sent, so we need to use another
 thread pool(IO Threadpool) whose a number of the schedulable threads is larger than the
diff --git a/doc/fluid/design/motivation/api.md b/doc/fluid/design/motivation/api.md
index e6a4638d9100d9b07c3ee6b92b530a17eae1c162..bc222564e3ec28e306ca0572b6a23104f6e9cbc5 100644
--- a/doc/fluid/design/motivation/api.md
+++ b/doc/fluid/design/motivation/api.md
@@ -77,8 +77,7 @@ print "The sematic-vector of testA: ", paddle.infer(fA, parameters, testA)
 
 ### Example 2. Sharing Parameters between "Models"
 
-We use [GAN](https://github.com/PaddlePaddle/book/tree/develop/gan) in
-this example.  In the following example program, `d0` and `d1`
+We use GAN in this example.  In the following example program, `d0` and `d1`
 correspond to the two networks in the following figure:
 
 <img src="https://github.com/wangyang59/book/raw/00036f4b0da5225041a6824587c1a01cf20159b1/gan/image/gan_ig.png" width=400 />
diff --git a/doc/fluid/design/motivation/refactorization.md b/doc/fluid/design/motivation/refactorization.md
index 4e1d660cef6369f04db8e1e83360f6af25259f96..ad9d0f6d3f3ad9884f108826e8410871fffd51bf 100644
--- a/doc/fluid/design/motivation/refactorization.md
+++ b/doc/fluid/design/motivation/refactorization.md
@@ -125,12 +125,12 @@ Compile Time -> IR -> Runtime
 
 ## Operator/OpWithKernel/OpKernel
 
-![class_diagram](http://api.paddlepaddle.org/graphviz?dot=https://gist.githubusercontent.com/reyoung/53df507f6749762675dff3e7ce53372f/raw/49caf1fb70820fb4a6c217634317c9306f361f36/op_op_with_kern_class_diagram.dot)
+![class_diagram](https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/op_op_with_kern_class_diagram.dot)
 
 ---
 
 ## Operator
-![class_diagram](http://api.paddlepaddle.org/graphviz?dot=https://gist.githubusercontent.com/reyoung/53df507f6749762675dff3e7ce53372f/raw/dd598e8f1976f5759f58af5e5ef94738a6b2e661/op.dot)
+![class_diagram](https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/op.dot)
 
 * `Operator` is the fundamental building block of the user interface.
     * Operator stores input/output variable names and attributes.
@@ -141,7 +141,7 @@ Compile Time -> IR -> Runtime
 
 ## OpWithKernel/Kernel
 
-![class_diagram](http://api.paddlepaddle.org/graphviz?dot=https://gist.githubusercontent.com/reyoung/53df507f6749762675dff3e7ce53372f/raw/9d7f4eba185cf41c8e2fbfb40ae21890dbddcd39/op_with_kernel.dot)
+![class_diagram](https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/op_with_kernel.dot)
 
 * `OpWithKernel` inherits `Operator`.
 * `OpWithKernel` contains a Kernel map.
diff --git a/doc/fluid/design/multi_devices/operator_kernel_type.md b/doc/fluid/design/multi_devices/operator_kernel_type.md
index 8c1bc8f76a337006497e5ab5e5a710f9f49261b8..5e391bd62b4f4e123a9a6f35b7adf5726f205635 100644
--- a/doc/fluid/design/multi_devices/operator_kernel_type.md
+++ b/doc/fluid/design/multi_devices/operator_kernel_type.md
@@ -75,7 +75,7 @@ Different layout leads to different implementation of the operator kernel. There
 
 - The inference of Layout is at run-time, not at compile-time.
 
-- Every operator has to implement different kernels for different layouts. Let's take MKLDNN as an example. If we want to implement an MKLDNN convolution operator, we have to implement all the kernels for different layouts, which are listed [here](http://01org.github.io/mkl-dnn/structmkldnn_1_1memory.html). And we will have a special macro to  register kernels for MKLDNN operators.
+- Every operator has to implement different kernels for different layouts. Let's take MKLDNN as an example. If we want to implement an MKLDNN convolution operator, we have to implement all the kernels for different layouts, which are listed [here](http://intel.github.io/mkl-dnn/structmkldnn_1_1memory.html). And we will have a special macro to  register kernels for MKLDNN operators.
 
 `Layout` is also defined as a enum variable:
 
diff --git a/doc/fluid/getstarted/Developer's_Guide_to_Paddle_Fluid.md b/doc/fluid/getstarted/Developer's_Guide_to_Paddle_Fluid.md
new file mode 100644
index 0000000000000000000000000000000000000000..79df6c59578e2acf495a3453ab61f069c3f09a49
--- /dev/null
+++ b/doc/fluid/getstarted/Developer's_Guide_to_Paddle_Fluid.md
@@ -0,0 +1,1819 @@
+
+# Paddle Fluid 开发者指南
+
+---
+
+### ==1==. 为什么需要 PaddlePaddle Fluid？
+
+---
+
+### 两个基础问题
+
+<font size=6>
+
+1. 如何描述机器学习模型和优化过程？
+    - 完备自洽，表达能力足以支持潜在出现的各种计算需求
+1. 如何充分利用资源高效计算？
+    - 支持异步设备、多卡、分布式计算
+    - 降低计算/计算优化的开发成本
+    - ……
+
+</font>
+
+---
+
+### 如何描述模型和优化过程？
+
+<font size=6>
+
+<table>
+<thead>
+<tr>
+<th> </th>
+<th>一组连续执行的layers</th>
+<th>variable和operator构成的计算图 </th>
+<th>不再有模型的概念 </th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td> 2013</td>
+<td> Caffe，Theano, Torch, PaddlePaddle </td>
+<td> </td>
+<td> </td>
+</tr>
+
+<tr>
+<td> 2015 </td>
+<td> </td>
+<td> TensorFlow, MxNet, Caffe2, ONNX, n-graph </td>
+<td> </td>
+</tr>
+<tr>
+<td>2016 </td>
+<td> </td>
+<td> </td>
+<td> PyTorch, TensorFlow Eager Execution, <font color=#483D8B>**==PaddlePaddle Fluid==** </td>
+</tr>
+
+</tbody>
+</table>
+
+---
+
+
+### <p align="center">目标 </p>
+
+<font size=6>
+
+- 提高对各类机器学习任务的描述能力：能够描述潜在出现的任意机器学习模型。
+- 代码结构逻辑清晰，各模块充分解耦：内外部贡献者能够专注于自己所需的功能模块，基于框架进行再次开发。
+- 从设计上，留下技术优化的空间和潜力。
+- 代码解耦后降低多设备支持、计算优化等的开发成本。
+- 在统一的设计理念下，实现自动可伸缩，自动容错的分布式计算。
+
+</font>
+
+---
+
+## ==2.== Design Overview
+
+---
+
+# Fluid: 系统形态
+
+- <span style="background-color:#ACD6FF;">[编译器式的执行流程，区分编译时和运行时](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/motivation/fluid_compiler.md)</span>
+<br>
+
+<p align="center">
+  <img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/fluid-compiler.png" width=100%>
+</p>
+
+---
+
+#### 让我们在Fluid程序实例中，区分编译时和运行时
+
+---
+### Fluid 编译时
+
+<font size=5>
+
+- ==**定义前向计算**==
+
+  ```python
+  x = fluid.layers.data(name='x',shape=[13], dtype='float32')
+  y_predict = fluid.layers.fc(input=x, size=1, act=None)
+  y = fluid.layers.data(name='y', shape=[1], dtype='float32')
+  cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+  avg_cost = fluid.layers.mean(x=cost)
+  ```
+
+- ==**添加反向、正则、优化**==
+  ```python
+  learning_rate = 0.01
+  sgd_optimizer = fluid.optimizer.SGD(learning_rate)
+  sgd_optimizer.minimize(avg_cost)
+  ```
+</font>
+
+---
+
+### `Program` vs. 计算图
+
+<font size=5>
+
+- 在科学计算领域，计算图是一种描述计算的经典方式。下图展示了从前向计算图（蓝色）开始，通过添加反向（红色）和优化算法相关（绿色）操作，构建出整个计算图的过程：
+-
+<p align="center">
+  <img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/graph_construction_example_all.png" width=60%>
+</p>
+
+
+- Fluid ==使用`Program`而不是计算图==来描述模型和优化过程。`Program`由`Block`、`Operator`和`Variable`构成，相关概念会在后文详细展开。
+- 编译时 Fluid 接受前向计算（这里可以先简单的理解为是一段有序的计算流）`Program`，为这段前向计算按照：前向 -> 反向 -> 梯度 clip -> 正则 -> 优化 的顺序，添加相关 `Operator`和`Variable`到`Program`到完整的计算。
+
+</font>
+
+---
+
+### Fluid 运行时
+
+<font size=5>
+
+- ==**读入数据**==
+
+  ```python
+  train_reader = paddle.batch(
+      paddle.reader.shuffle(paddle.dataset.uci_housing.train(), buf_size=500),
+      batch_size=20)
+  feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
+  ```
+- ==**定义执行程序的设备**==
+  ```python
+  place = fluid.CPUPlace()
+  feeder = fluid.DataFeeder(place=place,feed_list=[x, y])
+  ```
+
+- ==创建执行器（Executor），执行初始化 `Program`和训练`Program`==
+
+  ```python
+  exe = fluid.Executor(place)
+  exe.run(fluid.default_startup_program())
+  PASS_NUM = 100
+  for pass_id in range(PASS_NUM):
+      for data in train_reader():
+          avg_loss_value, = exe.run(fluid.default_main_program(),
+                                    feed=feeder.feed(data),
+                                    fetch_list=[avg_cost])
+          print(avg_loss_value)
+  ```
+</font>
+
+---
+
+### 总结：框架做什么？用户做什么？
+<br>
+
+<font size=5>
+<table>
+<thead>
+<tr>
+<th>构建训练</th>
+<th>执行训练</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>
+<span style="background-color:#B3D9D9">用户</span>：描述前向运算<br><span style="background-color:#DAB1D5;">框架</span>：添加反向运算<br><span style="background-color:#DAB1D5;">框架</span>：添加优化运算<br><span style="background-color:#DAB1D5;">框架</span>：添加内存优化<br><span style="background-color:#DAB1D5;">框架</span>：添加并行/多设备/分布式相关的计算单元
+</td>
+
+<td>
+<span style="background-color:#DAB1D5;">框架</span>：创建Operator（计算）+ Variable（数据）<br><span style="background-color:#DAB1D5;">框架</span>：创建`Block`<br><span style="background-color:#DAB1D5;">框架</span>：内存管理/设备管理<br><span style="background-color:#DAB1D5;">框架</span>：执行计算
+</td>
+</tr>
+</tbody>
+</table>
+</font>
+
+---
+
+### <p align="center">总结：编译时</p>
+<font size=5>
+
+<span style="background-color:#A3D1D1;">**用户编写一段Python程序，描述模型的前向计算**</span>
+1. 创建变量描述 `VarDesc`
+1. 创建operators的描述 `OpDesc`
+1. 创建operators的属性
+1. 推断变量的类型和形状，进行静态检查：`inferShape`
+1. 规划变量的内存复用
+1. 创建反向计算
+1. 添加优化相关的Operators
+1. （可选）添加多卡/多机相关的Operator，生成在多卡/多机上运行的程序
+
+</font>
+
+---
+
+### <p align="center">总结：运行时</p>
+<font size=5>
+
+<span style="background-color:#C7C7E2;">**执行规划好的计算**</span>
+1. 创建`Executor`
+1. 为将要执行的一段计算，在层级式的`Scope`空间中创建`Scope`
+1. 创建`Block`，依次执行`Block`
+
+<p align="center">
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/compile_run_time.png" width=50%><br>
+<font size=3> Figure. 编译时运行时概览</font>
+</p>
+
+</font>
+
+---
+<!-- *template: invert -->
+## ==3==. 用户如何描述计算？
+---
+
+### Fluid：==像写程序一样==定义计算
+<font size=5>
+
+- 顺序执行
+    ```python
+    x = fluid.layers.data(name='x',shape=[13], dtype='float32')
+    y_predict = fluid.layers.fc(input=x, size=1, act=None)
+    y = fluid.layers.data(name='y', shape=[1], dtype='float32')
+    cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+    ```
+
+- 条件分支: [swith](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/execution/switch.md)、[ifelse](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/execution/if_else_op.md)
+
+   ```python
+   a = fluid.Var(10)
+   b = fluid.Var(0)
+
+   switch = fluid.switch()
+   with switch.block():
+      with switch.case(fluid.less_equal(a, 10)):
+          fluid.print("Case 1")
+      with switch.case(fluid.larger(a, 0)):
+          fluid.print("Case 2")
+      with switch.default():
+          fluid.print("Case 3")
+   ```
+
+>[A Lisp cond form may be compared to a continued if-then-else as found in many algebraic programming languages](https://www.cs.cmu.edu/Groups/AI/html/cltl/clm/node84.html).
+
+</font>
+
+---
+
+### Fluid: ==像写程序一样==定义计算
+
+<font size=5>
+
+- 循环：[while](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_machine_translation.py#L105)
+
+  ```python
+  d0 = layers.data("d0", shape=[10], dtype='float32')
+  data_array = layers.array_write(x=d0, i=i)
+  array_len = layers.fill_constant(shape=[1],dtype='int64', value=3)
+
+  cond = layers.less_than(x=i, y=array_len)
+  while_op = layers.While(cond=cond)
+  with while_op.block():
+      d = layers.array_read(array=data_array, i=i)
+      i = layers.increment(x=i, in_place=True)
+      layers.array_write(result, i=i, array=d)
+      layers.less_than(x=i, y=array_len, cond=cond)
+  ```
+
+- 完整实例请点查看 [->](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/unittests/test_while_op.py#L36-L44)
+- beam search  [->]( https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_machine_translation.py#L105)
+
+</font>
+
+---
+
+#### <p align="center">总结</p>
+
+<font size=5>
+
+1. 用户层提供的描述语法具有完备性、自洽性，有能力支持对复杂计算过程描述
+1. 使用方式和核心概念可以类比编程语言，认知能够直接迁移
+1. 能够支持：定义问题，逐步求解
+
+</font>
+
+---
+
+## ==3.== 核心概念
+
+---
+### 编译时概念 ：==变量和计算的描述==
+
+<font size=5>
+
+- `VarDesc` + `TensorDesc` + `OpDesc` -> `BlockDesc` -> `ProgramDesc`
+    - https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/framework.proto
+
+- <span style="background-color:#DAB1D5;">什么是 Fluid Program</span>
+
+  - 在Fluid中，一个神经网络任务（训练/预测）被描述为一段`Program`
+  - `Program`包含对`Variable`（数据）和 `Operator`（对数据的操作）的描述
+  - `Variable` 和 `Operator` 被组织为多个可以嵌套的`Block`，构成一段完整的`Fluid Program`
+
+
+>编译阶段最终，经过 Transpiler 的执行规划，变换处理，生成使用`protobuf`序列化后的`ProgramDesc`。可以发送给多卡或者网络中的其它计算节点执行
+
+</font>
+
+---
+
+### 编译时概念 ：==**[Transpiler](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/motivation/fluid_compiler.md)**==
+<font size=5>
+
+1. 接受一段`ProgramDesc`作为输入，生成一段新的`ProgramDesc`
+
+    - *Memory optimization transpiler*：向原始`ProgramDesc` 中插入 `FreeMemoryOps`，在一次迭代优化结束前提前释放内存，使得能够维持较小的 memory footprint
+
+    - *Distributed training transpiler*：将原始的`ProgramDesc`中转化为对应的分布式版本，生成两段新的`ProgramDesc`:
+        1. trainer进程执行的`ProgramDesc`
+        1. parameter server执行的`ProgramDesc`
+
+1. ==**WIP**==: 接受一段`ProgramDesc`，生成可直接被`gcc`, `nvcc`, `icc`等编译的代码，编译后得到可执行文件
+
+</font>
+
+---
+### Transplier
+
+<p align="center">
+  <img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/transpiler.png" width=70%>
+</p>
+
+---
+
+### 打印 `ProgramDesc`
+
+<p align="center">
+  <img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/print_fluid_program.png" width=70%>
+</p>
+
+<font size=5>
+
+- `default_startup_program`：创建可学习参数，对参数进行初始化
+- `default_main_program`：由用户定义的模型，包括了前向、反向、优化及所有必要的计算
+
+- 打印可读的 `Program`
+  ```python
+  from paddle.v2.fluid import debuger
+  print debuger.pprint_program_codes(framework.default_main_program().desc)
+  ```
+</font>
+
+---
+### 输出效果
+
+<font size=5>
+
+<table>
+<thead>
+<th>variable in block 0</th>
+<th>variable in block 0</th>
+</thead>
+<tbody>
+<tr>
+<td><img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/program_desc1.png" width=70%></td>
+<td><img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/program_desc2.png" width=70%></td>
+</tr>
+</tbody>
+</table>
+</font>
+
+---
+
+### 运行时概念
+
+<font size=5>
+
+- 数据相关
+  - `Tensor` / `LoDTensor` / `Variable`
+  - `Scope`
+
+- 计算相关
+  - `Block`
+  - `Kernel`、`OpWithKernel`、`OpWithoutKernel`
+
+<table>
+<thead>
+<th></th>
+<th>protobuf messages</th>
+<th>C++ class objects</th>
+</thead>
+<tbody>
+<tr>
+<td>Data</td>
+<td>[VarDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/framework.proto#L107)
+</td>
+<td>[Variable](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/variable.h#L24)
+</td>
+</tr>
+
+<tr>
+<td>Operation</td>
+<td>[OpDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/framework.proto#L35)
+</td>
+<td>[Operator](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/operator.h#L64)
+</td>
+</tr>
+<tr>
+<td>Block</td>
+<td>BlockDesc
+</td>
+<td>Block
+</td>
+</tr>
+
+
+</tbody>
+</table>
+
+- 执行相关 ：`Executor`
+
+</font>
+
+---
+#### Tensor 和 LoD(Level-of-Detail) Tensor
+<font size=5>
+
+- Tensor 是$n$-dimensional arry的推广，LoDTensor是在Tensor基础上附加了序列信息
+- Fluid中输入、输出，网络中的可学习参数全部统一使用LoDTensor（n-dimension array）表示
+- 一个mini-batch输入数据是一个LoDTensor
+  - 在Fluid中，RNN 处理变长序列无需padding，得益于 `LoDTensor`表示
+  - 可以简单将 LoD 理解为：`std::vector<std::vector<int>>`
+  - 对非序列数据，LoD 信息为空
+
+<table>
+<thead>
+<th></th>
+<th>TensorFlow</th>
+<th>PaddlePaddle</th>
+</thead>
+<tbody>
+<tr>
+<td>RNN</td>
+<td>Support
+</td>
+<td>Support
+</td>
+</tr>
+
+<tr>
+<td>recursive RNN</td>
+<td>Support
+</td>
+<td>Support
+</td>
+</tr>
+<tr>
+<td>padding zeros</td>
+<td>Must
+</td>
+<td>No need
+</td>
+<tr>
+<td>blob data type</td>
+<td>Tensor
+</td>
+<td>LODTensor
+</td>
+
+</tr>
+</tbody>
+</table>
+
+</font>
+
+---
+#### LoD 信息实例
+
+<font size=4>
+
+<p align="center">
+  <img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/LoDTensor.png" width=43%>
+</p>
+
+- 图(a)的LoD 信息
+  ```cpp
+  [0, 5, 8, 10, 14]
+  ```
+- 图(b)的 LoD 信息
+  ```cpp
+  [[0, 5, 8, 10, 14] /*level=1*/, [0, 2, 3, 5, 7, 8, 10, 13, 14] /*level=2*/]
+  ```
+</font>
+
+---
+#### Tensor, Variable, Scope 之间的关系
+
+<p align="center">
+  <img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/scope_variable_tensor.png" width=40%>
+</p>
+<font size=5>
+
+1. `Block` 是一个实现层的概念，不在应用层暴露给用户。目前用户无法自行创建并利用`Block`，用户能够感知的只有`Program`这个概念。
+1. 逻辑上，可以将 `Block` 类比为编程语言中的大括号：定义了一段作用域，其中运行一段代码
+1. `Executor`会为每一个`Block`创建一个`Scope`，`Block`是可嵌套的，因此`Scope`也是可嵌套的
+
+</font>
+
+---
+### Executor
+
+<font size=5>
+
+<table>
+<thead>
+<th>接口</th>
+<th>说明</th>
+</thead>
+<tbody>
+<tr>
+<td><p align="center">
+  <img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/executor.png" width=60%>
+</p></td>
+<td><span style="background-color:#B3D9D9;">输入</span><br>1. `ProgramDesc`<br>2. `Scope`<br> 3.`block_id`<br><br><span style="background-color:#B3D9D9;">解释执行步骤</span><br>1. 创建所有 Variables<br> 2. 逐一创建 Operator 并运行
+</td>
+</tr>
+</tbody>
+</table>
+
+---
+### Operator/OpWithKernel/Kernel
+<font size=5>
+
+<p align="center">
+  <img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/operator1.png" width=50%>
+</p>
+
+- operator 无状态，Operator的核心是==Run==方法
+- 一个operator可以注册多个kernel
+- operator 可以无 kernel：while_op 、ifelse op
+
+</font>
+
+---
+#### Fluid Operator vs. PaddlePaddle layers
+<font size=5>
+
+<table>
+<thead>
+<th>Layer</th>
+<th>Operator</th>
+</thead>
+<tbody>
+<tr>
+<td><p align="center">
+  <img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/layer.png" width=70%>
+</p></td>
+<td><p align="center">
+  <img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/operator2.png" width=73%>
+</p></td>
+</tr>
+
+<tr>
+<td>1. 内部维护状态<br>2. 包含forward和backward方法</td>
+<td>1. 内部无状态<br>2. 只有Run方法</td>
+</tr>
+</tbody>
+</table>
+
+</font>
+
+---
+
+### ==4.== 内存管理
+
+---
+### 目标
+
+- 为异构设备提供统一的内存分配、回收接口
+- 最小化管理内存所需的时间，最小化管理开销
+- 减少内存碎片
+- 将内存管理与计算（Operators/Kernels）完全剥离
+- 统一内存管理是内存优化的基础
+
+---
+
+<font size=5>
+
+### Memory 接口
+
+- 内存管理模块向上层应用逻辑提供三个基础接口：
+  ```cpp
+  template <typename Place>
+  void* Alloc(Place place, size_t size);
+
+  template <typename Place>
+  void Free(Place place, void* ptr);
+
+  template <typename Place>
+  size_t Used(Place place);
+
+  struct Usage : public boost::static_visitor<size_t> {
+    size_t operator()(const platform::CPUPlace& cpu) const;
+    size_t operator()(const platform::CUDAPlace& gpu) const;
+  };
+  ```
+- 模板参数 `Place` 指示内存分配发生的设备
+- 实现时，需特化支持的 `Place`， 提供以上三个接口的实现
+
+</font>
+
+---
+### 代码结构
+
+<font size=5>
+
+内存管理模块可以理解为由以下两部分构成：
+
+1. SystemAllocator：实际从物理设备上分配、释放的内存的接口
+1. BuddyAllocator：内存管理算法
+
+</font>
+
+---
+### System Allocator
+
+<font size=5>
+
+- SystemAllocator 是实现物理内存分配、回收的基类
+    - 不同设备上的内存分配和回收终将转化为标准接口调用
+    - 为不同设备实现MemoryAllocator，继承自SystemAllocator
+
+  ```cpp
+  class SystemAllocator {
+   public:
+    virtual ~SystemAllocator() {}
+    virtual void* Alloc(size_t& index, size_t size) = 0;
+    virtual void Free(void* p, size_t size, size_t index) = 0;
+    virtual bool UseGpu() const = 0;
+  };
+  ```
+</font>
+
+---
+
+### CPU/GPU Allocator
+
+<font size=5>
+
+```cpp
+class CPUAllocator : public SystemAllocator {
+ public:
+  virtual void* Alloc(size_t& index, size_t size);
+  virtual void Free(void* p, size_t size, size_t index);
+  virtual bool UseGpu() const;
+};
+
+#ifdef PADDLE_WITH_CUDA
+class GPUAllocator : public SystemAllocator {
+ public:
+  virtual void* Alloc(size_t& index, size_t size);
+  virtual void Free(void* p, size_t size, size_t index);
+  virtual bool UseGpu() const;
+ private:
+  size_t gpu_alloc_size_ = 0;
+  size_t fallback_alloc_size_ = 0;
+};
+#endif
+```
+- CPUAllocator和GPUAllocator分别继承自SystemAllocator，分别调用相应的标准库函数实现物理内存的分配和释放。
+- 一旦大块、连续的物理内存分配之后，将通过内存管理算法实现内存的按块分配、回收、重用等。
+
+</font>
+
+---
+### CPU Allocator
+
+<font size=5>
+
+- CPU 内存的分配提供两种选项：
+    1. non-pinned memory：可分页内存
+    2. pinned memory：页锁定内存
+        - 分配过大的页锁定内存有可能因为系统可使用的分页内存减少，影响系统性能，默认CPU下分配的是可分页内存
+
+- 通过gflags进行设置一次性分配内存的大小以及是否使用页锁定内存。
+
+   ```cpp
+   DEFINE_bool(use_pinned_memory, true, "If set, allocate cpu pinned memory.");
+   DEFINE_double(fraction_of_cpu_memory_to_use, 1,
+                 "Default use 100% of CPU memory for PaddlePaddle,"
+                 "reserve the rest for page tables, etc");
+   ```
+
+</font>
+
+---
+### GPU Allocator
+
+<font size=5>
+
+- 通过 cudaMalloc 分配GPU显存
+- GPUAllocator::Alloc 首先会计算指定GPU device上的可用显存
+    - 如果可用显存小于请求分配大小，调用cudaMalloc进行分配
+    - 如果可用显存不足，目前会报错退出。
+- 通过gflags控制GPU下一次性分配显存的大小：
+
+  ```cpp
+  DEFINE_double(fraction_of_gpu_memory_to_use, 0.92,
+                "Default use 92% of GPU memory for PaddlePaddle,"
+                "reserve the rest for page tables, etc");
+  ```
+
+</font>
+
+---
+#### 内存管理算法:  [Buddy Memory Allocation](https://en.wikipedia.org/wiki/Buddy_memory_allocation)
+
+<font size=5>
+
+- Memory Arena：一次性分配大块连续内存，之后会基于这块内存进行内存管理：动态分配、释放、重用内存块。
+- 伙伴内存分配：
+    - 将内存划分为 2 的幂次方个分区，使用 best-fit 方法来分配内存请求。
+    - 当释放内存时，检查 buddy 块，查看相邻的内存块是否也已被释放。如果是，将内存块合并，以最小化内存碎片。
+    - 分配的内存在物理内存的自然边界对齐，提高内存访问效率。
+    - 算法的时间效率高，单使用 best-fit 方法的缘故，会产生一定的内存浪费
+
+</font>
+
+---
+
+### Buddy Allocator
+
+<font size=5>
+
+- BuddyAllocator 是一个单例，每个设备（如： GPU/CPU(0)/GPU(1)） 拥有一个BuddyAllocator
+- BuddyAllocator 内部拥有一个私有成员变量 SystemAllocator
+- 当请求的内存超过BuddyAllocator管理的空余内存时，将会调用SystemAllocator去指定的设备上分配物理内存
+
+</font>
+
+---
+### 实例：CPU 下内存管理接口的实现
+
+<font size=5>
+
+- 对上层应用，统一通过BuddyAllocator来实现内存的分配、释放以及用量查询
+    ```cpp
+    template <>
+    void* Alloc<platform::CPUPlace>(platform::CPUPlace place, size_t size) {
+      VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
+      void* p = GetCPUBuddyAllocator()->Alloc(size);
+      VLOG(10) << "  pointer=" << p;
+      return p;
+    }
+
+    template <>
+    void Free<platform::CPUPlace>(platform::CPUPlace place, void* p) {
+      VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
+      GetCPUBuddyAllocator()->Free(p);
+    }
+
+    template <>
+    size_t Used<platform::CPUPlace>(platform::CPUPlace place) {
+      return GetCPUBuddyAllocator()->Used();
+    }
+    ```
+</font>
+
+---
+### ==5.== 多设备支持
+
+---
+### 多设备支持（一）
+
+<font size=5>
+
+- step 1：添加Place类型，<span style="background-color:#DAB1D5;">由用户实现添加到框架</span>
+   - 可以将Place类型理解为一个整数加上一个枚举型，包括：设备号 + 设备类型
+
+    <p align="center">
+    <img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/place.png" width=40%>
+    </p>
+- DeviceContext
+    - 不同的Place会对应一个相应的DeviceContext，用于组织管理与设备相关的信息
+      - 例如，GpuDeviceContext中会管理Cuda stream
+    - 目前实现中一些特殊的库也会对应有自己的DeviceContext：例如：
+      ```cpp
+      class MKLDNNDeviceContext : public CPUDeviceContext {……}
+      ```
+    - 每种设备对应的DeviceContext需要管理的内容不尽相同，视具体需求来实现
+
+</font>
+
+---
+
+### 多设备支持（二）
+
+<font size=5>
+
+- step 2: 增加KernelType，为相应的KernelType注册Kernel对象，<span style="background-color:#DAB1D5;">由用户实现注册给框架</span> 可以按照：
+    1. Place 执行设备
+    1. DataType 执行数据类型 FP32/FP64/INT32/INT64
+    1. Memory layout： 运行时 Tensor 在内存中的排布格式 NCHW、 NHWC
+    1. 使用的库
+
+    来区分Kernel，为同一个operator注册多个 Kernel。
+
+    ```cpp
+    struct OpKernelType {
+      proto::DataType data_type_;
+      DataLayout data_layout_;
+      platform::Place place_;
+      LibraryType library_type_;
+    }
+    ```
+
+</font>
+
+---
+
+### 多设备支持（三）
+
+<font size=5>
+
+step 3: 运行时的 KernelType 推断和Kernel切换，<span style="background-color:#DAB1D5;">按需要修改Kernel推断和Kernel切换规则</span>
+- Expected Kernel：期待调用的Kernel：由（1）`Place`和计算精度决定；或（2）用户在配置中显示指定使用的计算库，如`cudnn`、`mkldnn`等。
+- Actual Kernel：运行时从`Operator`的输入（`Variable`）可以推断出实际需要的`KernelType`
+- 当Expected Kernel和Actual Kernel不一致的时候，框架会插入`data_transformer`或者`data_layerout_transform`等，保证Expected Kernel可以执行，包括：
+   - CPUPlace -> GPUPlace ：跨设备内存复制
+   - NCHW -> nChw8c ：Layout转换
+   - FP32 -> FP16 ：精度转换 _**尚未支持**_
+   - ……
+- 以上过程实现在OperatorWithKernel类的Run方法中 [->](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/operator.cc#L497)
+
+</font>
+
+---
+## ==6.== while_op
+
+---
+### while_op
+
+<font size=5>
+
+- 循环执行一段`Program`，直到条件operator判断循环条件不满足时终止循环
+- while_op 的特殊之处：
+  1. while_op 没有 kernel
+  1. while_op 拥有自己的`Block`，会形成一段嵌套的`Block`
+  1. ==while_op 内部创建了一个 Executor，来循环执行`Block`==
+
+- while_op 输入输出 ： LoDTensorArray
+    ```cpp
+    namespace paddle {
+    namespace framework {
+    using LoDTensorArray = std::vector<LoDTensor>;
+    }
+    }
+    ```
+    - 每一次循环，从原始输入中“切出”一个片段
+    - LoDTensorArray 在Python端暴露，是Fluid支持的基础数据结构之一，用户可以直接创建并使用
+
+</font>
+
+---
+### while_op [Run](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/while_op.cc#L42) 方法概览
+
+<font size=5>
+
+```cpp
+
+void Run(const framework::Scope &scope,
+         const platform::Place &dev_place) const override {
+  PADDLE_ENFORCE_NOT_NULL(scope.FindVar(Input(kCondition)));
+  auto &cond = scope.FindVar(Input(kCondition))->Get<LoDTensor>();
+  PADDLE_ENFORCE_EQ(cond.dims(), paddle::framework::make_ddim({1}));
+
+  framework::Executor executor(dev_place);
+  auto *block = Attr<framework::BlockDesc *>(kStepBlock);
+
+  auto *program = block->Program();
+  auto step_scopes =
+      scope.FindVar(Output(kStepScopes))->GetMutable<StepScopeVar>();
+
+  while (cond.data<bool>()[0]) {
+    auto &current_scope = scope.NewScope();
+    step_scopes->push_back(&current_scope);
+    executor.Run(*program, &current_scope, block->ID(),
+                   false /*create_local_scope*/);
+  }
+}
+
+```
+
+</font>
+
+---
+### while_op 的重要应用：Dynamic RNN
+
+---
+
+### 什么是 `dynamicRNN` ?
+
+<font size=5>
+<br>
+
+1. 用户可以自定义在一个时间步之内的计算, 框架接受序列输入数据，在其上循环调用用户定义的单步计算
+1. 可学习参数在多个时间步之间共享
+1. `dynamicRNN` 由 `while_op` 实现
+1. 如果`dynamicRNN`中定义了`memory`，将会构成一个循环神经网络，否则其行为就等于在输入序列上循环调用预定义的单步计算
+
+</font>
+
+---
+
+#### `dynamic RNN` 用户接口
+<font size=5>
+
+<p align="center">
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/user_interface.png" width=75%>
+</p>
+
+- `dynamicRNN` 中的重要元素
+  1. **step input**: `dynamicRNN` 每个时间步的输入
+  1. **step function**: 用户定义的单步计算
+  1. **memory**: 用于形成循环连接
+  1. **external/static memory**：单步计算的每一步都可以全部读取到的外部输入
+
+</font>
+
+---
+
+#### dynamicRNN 中的 Memory
+
+<font size=5>
+
+`dynamicRNN`中`memory`的行为非常类似于 C++ 中的引用变量
+  - `memory` “指向” 一个operator的输出变量，记作： A
+  - `memory` 可以被 LoDTensor 初始化（当LoD信息为空时，为非序列，否则为序列）,默认`memory`被初始化为零
+  - `memory` 在 operator A 前向计算之后，进行前向计算
+  - 当 `memory` 的前向计算会 "指向" A 的输出 LoDTensor
+  - `memory` 的输出可以是另一个 operator 的输入，于是形成了“循环”连接
+
+</font>
+
+---
+
+### DynamicRNN 实现细节
+
+<font size=5>
+
+- `while_op` <span style="background-color:#DAB1D5;">无法独立构成dynamicRNN</span>，必须和一组相关的 operator 及数据结构配合
+    - 依赖的 operators (这里仅列出最重要的，并非全部):
+        - `lod_rank_table` operator
+        - `lod_tensor_to_array` operator
+        - `array_to_lod_tensor` operator
+        - `shrink_memory` operator
+    - 依赖的数据结构
+        - `TensorArray`
+        - `LoDRankTable`
+
+- 在Fluid中，RNN接受变长序列输入，无需填充，以上数据结构和相关的operator配合工作，实现了对变长输入以batch计算
+
+</font>
+
+---
+
+### `dynamicRNN` 如何实现 batch 计算 ?
+
+<font size=5>
+
+- 问题：
+  - RNN 可以看作是一个展开的前向网络，前向网络的深度是最长序列的长度
+  - 如果不对变长序列进行填充，将它们填充到一样长度，每个mini-batch输入将会不等长，每个样本展开长度不一致，导致前向和反向计算实现困难
+
+</font>
+
+----
+##### 实例 ：RNN encoder-decoder with attention
+
+<font size=5>
+
+- 以机器翻译的RNN encoder-decoder 模型（涉及了`dynamicRNN`的所有设计要素）为例，下图是 RNN encoder-decoder 的原始输入：
+  <p align="center">
+  <img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/raw_input.png" width=100%><br><font size=3> Figure. RNN encoder-decoder 原始batch 输入数据</font>
+  </p>
+
+- source word sequences 是encoder RNN的输出，是一个LoDTensor
+- target word sequences 是look_uptable的输入，是一个LoDTensor
+- 上图中一个矩形方块是CPU/GPU内存中一片连续的内存空间，表示一个dense vector
+
+</font>
+
+---
+
+### `dynamicRNN` 如何实现 batch 计算 ?
+
+<font size=5>
+
+1. 对一个mini batch中不等长样本进行排序，最长样本变成batch中的第一个，最短样本是batch中最后一个
+      - `LoDTensor` -> `LoDRankTable` :heavy_plus_sign: `lod_rank_table operaator`
+          - 可以将`LoDRankTable`理解为对LoDTensor中的多个序列按照长度排序LoDRankTable 存储了排序之后的index
+
+2. 构建每个时间步的batch输入：随着时间步增加，每个时间步的batch输入可能会逐渐缩小
+    - `TensorArray` :heavy_plus_sign: `lod_tensor_to_array` -> `LoDTensor` (without LoD)
+3. 每个时间步输出写入一个输出 `LoDTensorArray`
+3. `dynamicRNN`循环结束后, 按照`LoDRankTable`中记录的信息对输出`LoDTensorArray`重排序，还原会原始输入顺序
+    - `TensorArray` :heavy_plus_sign: `array_to_lod_tensor` -> `LoDTensor`
+
+</font>
+
+---
+
+### 运行实例
+
+<p align="center">
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/sorted_input.png" width=100%>
+</p>
+
+---
+### 运行实例
+
+<p align="center">
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/1.png" width=100%>
+</p>
+
+<font size=5>
+
+- 执行到第5~7个batch时，batch size将会缩小
+
+</font>
+
+---
+### 运行实例
+
+<p align="center">
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/1.png" width=80%>
+</p>
+
+<font size=5>
+
+- 第5 ~ 7个batch时RNN的`memory`会发生什么？
+    - `memory` 指向某个operator的输出Tensor，在该operator前向计算之后，“取回”其计算结果
+    - 5 ~ 7时，遇到了序列的结束，==下一个时间步计算不再需要在已经结束的序列上展开==
+    - 在`dynamicRNN`中`shrink_memory` operator 用来缩小`memory`的batch输入
+
+</font>
+
+---
+### 运行实例：batch 1 ~ 2
+
+<p align="center">
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/2.png" width=70%><br><font size=4>Figure. 第1、2个batch输入dynamicRNN的batch输入</font>
+</p>
+
+---
+### 运行实例：batch 3 ~ 4
+
+<p align="center">
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/3.png" width=70%><br><font size=4>Figure. 第3、4个batch输入dynamicRNN的batch输入</font>
+</p>
+
+---
+
+### 运行实例：batch 5 ~ 7
+
+<p align="center">
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/4.png" width=70%><br><font size=4>Figure. 第5、6、7个batch输入dynamicRNN的batch输入</font>
+</p>
+
+---
+### ==7.== Fluid 代码结构
+
+---
+### Fluid 代码结构
+
+<table>
+<thead>
+<tr>
+<th>代码结构</th>
+<th>模块结构</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>
+<p align="center">
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/fluid_module_1.png" width=60%>
+</p>
+</td>
+<td>
+<p align="center">
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/fluid_module_2.png" width=60%>
+</p>
+</td>
+</tr>
+
+</tbody>
+</table>
+
+---
+
+### ==8.== 文档总结
+
+---
+<font size=5>
+
+- 设计概览
+  - 重构概览 [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/refactorization.md)
+  - fluid [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/fluid.md)
+  - fluid_compiler [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/motivation/fluid_compiler.md)
+- 核心概念
+  - variable 描述 [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/var_desc.md)
+  - Tensor [->](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/tensor.md)
+  - LoDTensor [->](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor.md)
+  - TensorArray [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/tensor_array.md)
+  - Program [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/program.md)
+  - Block [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/block.md)
+  - Scope [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/scope.md)
+
+---
+
+- 重要功能模块
+  - backward [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/backward.md)
+  - 内存优化 [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/memory_optimization.md)
+  - evaluator [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/executor.md)
+  - python API [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md)
+  - regularization [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/regularization.md)
+
+- 开发指南
+  - 支持新设硬件设备库 [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/support_new_device.md)
+  - 添加新的Operator [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/new_op_cn.md)
+  - 添加新的Kernel [->](
+https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/new_op_kernel_en.md)
+
+</font>
+
+---
+
+### ==9.== 开发指南
+
+---
+
+#### 建议开发环境：使用 Docker 编译和测试
+
+<font size=5>
+
+Docker编译PaddlePaddle源码: [->](http://www.paddlepaddle.org/docs/develop/documentation/fluid/zh/build_and_install/docker_install_cn.html)
+
+PaddlePaddle 在 Dockerhub 地址：[->](
+    https://hub.docker.com/r/paddlepaddle/paddle/tags/)
+
+1. 获取PaddlePaddle的Docker镜像
+    ```bash
+    docker pull paddlepaddle/paddle:latest-dev
+    ```
+
+1. 启动 docker container
+
+    ```bash
+    docker run -it -v $PWD/Paddle:/paddle paddlepaddle/paddle:latest-dev /bin/bash
+    ```
+
+1. 进入docker container后，从源码编译，请参考文档 [->]( http://www.paddlepaddle.org/docs/develop/documentation/fluid/zh/build_and_install/build_from_source_cn.html)
+
+</font>
+
+---
+
+### 一些说明
+
+<font size=5>
+
+1. PaddlePaddle的Docker镜像为了减小体积，默认没有安装vim，可以在容器中执行`apt-get install -y vim`来安装vim。
+1. 开发推荐使用tag为`latest-dev`的镜像，其中打包了所有编译依赖。`latest`及`lastest-gpu`是production镜像，主要用于运行PaddlePaddle程序。
+2. 在Docker中运行GPU程序，推荐使用nvidia-docker，[否则需要将CUDA库和设备挂载到Docker容器内](http://www.paddlepaddle.org/docs/develop/documentation/fluid/zh/build_and_install/docker_install_cn.html)。
+   <font size=4>
+
+   ```bash
+   nvidia-docker run -it -v $PWD/Paddle:/paddle paddlepaddle/paddle:latest-dev /bin/bash
+   ```
+   </font>
+
+
+</font>
+
+---
+
+### [如何贡献](http://www.paddlepaddle.org/docs/develop/documentation/fluid/zh/dev/contribute_to_paddle_cn.html)
+
+<font size=5>
+
+- ==提交PullRequest前请务必阅读==： [->](http://www.paddlepaddle.org/docs/develop/documentation/fluid/zh/dev/contribute_to_paddle_cn.html)
+- 代码要求
+    1. 代码注释遵守 Doxygen 的样式
+    1. 确保编译器选项 WITH_STYLE_CHECK 已打开，并且编译能通过代码样式检查
+    1. 所有代码必须具有单元测试，且能够通过所有单元测试
+- 使用 `pre-commit` 钩子提交Pull Request
+    1. 帮助格式化源代码（C++，Python）
+    1. 在提交前自动检查一些基本事宜：如每个文件只有一个 EOL，Git 中不要添加大文件等
+    1. 安装pre-commit，并在PaddlePaddle根目录运行：
+    ```bash
+      ➜  pip install pre-commit
+      ➜  pre-commit install
+    ```
+</font>
+
+---
+
+### 如何贡献
+
+<font size=5>
+
+1. 开始开发之前请先建立issue。
+    - 让其它同学知道某项工作已经有人在进行，以避免多人开发同一功能的情况。
+1. 提交PR必须关联相关的issue。做法请参考：[->](https://help.github.com/articles/closing-issues-using-keywords/)
+    - 目的：为了在提交的版本中留有记录描述这个PR是为了开发什么样的功能，为了解决什么样的问题。
+    - 当PR被merge后，关联的issue会被自动关闭。
+1. PR review 中，reviewer的每条comment都必须回复。
+    - 如修改完可直接回复：Done。
+    - 目的：review comment 中可能会有（1）询问类型的问题；（2）可以在下一个PR修改的问题；（3）comment意见不合理等。需要明确回复，以便reviewer和其他人有历史可查，便于区分是否已经进行修改，或者准备下一个PR修改，或者意见不合理可以不用进行修改。
+
+</font>
+
+---
+
+### ==10.== 添加新的 Operator
+
+---
+
+### 概念简介
+
+<font size=5>
+
+添加一个新的operator，会涉及实现以下C++类的派生类：
+
+1. `framework::OperatorBase`: Operator(简写，Op)基类。
+1. `framework::OpKernel`: Op计算函数的基类，称作Kernel。
+1. `framework::OperatorWithKernel`：继承自OperatorBase，Op有计算函数，称作有Kernel。
+1. `class OpProtoAndCheckerMaker`：描述该Op的输入、输出、属性、注释,主要用于Python API接口生成
+
+依据是否包含kernel，可以将Op分为两种：
+1. 包含Kernel的Op：继承自OperatorWithKernel，==绝大多数operator都属于这一类==
+1. 不包含kernel的Op，继承自OperatorBase，只有少量Op属于这一类，例如while_op，ifelse_op
+
+<span style="background-color:#DAB1D5;">这里主要介绍带Kernel的Op如何编写。</span>
+
+</font>
+
+---
+
+#### 添加新的Operator需要修改/添加哪些文件？
+
+<font size=5>
+
+<table>
+<thead>
+<tr>
+<th>内容</th>
+<th>定义位置</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>
+OpProtoMake定义
+</td>
+<td>
+`.cc`文件，<span style="background-color:#DAB1D5;">Backward Op不需要OpProtoMaker</span>
+</td>
+</tr>
+<tr>
+<td>
+Op定义
+</td>
+<td>
+`.cc`文件
+</td>
+</tr>
+<tr>
+<td>
+Kernel实现
+</td>
+<td>
+<span style="background-color:#DAB1D5;">CPU、CUDA共享Kernel实现在`.h`文件中</span>，否则，CPU 实现在`.cc`文件中，CUDA 实现在`.cu`文件中。
+</td>
+</tr>
+
+<tr>
+<td>
+注册Op
+</td>
+<td>
+Op注册实现在`.cc`文件；Kernel注册CPU实现在`.cc`文件中，CUDA实现在`.cu`文件中
+</td>
+</tr>
+
+</tbody>
+</table>
+
+- 添加 Operator 之前请阅读：[Operator 命名规范](https://github.com/PaddlePaddle/Paddle/blob/63cca04cfd488a4dab6d6273fd04a8017ef45932/doc/fluid/dev/name_convention.md)及[Operator Markdown注释规范](https://github.com/PaddlePaddle/Paddle/blob/63cca04cfd488a4dab6d6273fd04a8017ef45932/doc/fluid/dev/op_markdown_format.md)。
+- 实现新的op都添加至目录[paddle/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/operators)下，文件命名以`*_op.h`（如有） 、 `*_op.cc` 、`*_op.cu`（如有）结尾。
+- 根据文件名自动构建op和Python端绑定，<span style="background-color:#DAB1D5;">请务必遵守以上命名，否则需要进一步修改PyBind相关文件及CMakeLists.txt</span>。
+</font>
+
+---
+
+###### 实现带Kernel的Operator <span style="background-color:#c4e1e1;">step1</span>: 定义ProtoMaker类
+
+<font size=5>
+
+下面均以[clip_op](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/clip_op.h)为例进行介绍
+
+- clip_op计算公式：$Out = \min(\max(X, min), max)$
+- 首先定义`ProtoMaker`来描述该Op的输入、输出，并添加注释（<font size=4>*下面代码段的中注释进行了简化，实现时需按照规范添加注释*</font>）：
+
+    ```cpp
+    template <typename AttrType>
+    class ClipOpMaker : public framework::OpProtoAndCheckerMaker {
+     public:
+      ClipOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+          : OpProtoAndCheckerMaker(proto, op_checker) {
+        AddInput("X","(Tensor)The input of clip op.");
+        AddOutput("Out", "(Tensor),The output of clip op.");
+        AddAttr<AttrType>(
+            "min", "(float),Minimum value.");
+        AddAttr<AttrType>(
+            "max", "(float),Maximum value.");
+        AddComment(R"DOC(
+        ……
+    )DOC");
+      }
+    };
+    ```
+
+</font>
+
+---
+
+###### 实现带Kernel的Operator <span style="background-color:#c4e1e1;">step2</span>: 定义Operator类
+
+<font size=5>
+
+下面的代码段实现了`clip_op`的定义：
+
+```cpp
+class ClipOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of ClipOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of ClipOp should not be null.");
+    auto x_dims = ctx->GetInputDim("X");
+    auto max = ctx->Attrs().Get<float>("max");
+    auto min = ctx->Attrs().Get<float>("min");
+    PADDLE_ENFORCE_LT(min, max, "max should be greater than min.");
+    ctx->SetOutputDim("Out", x_dims);
+    ctx->ShareLoD("X", /*->*/ "Out");
+  }
+};
+```
+</font>
+
+---
+
+### Operator 类中需要完成的工作
+
+<font size=5>
+
+1. clip_op 继承自`OperatorWithKernel`，
+
+    ```cpp
+    using framework::OperatorWithKernel::OperatorWithKernel;
+    ```
+    表示使用基类`OperatorWithKernel`的构造函数。
+
+1. 重写`InferShape`接口。
+    - `InferShape` 为const函数，不能修改Op的成员变
+    - `InferShape` 的参数为 `const framework::InferShapeContext &ctx`，从中可获取到输入输出以及属性
+    - `InferShape` 会被调用两次，一次是编译时（创建op），一次是运行时（调用op的`Run`方法时），需要完成以下功能：
+        1. 做检查， 尽早报错：检查输入数据维度、类型等是否合法
+        2. 设置输出Tensor的形状
+
+<span style="background-color:#DAB1D5;">通常`OpProtoMaker`和`Op`类的定义写在`.cc`文件中。</span>
+
+</font>
+
+---
+
+### 补充说明
+
+<font size=5>
+
+1. `InferShape`目前支持两种实现方式，<span style="background-color:#DAB1D5;">二者最后都会生成一个functor注册给OpInfo结构体。</span>
+    1. 继承framework::InferShapeBase，实现为一个functor（参考 [mul_op](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/mul_op.cc#L22)）
+    2. override InferShape函数（参考 [clip_op](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/clip_op.cc#L24)）
+
+1. 什么是`functor` ?
+
+   - 类或结构体仅重载了`()`，一般是可被多个kernel复用的计算函数。
+
+        <font size=4>
+
+        ```cpp
+        template <typename T>
+        class CrossEntropyFunctor<platform::CPUDeviceContext, T> {
+         public:
+          void operator()(const platform::CPUDeviceContext& ctx,
+                          framework::Tensor* out,
+                          const framework::Tensor* prob,
+                          const framework::Tensor* labels, const bool softLabel) {
+               ……
+          }
+        };
+        ```
+        </font>
+
+    - 在 clip_op 内也会看到将一段计算函数抽象为functor的使用法： [->](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/clip_op.h#L27)。
+
+</font>
+
+---
+
+###### 实现带Kernel的Operator <span style="background-color:#c4e1e1;">step3</span>: 定义OpKernel类
+
+<font size=5>
+
+- `ClipKernel`继承自`framework::OpKernel`，带有下面两个模板参数:
+    1. `typename DeviceContext`: 表示设备类型，不同设备共享同一个Kernel时，需添加该模板参数。不共享时，需要提供针对不同设备的特化实现。
+    1. `typename T` : 表示支持的数据类型，如`float`, `double`等
+
+- 在`ClipKernel`类中重写`Compute`方法
+    1. `Compute`接受输入参数：`const framework::ExecutionContext& context`
+        - `ExecutionContext` 是从 `Scope`中将运行时Op的输入、输出`Variable`组织在一起，使得Op在调用`Compute`方法时，能够简单地通过名字拿到需要的输入输出`Variable`
+        - 与`InferShapeContext`相比，`ExecutionContext` 中增加了设备类型
+    1. 在`Compute`函数里实现`OpKernel`的具体计算逻辑
+
+</font>
+
+---
+#### ClipKernel 代码概览
+
+<font size=5>
+
+```cpp
+template <typename DeviceContext, typename T>
+class ClipKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto max = context.Attr<T>("max");
+    auto min = context.Attr<T>("min");
+    auto* x = context.Input<Tensor>("X");
+    auto* out = context.Output<Tensor>("Out");
+    T* out_data = out->mutable_data<T>(context.GetPlace());
+    const T* x_data = x->data<T>();
+    int64_t numel = x->numel();
+    Transform<DeviceContext> trans;
+    trans(context.template device_context<DeviceContext>(), x_data,
+          x_data + numel, out_data, ClipFunctor<T>(min, max));
+  }
+};
+```
+
+- 为了使`OpKernel`的计算过程书写更加简单，并且CPU、CUDA的代码可以复用， Fluid 使用 Eigen 作为基础的矩阵运算库
+- Fluid对Eigen unsupported Tensor提供了一些基本的封装，可以在`Compute`接口中直接调用
+    - 关于在PaddlePaddle中如何使用Eigen库，请参考[使用文档](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/use_eigen_cn.md)。
+
+</font>
+
+---
+###### 实现带Kernel的Operator <span style="background-color:#c4e1e1;">step4</span>: 实现反向Op
+
+<font size=5>
+
+- ==**反向Op没有`ProtoMaker`**==，除此之外定义与实现方式前向Op完全一致，不再赘述
+- 这里仅对反向Op的输入输出进行说明：
+    1. 反向Op的输入
+        - 前向Op的输出
+        - 反向传播过程中传递给当前Op的梯度
+            - 需要注意，<span style="background-color:#e1c4c4;">Fluid中，不区分Cost Op和中间层Op，所有Op都必须正确处理接收到的梯度</span>
+    2. 反向Op的输出
+        - 对可学习参数的求导结果
+        - 对所有输入的求导结果
+
+
+</font>
+
+---
+
+###### 实现带Kernel的Operator <span style="background-color:#c4e1e1;">step5</span>: 注册Op及Kernel
+
+<font size=5>
+
+至此Op和Op kernel都已经实现完毕，接下来，需要在`.cc`和`cu`文件中注册op和kernel
+
+1. 在`.cc`文件中注册前向、反向Op类，注册CPU Kernel。
+
+    <font size=4>
+
+    ```cpp
+    namespace ops = paddle::operators;
+    REGISTER_OP(clip, ops::ClipOp, ops::ClipOpMaker<float>, clip_grad,
+                ops::ClipOpGrad);
+    REGISTER_OP_CPU_KERNEL(
+        clip, ops::ClipKernel<paddle::platform::CPUDeviceContext, float>);
+    REGISTER_OP_CPU_KERNEL(
+        clip_grad, ops::ClipGradKernel<paddle::platform::CPUDeviceContext, float>);
+    ```
+
+   - 在上面的代码片段中：
+
+     1. `REGISTER_OP` ： 注册`ops::ClipOp`类，类型名为`clip`，该类的`ProtoMaker`为`ops::ClipOpMaker`，注册`ops::ClipOpGrad`，类型名为`clip_grad`
+     1. `REGISTER_OP_WITHOUT_GRADIENT` ： 用于注册没有反向的Op，例如：优化算法相关的Op
+     1. `REGISTER_OP_CPU_KERNEL` ：注册`ops::ClipKernel`类，并特化模板参数为`paddle::platform::CPUPlace`和`float`类型，同理，注册`ops::ClipGradKernel`类
+
+    </font>
+1. 按照同样方法，在`.cu`文件中注册GPU Kernel
+   -  <span style="background-color:#e1c4c4;">如果CUDA Kernel的实现基于Eigen，需在 `.cu`的开始加上宏定义 `#define EIGEN_USE_GPU` </span>
+
+</font>
+
+---
+
+##### 编译和Python端绑定
+
+<font size=5>
+
+- 运行下面命令可以仅编译新添加的Op：
+
+  ```
+  make mul_op
+  ```
+  - <span style="background-color:#e1c4c4;">需注意，运行单元测试需要编译整个工程</span>
+
+- 如果遵循前文的文件命名规则，构建过程中，会自动为新增的op添加Python端绑定，并链接到生成的lib库中
+
+</font>
+
+---
+
+###### 实现带Kernel的Operator <span style="background-color:#c4e1e1;">step6</span>: 添加前向单测及梯度检测
+
+<font size=5>
+
+- 新增Op的单元测试统一添加至：[python/paddle/v2/fluid/tests/unittests](https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/fluid/tests/unittests)目录
+- 前向Operator单测
+
+    1. Op单元测试继承自`OpTest`，各项具体的单元测试在`TestClipOp`里完成，所有单测case都以`TestXX`命名
+    1. 单元测试Operator，需要：
+        1. 在`setUp`函数定义输入、输出，以及相关的属性参数
+        1. 生成随机的输入数据
+        1. 在Python脚本中实现与前向operator相同的计算逻辑，得到输出值，与operator前向计算的输出进行对比
+        1. 反向梯度检测流程测试框架已经实现，直接调用相应接口`check_grad`即可
+
+- `clip_op` 单测代码请参考 [->](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/unittests/test_clip_op.py)，这里不再展开
+
+</font>
+
+---
+#### 编译执行单测
+
+<font size=5>
+
+- `python/paddle/v2/framework/tests` 目录下新增的 `test_*.py` 单元测试会被自动加入工程进行编译
+
+    - <span style="background-color:#e1c4c4;">运行单元测试测时需要编译整个工程，并且编译时需要打开`WITH_TESTING`</span>, 即`cmake paddle_dir -DWITH_TESTING=ON`
+- 编译成功后，执行下面的命令来运行单元测试：
+
+  ```bash
+  make test ARGS="-R test_mul_op -V"
+  ```
+
+  或者:
+
+  ```
+  ctest -R test_mul_op
+  ```
+</font>
+
+---
+
+### 添加Op的一些注意事项
+
+<font size=5>
+
+- 为每个Op创建单独的`*_op.h`（如有）、`*_op.cc`和`*_op.cu`（如有）。<span style="background-color:#e1c4c4;">不允许一个文件中包含多个Op</span>，将会导致编译出错。
+- 注册Op时的类型名，需要和该Op的名字一样。<span style="background-color:#e1c4c4;">不允许在`A_op.cc`里面，注册`REGISTER_OP(B, ...)`</span>，会导致单元测试出错。
+- 如果Op<span style="background-color:#e1c4c4;">没有实现CUDA Kernel，不要创建空的`*_op.cu`</span>，会导致单元测试出错。
+- 如果多个Op依赖一些共用的函数，可以创建非`*_op.*`格式的文件来存放，如`gather.h`文件。
+
+</font>
+
+---
+
+### ==10.== 使用相关问题
+
+---
+
+### 定义前向计算
+
+<font size=5>
+
+- 当在python端执行时：
+    ```python
+    import paddle.v2.fluid as fluid
+    ```
+    [`framework.py`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/framework.py#L1040)定义了两个全局`Program`:
+    ```python
+    # program is a global instance.
+    _main_program_ = Program()
+    _startup_program_ = Program()
+    ```
+
+- 前向定义的过程就是不断往`mian_program`中添加Op和Variable
+- 如果需要执行一个新的`mian_program`时，可以调用调用：
+    ```python
+    def switch_main_program(program):
+        """
+        Switch the main program to a new program.
+        This funtion returns the previous main program.
+        """
+        ……
+    ```
+</font>
+
+---
+
+### 自定义参数的初始化
+
+<font size=5>
+
+- 调用`fluid.ParamAttr(……)`接口，自定义参数的初始化
+
+  ```python
+  w_param_attrs = ParamAttr(name=None,
+      initializer=UniformInitializer(low=-1.0, high=1.0, seed=0),
+      learning_rate=1.0,
+      regularizer=L1Decay(1.0),
+      trainable=True,
+      clip=GradientClipByValue(-1.0, 1.0),
+  )
+  y_predict = fluid.layers.fc(input=x, size=1, param_attr=w_param_attrs)
+  ```
+
+- 补充问题：如何创建 `Variable`
+  ```python
+  cur_program = Program()
+  cur_block = cur_program.current_block()
+  new_var = cur_block.create_var(name="X", shape=[-1, 16, 16], dtype="float32")
+  ```
+
+</font>
+
+---
+
+### 添加反向Op
+
+<font size=5>
+
+- 调用`fluid.backward.append_backward(X)`（`X`是一个Variable），来为一段前向`ProgramDesc`添加反Op
+
+    ```python
+    data = fluid.layers.data(name="data", shape=(2,3,4))
+    out = fluid.layers.fc(input=data,size=128,act=None)
+    loss = fluid.layers.reduce_sum(out)
+    fluid.backward.append_backward(loss=loss)
+    ```
+
+- 添加优化相关的Op
+    ```python
+    sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
+    sgd_optimizer.minimize(loss)
+    ```
+
+- 可以随时调用`print(fluid.default_main_program())`来输出当前的`main_program`
+
+- 当构建完成整个`Program`后，调用下面的接口执行内存优化：
+  ```python
+  fluid.memory_optimize(fluid.default_main_program())
+  ```
+  - _<span style="background-color:#e1c4c4;">注：内存优化目前仍在持续开发中，有可能不够稳定。</span>_
+
+</font>
+
+---
+
+### 总结：编译时执行流程
+
+<font size=5>
+
+- 用户定义前向计算
+- 添加反向Op到`default_main_program`
+- 添加 gradient clipping Op 到
+- 添加 regularization Op 到`default_main_program`
+- 为指定的优化算法，添加相关的状态 variable of optimizer 到`default_startup_program`
+    - 状态相关 variable是指如学习率, 历史 momentum, 二阶momentum等
+- 添加初始化 variable 的Op 到 `default_startup_program`
+- 为整个网络最后一个op，添加设置其接受到的梯度的Op到`default_main_program`
+- 进行内存优化规划
+
+</font>
+
+---
+
+### Feed 数据 (一)：通过 feed 字典
+
+<font size=5>
+
+- 执行executor的run方法时，指定feed字典，feed op 会将指定的数据放到`x`和`y`两个Variable中
+  ```python
+  y_data = np.random.randint(0, 8, [1]).astype("int32")
+  y_tensor = core.Tensor()
+  y_tensor.set(y_data, place)
+
+  x_data = np.random.uniform(0.1, 1, [11, 8]).astype("float32")
+  x_tensor = core.Tensor()
+  x_tensor.set(x_data, place)
+  ……
+  cost = exe.run(
+      fluid.default_main_program(),
+      feed={'x': x_tensor,
+            'y': y_tensor},
+      fetchlist=[avg_cost])
+  ```
+
+- 这种方法较为底层，一般用于单测中
+
+</font>
+
+---
+
+### Feed 数据 (二)：使用 DataFeeder接口
+
+<font size=5>
+
+- 编写一个data_reader函数，data_reader是一个Python generator
+
+  ```python
+  def demo_reader():
+      def random_generator():
+          yield np.random.uniform(0.1, 1, [4]), np.random.randint(0, 1, [1])
+      return random_generator
+  ```
+- 在训练任务中使用 DataFeeder 接口
+  ```python
+  cost = exe.run(
+      fluid.default_main_program(),
+      feed={'x': x_tensor,
+            'y': y_tensor},
+      fetchlist=[avg_cost])
+
+  train_reader = paddle.batch(
+      paddle.reader.shuffle(demo_reader(), buf_size=500), batch_size=4)
+  feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
+  for data in train_reader():
+      cost = exe.run(
+          fluid.default_main_program(),
+          feed=feeder.feed(data),
+          fetch_list=[cost])
+  ```
+
+</font>
+
+---
+
+### 常见问题
+
+<font size=5>
+
+- 如何使用 evaluator ? [->](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_label_semantic_roles.py#L168)
+
+    ```python
+    accuracy = fluid.evaluator.Accuracy(input=predict, label=label)
+    for pass_id in range(PASS_NUM):
+        accuracy.reset()
+        for data in train_reader():
+            loss, acc = exe.run(fluid.default_main_program(),
+                                feed=feeder.feed(data),
+                                fetch_list=[avg_cost] + accuracy.metrics)
+             pass_acc = accuracy.eval(exe)
+             # acc 当前一个batch 的 accuracy
+             # pass_acc 当前batch 的 accuracy
+         pass_total_acc = accuracy.eval(exe)  # 整个pass的accuracy
+    ```
+
+- 如何在训练中测试？[->](https://github.com/dzhwinter/benchmark/blob/master/fluid/vgg16.py#L144)
+- 如何保存训练好的模型？[->](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_recognize_digits.py#L143)
+- 如何加载训练好的模型进行预测？[->](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_recognize_digits.py#L154)
+- 如何在同一个训练任务中定义多个Program，并交替运行？ [->](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/demo/fc_gan.py)
+- 如何profile？Fluid 实现了profile 工具，可以直接调用。请参考示例 [->](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/unittests/test_profiler.py)
+
+
+</font>
+
+---
diff --git a/doc/fluid/getstarted/index_cn.rst b/doc/fluid/getstarted/index_cn.rst
index 75af7354be93a6eeabfa9ccf86903505402a7ca6..3daea71d0933a2774227ff2b5e744392ca6b1765 100644
--- a/doc/fluid/getstarted/index_cn.rst
+++ b/doc/fluid/getstarted/index_cn.rst
@@ -17,3 +17,4 @@
   :maxdepth: 1
 
   concepts/use_concepts_cn.rst
+  developer's_guide_to_paddle_fluid.md
diff --git a/doc/fluid/getstarted/index_en.rst b/doc/fluid/getstarted/index_en.rst
index 75a43f4af87c34830ec940068196e6ca72640501..fb20bb4f245281c3acf67c417979dc63c144fef3 100644
--- a/doc/fluid/getstarted/index_en.rst
+++ b/doc/fluid/getstarted/index_en.rst
@@ -16,3 +16,4 @@ Here is an example of linear regression. It introduces workflow of PaddlePaddle,
   :maxdepth: 1
 
   concepts/index_en.rst
+  developer's_guide_to_paddle_fluid.md
diff --git a/doc/fluid/getstarted/quickstart_cn.rst b/doc/fluid/getstarted/quickstart_cn.rst
index 135beb75d0330f39d062753aa2aa83a077f36bb1..6a964d4f8561f30aa10936d2399698c51583442c 100644
--- a/doc/fluid/getstarted/quickstart_cn.rst
+++ b/doc/fluid/getstarted/quickstart_cn.rst
@@ -11,7 +11,7 @@ PaddlePaddle支持使用pip快速安装，目前支持CentOS 6以上, Ubuntu 14.
 
      pip install paddlepaddle
 
-如果需要安装支持GPU的版本（cuda7.5_cudnn5_avx_openblas），需要执行：
+如果需要安装支持GPU的版本（cuda8.0_cudnn5_avx_openblas），需要执行：
 
   .. code-block:: bash
 
@@ -28,18 +28,18 @@ PaddlePaddle支持使用pip快速安装，目前支持CentOS 6以上, Ubuntu 14.
 
      import paddle.dataset.uci_housing as uci_housing
      import paddle.fluid as fluid
-  
+
      with fluid.scope_guard(fluid.core.Scope()):
          # initialize executor with cpu
          exe = fluid.Executor(place=fluid.CPUPlace())
-         # load inference model 
+         # load inference model
          [inference_program, feed_target_names,fetch_targets] =  \
              fluid.io.load_inference_model(uci_housing.fluid_model(), exe)
          # run inference
-         result = exe.run(inference_program, 
-                          feed={feed_target_names[0]: uci_housing.predict_reader()}, 
+         result = exe.run(inference_program,
+                          feed={feed_target_names[0]: uci_housing.predict_reader()},
                           fetch_list=fetch_targets)
-         # print predicted price is $12,273.97 
+         # print predicted price is $12,273.97
          print 'Predicted price: ${:,.2f}'.format(result[0][0][0] * 1000)
 
 执行 :code:`python housing.py` 瞧！ 它应该打印出预测住房数据的清单。
diff --git a/doc/fluid/getstarted/quickstart_en.rst b/doc/fluid/getstarted/quickstart_en.rst
index df6619cfd039fc1fdca8cde57db9cc6aebf8f029..680122f25893a5a48fac103266bda4788f891f6d 100644
--- a/doc/fluid/getstarted/quickstart_en.rst
+++ b/doc/fluid/getstarted/quickstart_en.rst
@@ -12,7 +12,7 @@ Simply run the following command to install, the version is cpu_avx_openblas:
 
      pip install paddlepaddle
 
-If you need to install GPU version (cuda7.5_cudnn5_avx_openblas), run:
+If you need to install GPU version (cuda8.0_cudnn5_avx_openblas), run:
 
   .. code-block:: bash
 
@@ -31,18 +31,18 @@ code:
 
      import paddle.dataset.uci_housing as uci_housing
      import paddle.fluid as fluid
-  
+
      with fluid.scope_guard(fluid.core.Scope()):
          # initialize executor with cpu
          exe = fluid.Executor(place=fluid.CPUPlace())
-         # load inference model 
+         # load inference model
          [inference_program, feed_target_names,fetch_targets] =  \
              fluid.io.load_inference_model(uci_housing.fluid_model(), exe)
          # run inference
-         result = exe.run(inference_program, 
-                          feed={feed_target_names[0]: uci_housing.predict_reader()}, 
+         result = exe.run(inference_program,
+                          feed={feed_target_names[0]: uci_housing.predict_reader()},
                           fetch_list=fetch_targets)
-         # print predicted price is $12,273.97 
+         # print predicted price is $12,273.97
          print 'Predicted price: ${:,.2f}'.format(result[0][0][0] * 1000)
 
 Run :code:`python housing.py` and voila! It should print out a list of predictions
diff --git a/doc/fluid/howto/cluster/fluid_recordio.md b/doc/fluid/howto/cluster/fluid_recordio.md
new file mode 100644
index 0000000000000000000000000000000000000000..55ce63ec193948424cd0b87f13d56b9cf6154dfc
--- /dev/null
+++ b/doc/fluid/howto/cluster/fluid_recordio.md
@@ -0,0 +1,127 @@
+# How to use RecordIO in Fluid
+
+If you want to use RecordIO as your training data format, you need to convert to your training data
+to RecordIO files and reading them in the process of training, PaddlePaddle Fluid provides some
+interface to deal with the RecordIO files.
+
+## Generate RecordIO File
+
+Before start training with RecordIO files, you need to convert your training data
+to RecordIO format by `fluid.recordio_writer.convert_reader_to_recordio_file`, the sample codes
+as follows:
+
+```python
+    reader = paddle.batch(mnist.train(), batch_size=1)
+    feeder = fluid.DataFeeder(
+        feed_list=[  # order is image and label
+            fluid.layers.data(
+            name='image', shape=[784]),
+            fluid.layers.data(
+            name='label', shape=[1], dtype='int64'),
+        ],
+        place=fluid.CPUPlace())
+    fluid.recordio_writer.convert_reader_to_recordio_file('./mnist.recordio', reader, feeder)
+```
+
+The above code snippet would generate a RecordIO `./mnist.recordio` on your host.
+
+**NOTE**: we recommend users to set `batch_size=1` when generating the recordio files so that users can
+adjust it flexibly while reading it.
+
+## Use the RecordIO file in a Local Training Job
+
+PaddlePaddle Fluid provides an interface `fluid.layers.io.open_recordio_file` to load your RecordIO file
+and then you can use them as a Layer in your network configuration, the sample codes as follows:
+
+```python
+    data_file = fluid.layers.io.open_recordio_file(
+        filename="./mnist.recordio",
+        shapes=[(-1, 784),(-1, 1)],
+        lod_levels=[0, 0],
+        dtypes=["float32", "int32"])
+    data_file = fluid.layers.io.batch(data_file, batch_size=4)
+
+    img, label = fluid.layers.io.read_file(data_file)
+    hidden = fluid.layers.fc(input=img, size=100, act='tanh')
+    prediction = fluid.layers.fc(input=hidden, size=10, act='softmax')
+    loss = fluid.layers.cross_entropy(input=prediction, label=label)
+    avg_loss = fluid.layers.mean(loss)
+
+    fluid.optimizer.Adam(learning_rate=1e-3).minimize(avg_loss)
+
+    place = fluid.CPUPlace()
+
+    exe = fluid.Executor(place)
+    exe.run(fluid.default_startup_program())
+    avg_loss_np = []
+
+    # train a pass
+    batch_id = 0
+    while True:
+        tmp, = exe.run(fetch_list=[avg_loss])
+
+        avg_loss_np.append(tmp)
+        print(batch_id)
+        batch_id += 1
+```
+
+## Use the RecordIO files in Distributed Training
+
+1. generate multiple RecordIO files
+
+For a distributed training job, you may have multiple trainer nodes,
+and one or more RecordIO files for one trainer node, you can use the interface
+`fluid.recordio_writer.convert_reader_to_recordio_files` to convert your training data
+into multiple RecordIO files, the sample codes as follows:
+
+```python
+    reader = paddle.batch(mnist.train(), batch_size=1)
+    feeder = fluid.DataFeeder(
+        feed_list=[  # order is image and label
+            fluid.layers.data(
+            name='image', shape=[784]),
+            fluid.layers.data(
+            name='label', shape=[1], dtype='int64'),
+        ],
+        place=fluid.CPUPlace())
+    fluid.recordio_writer.convert_reader_to_recordio_files(
+          filename_suffix='./mnist.recordio', batch_per_file=100, reader, feeder)
+```
+
+The above codes would generate multiple RecordIO files on your host like:
+
+```bash
+.
+ \_mnist-00000.recordio
+ |-mnist-00001.recordio
+ |-mnist-00002.recordio
+ |-mnist-00003.recordio
+ |-mnist-00004.recordio
+```
+
+2. open multiple RecordIO files by `fluid.layers.io.open_files`
+
+For a distributed training job, the distributed operator system will schedule trainer process on multiple nodes,
+each trainer process reads parts of the whole training data, we usually take the following approach to make the training
+data allocated by each trainer process as uniform as possiable:
+
+```python
+def gen_train_list(file_pattern, trainers, trainer_id):
+   file_list = glob.glob(file_pattern)
+   ret_list = []
+   for idx, f in enumerate(file_list):
+       if (idx + trainers) % trainers == trainer_id:
+           ret_list.append(f)
+   return ret_list
+
+trainers = int(os.getenv("TRAINERS"))
+trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID"))
+data_file = fluid.layers.io.open_files(
+    filenames=gen_train_list("./mnist-[0-9]*.recordio", 2, 0),
+    thread_num=1,
+    shapes=[(-1, 784),(-1, 1)],
+    lod_levels=[0, 0],
+    dtypes=["float32", "int32"])
+img, label = fluid.layers.io.read_file(data_files)
+...
+```
diff --git a/doc/fluid/howto/cluster/nccl2_rdma_training.md b/doc/fluid/howto/cluster/nccl2_rdma_training.md
new file mode 100644
index 0000000000000000000000000000000000000000..cecd5c3a7a7339e3be6772543a534728ec132105
--- /dev/null
+++ b/doc/fluid/howto/cluster/nccl2_rdma_training.md
@@ -0,0 +1,110 @@
+# Distributed Training with NCCL2 and RDMA
+
+When doing distributed multi-GPU training, network bandwith often becomes the
+bottle neck. We introduce a way to use NCCL2 to do such training job to
+achieve best performace.
+
+## Prepare Hardwares with RDMA and Multiple GPUs
+
+I'm using two Linux servers each of them is installed with 8 GPUs and
+one 100Gb RDMA card.
+Base environment is:
+
+* OS: CentOS 7.4
+* RDMA device: "Mellanox Technologies MT27700 Family [ConnectX-4]"
+* Kernel version: `4.4.88-1.el7.elrepo.x86_64`
+* Docker version: `1.12.6`
+* Docker storage driver: `overlay2`
+* IP addresses: 192.168.16.30,192.168.16.34
+
+In general, the steps including:
+
+1. Install GPU drivers
+1. Install RDMA drivers
+1. Install "InfiniBand Support"
+1. Use docker to run tests and make sure GPUs and RDMA can work inside
+   the container.
+
+I'll ommit section "Install GPU drivers" because we can find it easily
+somewhere else.
+
+### Install RDMA drivers
+
+For my case, I've got two machines with device
+"Mellanox Technologies MT27700 Family [ConnectX-4]" installed. The OS was
+"CentOS 7.4" and I updated the kernel to version 4.4 so that docker can
+work with latest overlay2 filesystem.
+
+***NOTE: before you start, make sure you have a way to get a console
+of the server other than ssh because we may need to re-configure the
+network device.***
+
+1. Go to http://www.mellanox.com/page/products_dyn?product_family=26,
+   download `MLNX_OFED` software in the bottom of the page, and upload it
+   onto the server.
+1. Run `./mlnxofedinstall --add-kernel-support` in the software package.
+1. Run `/etc/init.d/openibd restart` to make everything work, note that
+   this operation may cause the network goes down if you are using this
+   RDMA device as default network device and use ssh to login the server.
+1. Re-configure the network interface, for example:
+   `ifconfig eth2 192.168.16.30/20 up`, then add routes if needed:
+   `ip route add default via 192.168.16.1 dev eth2`.
+1. Do the same thing on the other node.
+1. Use `ping` to test if the two nodes have typical ICMP connection.
+1. Use either `udaddy` or `ib_write_bw` to test the network connection is
+   ready and have the desired bandwith.
+
+### Prepare Docker Image to Run RDMA Programs
+
+1. Build a docker image using cuda base image like: `nvidia/cuda:8.0-cudnn5-devel-ubuntu16.04` and install paddlepaddle whl
+   package in it.
+1. Start a docker container and mount GPU driver libs into it (you can
+   skip this step if you are using nvidia-docker).
+1. Mount RDMA dirvers and libs into the docker image (see below section),
+   also `udaddy` and `ib_write_bw` if needed.
+1. Mount GPU devices and RDMA devices into the container using `--device`
+   or just use privileged mode `--privileged`.
+1. Start the container using host network mode: `--net=host`
+
+### RDMA Library Files Needed
+
+Usually, `MLNX_OFED` install latest supported libs under
+`/usr/lib64/mlnx_ofed/valgrind`. Other libs also needed to run RDMA programs
+is listed below. These libs must be mounted into the docker container.
+
+* Libs under `/usr/lib64/mlnx_ofed/valgrind`
+  * libibcm.so
+  * libibverbs.so
+  * libmlx4.so
+  * libmlx5.so
+  * libmlx5-rdmav2.so
+  * librdmacm.so
+* Other libs:
+  * libnl-3.so.200
+  * libnl-route-3.so.200
+  * libnuma.so.1
+
+## Start to Run the Training Job
+
+Setting NCCL environment variables to turn NCCL switches on and off:
+
+
+| Env Name | Description |
+| --- | --- |
+| NCCL_SOCKET_IFNAME | The RDMA device, e.g. eth2 |
+| NCCL_P2P_DISABLE | Set to 1 to disable P2P transfer between GPUs |
+| NCCL_IB_DISABLE | Set to 1 to disable using RDMA |
+| NCCL_IB_CUDA_SUPPORT | Set to 1 to enable GPU Direct if supported |
+| NCCL_DEBUG | Set debug level: VERSION, WARN, INFO |
+
+My two servers are: `192.168.16.30,192.168.16.34`, On node 1, Run :
+
+```bash
+PADDLE_TRAINER_ID=0 PADDLE_PORT=48372 PADDLE_WORKERS=192.168.16.30,192.168.16.34 POD_IP=192.168.16.30 stdbuf -oL python vgg16.py
+```
+
+On node 2, Run:
+
+```bash
+PADDLE_TRAINER_ID=1 PADDLE_PORT=48372 PADDLE_WORKERS=192.168.16.30,192.168.16.34 POD_IP=192.168.16.34 stdbuf -oL python vgg16.py
+```
diff --git a/doc/fluid/howto/index_cn.rst b/doc/fluid/howto/index_cn.rst
index 97aeaf167d329529f2b120b5a3d4085e0510fe16..b57af64f44da82926c4862578f3072960ca5aa92 100644
--- a/doc/fluid/howto/index_cn.rst
+++ b/doc/fluid/howto/index_cn.rst
@@ -3,5 +3,6 @@
 
 .. toctree::
   :maxdepth: 1
-  
+
+  inference/index_cn.rst
   optimization/index_cn.rst
diff --git a/doc/fluid/howto/inference/build_and_install_lib_cn.rst b/doc/fluid/howto/inference/build_and_install_lib_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..c8d9992fcc92c25f8c14f71c79bde9f79fd92b1f
--- /dev/null
+++ b/doc/fluid/howto/inference/build_and_install_lib_cn.rst
@@ -0,0 +1,96 @@
+安装与编译C++预测库
+===========================
+
+直接下载安装
+-------------
+
+======================   ========================================
+版本说明                            C++预测库   
+======================   ========================================
+cpu_avx_mkl              `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/fluid.tgz>`_ 
+cpu_avx_openblas         `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/fluid.tgz>`_
+cpu_noavx_openblas       `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/fluid.tgz>`_
+cuda7.5_cudnn5_avx_mkl   `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/fluid.tgz>`_
+cuda8.0_cudnn5_avx_mkl   `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/fluid.tgz>`_
+cuda8.0_cudnn7_avx_mkl   `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/fluid.tgz>`_
+======================   ========================================
+
+从源码编译
+----------
+用户也可以从 PaddlePaddle 核心代码编译C++预测库，只需在编译时配制下面这些编译选项：
+
+=================   =========
+选项                 值   
+=================   =========
+CMAKE_BUILD_TYPE    Release
+FLUID_INSTALL_DIR   安装路径    
+WITH_FLUID_ONLY     ON（推荐）
+WITH_SWIG_PY        OFF（推荐
+WITH_PYTHON         OFF（推荐）
+WITH_GPU            ON/OFF
+WITH_MKL            ON/OFF
+=================   =========
+
+建议按照推荐值设置，以避免链接不必要的库。其它可选编译选项按需进行设定。
+
+下面的代码片段从github拉取最新代码，配制编译选项（需要将PADDLE_ROOT替换为PaddlePaddle预测库的安装路径）：
+
+  .. code-block:: bash
+
+     pip install paddlepaddle-gpu
+     PADDLE_ROOT=/path/of/capi
+     git clone https://github.com/PaddlePaddle/Paddle.git
+     cd Paddle
+     mkdir build
+     cd build
+     cmake -DFLUID_INSTALL_DIR=$PADDLE_ROOT \
+           -DCMAKE_BUILD_TYPE=Release \
+           -DWITH_FLUID_ONLY=ON \
+           -DWITH_SWIG_PY=OFF \
+           -DWITH_PYTHON=OFF \
+           -DWITH_MKL=OFF \
+           -DWITH_GPU=OFF  \
+           ..
+      make
+      make inference_lib_dist
+
+成功编译后，使用C++预测库所需的依赖（包括：（1）编译出的PaddlePaddle预测库和头文件；（2）第三方链接库和头文件；（3）版本信息与编译选项信息）
+均会存放于PADDLE_ROOT目录中。目录结构如下：
+
+  .. code-block:: text
+
+     PaddleRoot/
+     ├── CMakeCache.txt
+     ├── paddle
+     │   └── fluid
+     │       ├── framework
+     │       ├── inference
+     │       ├── memory
+     │       ├── platform
+     │       ├── pybind
+     │       └── string
+     ├── third_party
+     │   ├── boost
+     │   │   └── boost
+     │   ├── eigen3
+     │   │   ├── Eigen
+     │   │   └── unsupported
+     │   └── install
+     │       ├── gflags
+     │       ├── glog
+     │       ├── mklml
+     │       ├── protobuf
+     │       ├── snappy
+     │       ├── snappystream
+     │       └── zlib
+     └── version.txt
+     
+version.txt 中记录了该预测库的版本信息，包括Git Commit ID、使用OpenBlas或MKL数学库、CUDA/CUDNN版本号，如：
+
+  .. code-block:: text
+
+     GIT COMMIT ID: c95cd4742f02bb009e651a00b07b21c979637dc8
+     WITH_MKL: ON
+     WITH_GPU: ON
+     CUDA version: 8.0
+     CUDNN version: v5
diff --git a/doc/fluid/howto/inference/index_cn.rst b/doc/fluid/howto/inference/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..a903423548decd0992bf19772fb2cb143f6a12b5
--- /dev/null
+++ b/doc/fluid/howto/inference/index_cn.rst
@@ -0,0 +1,8 @@
+预测库
+------------
+
+.. toctree::
+  :maxdepth: 1
+
+  build_and_install_lib_cn.rst
+  inference_support_in_fluid_cn.md
diff --git a/doc/fluid/howto/inference/inference_support_in_fluid_cn.md b/doc/fluid/howto/inference/inference_support_in_fluid_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..309b17fccd5c461c9c22beb64eb4c6792b7e4a7a
--- /dev/null
+++ b/doc/fluid/howto/inference/inference_support_in_fluid_cn.md
@@ -0,0 +1,304 @@
+# 使用指南
+
+## 目录：
+
+- Python Inference API
+- Inference C++ API
+- Inference实例
+- Inference计算优化
+
+## Python Inference API **[改进中]**
+- 保存Inference模型 ([链接](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/io.py#L295))
+
+  ```python
+  def save_inference_model(dirname,
+                           feeded_var_names,
+                           target_vars,
+                           executor,
+                           main_program=None,
+                           model_filename=None,
+                           params_filename=None):
+  ```
+  Inference模型和参数将会保存到`dirname`目录下：
+  - 序列化的模型
+    - `model_filename`为`None`，保存到`dirname/__model__`
+    - `model_filename`非`None`，保存到`dirname/model_filename`
+  - 参数
+    - `params_filename`为`None`，单独保存到各个独立的文件，各文件以参数变量的名字命名
+    - `params_filename`非`None`，保存到`dirname/params_filename`
+
+- 两种存储格式
+  - 参数保存到各个独立的文件
+    - 如，设置`model_filename`为`None`、`params_filename`为`None`
+
+    ```bash
+    $ cd recognize_digits_conv.inference.model
+    $ ls
+    $ __model__ batch_norm_1.w_0 batch_norm_1.w_2 conv2d_2.w_0 conv2d_3.w_0 fc_1.w_0 batch_norm_1.b_0 batch_norm_1.w_1 conv2d_2.b_0 conv2d_3.b_0 fc_1.b_0
+    ```
+  - 参数保存到同一个文件
+    - 如，设置`model_filename`为`None`、`params_filename`为`__params__`
+
+    ```bash
+    $ cd recognize_digits_conv.inference.model
+    $ ls
+    $ __model__ __params__
+    ```
+- 加载Inference模型([链接](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/io.py#L380))
+  ```python
+  def load_inference_model(dirname,
+                           executor,
+                           model_filename=None,
+                           params_filename=None):
+    ...
+    return [program, feed_target_names, fetch_targets]
+  ```
+
+## 链接Fluid Inference库
+- 示例项目([链接](https://github.com/luotao1/fluid_inference_example.git))
+
+  - GCC配置
+    ```bash
+    $ g++ -o a.out -std=c++11 main.cc \
+          -I${PADDLE_ROOT}/ \
+          -I${PADDLE_ROOT}/third_party/install/gflags/include \
+          -I${PADDLE_ROOT}/third_party/install/glog/include \
+          -I${PADDLE_ROOT}/third_party/install/protobuf/include \
+          -I${PADDLE_ROOT}/third_party/eigen3 \
+          -L${PADDLE_ROOT}/paddle/fluid/inference -lpaddle_fluid \
+          -lrt -ldl -lpthread
+    ```
+
+  - CMake配置
+    ```cmake
+    include_directories(${PADDLE_ROOT}/)
+    include_directories(${PADDLE_ROOT}/third_party/install/gflags/include)
+    include_directories(${PADDLE_ROOT}/third_party/install/glog/include)
+    include_directories(${PADDLE_ROOT}/third_party/install/protobuf/include)
+    include_directories(${PADDLE_ROOT}/third_party/eigen3)
+    target_link_libraries(${TARGET_NAME}
+                          ${PADDLE_ROOT}/paddle/fluid/inference/libpaddle_fluid.so
+                          -lrt -ldl -lpthread)
+    ```
+
+  - 设置环境变量：
+  `export LD_LIBRARY_PATH=${PADDLE_ROOT}/paddle/fluid/inference:$LD_LIBRARY_PATH`
+
+
+
+## C++ Inference API
+
+- 推断流程([链接](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/tests/test_helper.h#L91))
+
+  - 1、 初始化设备
+    ```cpp
+    #include "paddle/fluid/framework/init.h"
+    paddle::framework::InitDevices(false);
+    ```
+
+  - 2、 定义place，executor，scope
+    ```cpp
+    auto place = paddle::platform::CPUPlace();
+    auto executor = paddle::framework::Executor(place);
+    auto* scope = new paddle::framework::Scope();
+    ```
+
+  - 3、 加载模型
+    ```cpp
+    #include "paddle/fluid/inference/io.h"
+    auto inference_program = paddle::inference::Load(executor, *scope, dirname);
+    // or
+    auto inference_program = paddle::inference::Load(executor,
+                                                     *scope,
+                                                     dirname + "/" + model_filename,
+                                                     dirname + "/" + params_filename);
+    ```
+
+  - 4、 获取`feed_target_names`和`fetch_target_names`
+    ```cpp
+    const std::vector<std::string>& feed_target_names = inference_program->GetFeedTargetNames();
+    const std::vector<std::string>& fetch_target_names = inference_program->GetFetchTargetNames();
+    ```
+
+  - 5、 准备`feed`数据
+    ```cpp
+    #include "paddle/fluid/framework/lod_tensor.h"
+    std::vector<paddle::framework::LoDTensor*> cpu_feeds;
+    ...
+    std::map<std::string, const paddle::framework::LoDTensor*> feed_targets;
+    for (size_t i = 0; i < feed_target_names.size(); ++i) {
+      // Please make sure that cpu_feeds[i] is right for feed_target_names[i]
+      feed_targets[feed_target_names[i]] = cpu_feeds[i];
+    }
+    ```
+
+  - 6、 定义`Tensor`来`fetch`结果
+    ```cpp
+    std::vector<paddle::framework::LoDTensor*> cpu_fetchs;
+    std::map<std::string, paddle::framework::LoDTensor*> fetch_targets;
+    for (size_t i = 0; i < fetch_target_names.size(); ++i) {
+      fetch_targets[fetch_target_names[i]] = cpu_fetchs[i];
+    }
+    ```
+
+  - 7、 执行`inference_program`
+    ```cpp
+    executor.Run(*inference_program, scope, feed_targets, fetch_targets);
+    ```
+
+  - 8、 使用`fetch`数据
+    ```cpp
+    for (size_t i = 0; i < cpu_fetchs.size(); ++i) {
+      std::cout << "lod_i: " << cpu_fetchs[i]->lod();
+      std::cout << "dims_i: " << cpu_fetchs[i]->dims();
+      std::cout << "result:";
+      float* output_ptr = cpu_fetchs[i]->data<float>();
+      for (int j = 0; j < cpu_fetchs[i]->numel(); ++j) {
+        std::cout << " " << output_ptr[j];
+      }
+      std::cout << std::endl;
+    }
+    ```
+    针对不同的数据，4. - 8.可执行多次。
+
+  - 9、 释放内存
+    ```cpp
+    delete scope;
+    ```
+
+
+- 接口说明
+
+  ```cpp
+  void Run(const ProgramDesc& program, Scope* scope,
+           std::map<std::string, const LoDTensor*>& feed_targets,
+           std::map<std::string, LoDTensor*>& fetch_targets,
+           bool create_vars = true,
+           const std::string& feed_holder_name = "feed",
+           const std::string& fetch_holder_name = "fetch");
+  ```
+  - 使用Python API `save_inference_model`保存的`program`里面包含了`feed_op`和`fetch_op`，用户提供的`feed_targets`、`fetch_targets`必须和`inference_program`中的`feed_op`、`fetch_op`保持一致。
+  - 用户提供的`feed_holder_name`和`fetch_holder_name`也必须和`inference_program`中`feed_op`、`fetch_op`保持一致，可使用`SetFeedHolderName`和`SetFetchHolderName`接口重新设置`inferece_program`
+  - 默认情况下，除了`persistable`属性设置为`True`的`Variable`之外，每次执行`executor.Run`会创建一个局部`Scope`，并且在这个局部`Scope`中创建和销毁所有的`Variable`，以最小化空闲时的内存占用。
+  - `persistable`属性为`True`的`Variable`有：
+    - Operators的参数`w`、`b`等
+    - `feed_op`的输入变量
+    - `fetch_op`的输出变量
+
+
+- **不在每次执行时创建和销毁变量
+ ([PR](https://github.com/PaddlePaddle/Paddle/pull/9301))**
+  - 执行`inference_program`
+    ```cpp
+    // Call once
+    executor.CreateVariables(*inference_program, scope, 0);
+    // Call as many times as you like
+    executor.Run(
+        *inference_program, scope, feed_targets, fetch_targets, false);
+    ```
+  - **优点**
+    - 节省了频繁创建、销毁变量的时间（约占每次`Run`总时间的1% ~ 12%）
+    - 执行结束后可获取所有Operators的计算结果
+  - **缺点**
+    - 空闲时也会占用大量的内存
+    - 在同一个`Scope`中，相同的变量名是公用同一块内存的，容易引起意想不到的错误
+
+
+- **不在每次执行时创建Op([PR](https://github.com/PaddlePaddle/Paddle/pull/9630))**
+  - 执行`inference_program`
+    ```cpp
+    // Call once
+    auto ctx = executor.Prepare(*inference_program, 0);
+    // Call as many times as you like if you have no need to change the inference_program
+    executor.RunPreparedContext(ctx.get(), scope, feed_targets, fetch_targets);
+    ```
+  - **优点**
+    - 节省了频繁创建、销毁Op的时间
+  - **缺点**
+    - 一旦修改了`inference_program`，则需要重新创建`ctx`
+
+
+- **多线程共享Parameters([链接](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/tests/test_multi_thread_helper.h))**
+  - 主线程
+    - 1、 初始化设备
+    - 2、 定义`place`，`executor`，`scope`
+    - 3、 加载模型，得到`inference_program`
+  - 从线程
+    - **复制`inference_program`得到`copy_program`，修改`copy_program`的`feed_holder_name`和`fetch_holder_name`**
+      ```cpp
+      auto copy_program = std::unique_ptr<paddle::framework::ProgramDesc>(
+                 new paddle::framework::ProgramDesc(*inference_program));
+      std::string feed_holder_name = "feed_" + paddle::string::to_string(thread_id);
+      std::string fetch_holder_name = "fetch_" + paddle::string::to_string(thread_id);
+      copy_program->SetFeedHolderName(feed_holder_name);
+      copy_program->SetFetchHolderName(fetch_holder_name);
+      ```
+    - 4、 获取`copy_program`的`feed_target_names`和`fetch_target_names`
+    - 5、 准备feed数据，定义Tensor来fetch结果
+    - 6、 执行`copy_program`
+      ```cpp
+      executor->Run(*copy_program, scope, feed_targets, fetch_targets, true, feed_holder_name, fetch_holder_name);
+      ```
+    - 7、 使用fetch数据
+  - 主线程
+    - 8、 释放资源
+
+
+- 基本概念
+  - 数据相关：
+    - [Tensor](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/tensor.md)，一个N维数组，数据可以是任意类型（int，float，double等）
+    - [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/lod_tensor.md)，带LoD(Level-of-Detail)即序列信息的Tensor
+    - [Scope](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/scope.md)，记录了变量Variable
+  - 执行相关：
+    - [Executor](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/executor.md)，无状态执行器，只跟设备相关
+    - Place
+      - CPUPlace，CPU设备
+      - CUDAPlace，CUDA GPU设备
+  - 神经网络表示：
+    - [Program](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/program.md).
+
+    详细介绍请参考[**Paddle Fluid开发者指南**](https://github.com/lcy-seso/learning_notes/blob/master/Fluid/developer's_guid_for_Fluid/Developer's_Guide_to_Paddle_Fluid.md)
+
+
+
+## Inference实例
+
+  1. fit a line: [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_fit_a_line.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/tests/book/test_inference_fit_a_line.cc)
+  1. image classification: [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_image_classification.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/tests/book/test_inference_image_classification.cc)
+  1. label semantic roles: [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_label_semantic_roles.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/tests/book/test_inference_label_semantic_roles.cc)
+  1. recognize digits: [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_recognize_digits.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/tests/book/test_inference_recognize_digits.cc)
+  1. recommender system: [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_recommender_system.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/tests/book/test_inference_recommender_system.cc)
+  1. understand sentiment: [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_understand_sentiment.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/tests/book/test_inference_understand_sentiment.cc)
+  1. word2vec: [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_word2vec.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/tests/book/test_inference_word2vec.cc)
+
+
+## Inference计算优化
+- 使用Python推理优化工具([inference_transpiler](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/inference_transpiler.py))
+  ```python
+  class InferenceTranspiler:
+    def transpile(self, program, place, scope=None):
+        ...
+        if scope is None:
+            scope = global_scope()
+        ...
+  ```
+  - 使用`InferenceTranspiler`将会直接修改`program`。
+  - 使用`InferenceTranspiler`会修改参数的值，请确保`program`的参数在`scope`内。
+- 支持的优化
+  - 融合batch_norm op的计算
+- 使用示例([链接](https://github.com/Xreki/Xreki.github.io/blob/master/fluid/inference/inference_transpiler.py))
+  ```python
+  import paddle.fluid as fluid
+  # NOTE: Applying the inference transpiler will change the inference_program.
+  t = fluid.InferenceTranspiler()
+  t.transpile(inference_program, place, inference_scope)
+  ```
+
+
+
+
+## 内存使用优化
+- 使用Python内存优化工具([memory_optimization_transipiler](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/memory_optimization_transpiler.py))
+  ```python
+  fluid.memory_optimize(inference_program)
+  ```
diff --git a/doc/fluid/howto/optimization/benchmark/README.md b/doc/fluid/howto/optimization/benchmark/README.md
deleted file mode 120000
index db30af7f53231c687f9ad61ad961a685733cbad0..0000000000000000000000000000000000000000
--- a/doc/fluid/howto/optimization/benchmark/README.md
+++ /dev/null
@@ -1 +0,0 @@
-../../../../../benchmark/cluster/README.md
\ No newline at end of file
diff --git a/doc/fluid/howto/optimization/benchmark/vgg16/README.md b/doc/fluid/howto/optimization/benchmark/vgg16/README.md
deleted file mode 120000
index ca963ef5f06aa0c2fe507ba7548dca8017358120..0000000000000000000000000000000000000000
--- a/doc/fluid/howto/optimization/benchmark/vgg16/README.md
+++ /dev/null
@@ -1 +0,0 @@
-../../../../../../benchmark/cluster/vgg16/README.md
\ No newline at end of file
diff --git a/doc/fluid/howto/optimization/cpu_profiling_cn.md b/doc/fluid/howto/optimization/cpu_profiling_cn.md
index 8266dec3c6125a09b90ac0ccd4aa5464f5c7db31..198a05a79e19227e90eaafe116217a164cd51a7d 100644
--- a/doc/fluid/howto/optimization/cpu_profiling_cn.md
+++ b/doc/fluid/howto/optimization/cpu_profiling_cn.md
@@ -1,3 +1,5 @@
+# CPU性能调优
+
 此教程会介绍如何使用Python的cProfile包、Python库yep、Google perftools来进行性能分析 (profiling) 与调优（performance tuning）。
 
 Profling 指发现性能瓶颈。系统中的瓶颈可能和程序员开发过程中想象的瓶颈相去甚远。Tuning 指消除瓶颈。性能优化的过程通常是不断重复地 profiling 和 tuning。
@@ -8,7 +10,7 @@ PaddlePaddle 用户一般通过调用 Python API 编写深度学习程序。大
 * Python 与 C++ 混合代码的性能分析
 
 
-# Python代码的性能分析
+## Python代码的性能分析
 
 ### 生成性能分析文件
 
diff --git a/doc/fluid/howto/optimization/cpu_profiling_en.md b/doc/fluid/howto/optimization/cpu_profiling_en.md
index e95556dd608b7ff0a3eb18873df0015a2da94e7c..216694965b3c878a8a5f3ccd2a0cba8d21d9ce05 100644
--- a/doc/fluid/howto/optimization/cpu_profiling_en.md
+++ b/doc/fluid/howto/optimization/cpu_profiling_en.md
@@ -1,3 +1,5 @@
+# Tune CPU performance
+
 This tutorial introduces techniques we use to profile and tune the
 CPU performance of PaddlePaddle.  We will use Python packages
 `cProfile` and `yep`, and Google's `perftools`.
@@ -14,7 +16,7 @@ the profiling and tuning of
 1. the Python code and
 1. the mixture of Python and C++ code.
 
-# Profiling the Python Code
+## Profiling the Python Code
 
 ### Generate the Performance Profiling File
 
diff --git a/doc/fluid/howto/optimization/host_memory_profiling_cn.md b/doc/fluid/howto/optimization/host_memory_profiling_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..9b55a66ded8b48f7105c05f1462839a72ab5f904
--- /dev/null
+++ b/doc/fluid/howto/optimization/host_memory_profiling_cn.md
@@ -0,0 +1,89 @@
+## 堆内存分析和优化
+
+计算机程序都可能有内存泄漏的风险。**内存泄漏**一般是由于程序在堆(heap)上分配了内存而没有释放，随着程序的运行占用的内存越来越大，一方面会影响程序的稳定性，可能让运行速度越来越慢，或者造成oom，甚至会影响运行程序的机器的稳定性，造成宕机。
+
+
+目前有很多内存泄漏分析工具，比较经典的有[valgrind](http://valgrind.org/docs/manual/quick-start.html#quick-start.intro), [gperftools](https://gperftools.github.io/gperftools/)。
+
+因为Fluid是用Python驱动C++ core来运行，valgrind直接分析非常困难，需要自己编译debug版本的、带valgrind支持的专用Python版本，而且输出的信息中大部分是Python自己的符号和调用信息，分析起来很困难，另外使用valgrind会让程序运行速度变得非常慢，所以不建议使用。
+
+本教程主要介绍[gperftools](https://gperftools.github.io/gperftools/)的使用。
+
+gperftool主要支持以下四个功能：
+
+- thread-caching malloc
+- heap-checking using tcmalloc
+- heap-profiling using tcmalloc
+- CPU profiler
+
+Paddle也提供了基于gperftool的[CPU性能分析教程](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/howto/optimization/cpu_profiling_cn.md)。
+
+对于堆内存的分析，主要用到thread-caching malloc和heap-profiling using tcmalloc。
+
+## 使用流程
+#### 环境
+本教程基于paddle提供的Docker开发环境paddlepaddle/paddle:latest-dev，基于Ubuntu 16.04.4 LTS环境。
+
+#### 使用流程
+
+- 安装google-perftools
+
+```
+apt-get install libunwind-dev 
+apt-get install google-perftools
+```
+
+- 安装pprof
+
+```
+go get -u github.com/google/pprof
+```
+
+- 设置运行环境
+
+```
+export PPROF_PATH=/root/gopath/bin/pprof
+export PPROF_BINARY_PATH=/root/gopath/bin/pprof
+export LD_PRELOAD=/usr/lib/libtcmalloc.so.4
+```
+
+- 使用heap profile来运行python程序。本质上是周期性的对堆的分配情况做一次快照。
+
+```
+# HEAPPROFILE 设置生成的堆分析文件的目录和文件前缀
+# HEAP_PROFILE_ALLOCATION_INTERVAL 设置每分配多少存储dump一次dump，默认1GB
+env HEAPPROFILE="./perf_log/test.log" HEAP_PROFILE_ALLOCATION_INTERVAL=209715200 python trainer.py
+```
+
+随着程序的运行，会在perf_log这个文件夹下生成很多文件，如下：
+
+```
+-rw-r--r-- 1 root root 1.0M Jun  1 15:00 test.log.0001.heap
+-rw-r--r-- 1 root root 1.0M Jun  1 15:00 test.log.0002.heap
+-rw-r--r-- 1 root root 1.0M Jun  1 15:00 test.log.0003.heap
+-rw-r--r-- 1 root root 1.0M Jun  1 15:00 test.log.0004.heap
+-rw-r--r-- 1 root root 1.0M Jun  1 15:00 test.log.0005.heap
+-rw-r--r-- 1 root root 1.0M Jun  1 15:00 test.log.0006.heap
+```
+
+- 使用pprof对heap文件进行分析。分析有两种模式：
+	- 完整模式。会对当前heap做一个分析，显示目前分配内存一些调用路径。
+
+	```
+	pprof --pdf python test.log.0012.heap
+	```
+	上述命令会生成一个profile00x.pdf的文件，可以直接打开，例如：[memory_cpu_allocator](https://github.com/jacquesqiao/Paddle/blob/bd2ea0e1f84bb6522a66d44a072598153634cade/doc/fluid/howto/optimization/memory_cpu_allocator.pdf)。从下图可以看出，在CPU版本fluid的运行过程中，分配存储最多的模块式CPUAllocator. 而别的模块相对而言分配内存较少，所以被忽略了，这对于分配内存泄漏是很不方便的，因为泄漏是一个缓慢的过程，在这种图中是无法看到的。
+	
+	![result](https://user-images.githubusercontent.com/3048612/40964027-a54033e4-68dc-11e8-836a-144910c4bb8c.png)
+	
+	- Diff模式。可以对两个时刻的heap做diff，把一些内存分配没有发生变化的模块去掉，而把增量部分显示出来。
+	```
+	pprof --pdf --base test.log.0010.heap python test.log.1045.heap
+	```
+	生成的结果为：[`memory_leak_protobuf`](https://github.com/jacquesqiao/Paddle/blob/bd2ea0e1f84bb6522a66d44a072598153634cade/doc/fluid/howto/optimization/memory_leak_protobuf.pdf)
+	
+	从图中可以看出：ProgramDesc这个结构，在两个版本之间增长了200MB+，所以这里有很大的内存泄漏的可能性，最终结果也确实证明是这里造成了泄漏。
+	
+	![result](https://user-images.githubusercontent.com/3048612/40964057-b434d5e4-68dc-11e8-894b-8ab62bcf26c2.png)
+	![result](https://user-images.githubusercontent.com/3048612/40964063-b7dbee44-68dc-11e8-9719-da279f86477f.png)
+	
diff --git a/doc/fluid/images/1.png b/doc/fluid/images/1.png
new file mode 100644
index 0000000000000000000000000000000000000000..67daf566f91aab570e60971c4ea8e2be876e214d
Binary files /dev/null and b/doc/fluid/images/1.png differ
diff --git a/doc/fluid/images/2.png b/doc/fluid/images/2.png
new file mode 100644
index 0000000000000000000000000000000000000000..43367777f41449a666e7a3b571f09ac5d5dfb1ae
Binary files /dev/null and b/doc/fluid/images/2.png differ
diff --git a/doc/fluid/images/3.png b/doc/fluid/images/3.png
new file mode 100644
index 0000000000000000000000000000000000000000..481021ef306e2596818aab7fe17a570754f63635
Binary files /dev/null and b/doc/fluid/images/3.png differ
diff --git a/doc/fluid/images/4.png b/doc/fluid/images/4.png
new file mode 100644
index 0000000000000000000000000000000000000000..4279f41e06de459f18b9a622539511d555e9a0af
Binary files /dev/null and b/doc/fluid/images/4.png differ
diff --git a/doc/fluid/images/LoDTensor.png b/doc/fluid/images/LoDTensor.png
new file mode 100644
index 0000000000000000000000000000000000000000..75369f5378309e0f304b83f6bb69bdb195eac079
Binary files /dev/null and b/doc/fluid/images/LoDTensor.png differ
diff --git a/doc/fluid/images/compile_run_time.png b/doc/fluid/images/compile_run_time.png
new file mode 100644
index 0000000000000000000000000000000000000000..0bc9b2fd0e81b4851e6d96171ccb9a05d0f42a48
Binary files /dev/null and b/doc/fluid/images/compile_run_time.png differ
diff --git a/doc/fluid/images/executor.png b/doc/fluid/images/executor.png
new file mode 100644
index 0000000000000000000000000000000000000000..b29c0d779e3d46b779b5baeabe3176adaeb00a6d
Binary files /dev/null and b/doc/fluid/images/executor.png differ
diff --git a/doc/fluid/images/fluid_examples.png b/doc/fluid/images/fluid_examples.png
new file mode 100644
index 0000000000000000000000000000000000000000..aa99472c0f914cde128fd7b3bd8dc29ac24f94b6
Binary files /dev/null and b/doc/fluid/images/fluid_examples.png differ
diff --git a/doc/fluid/images/fluid_module_1.png b/doc/fluid/images/fluid_module_1.png
new file mode 100644
index 0000000000000000000000000000000000000000..554782ba54e43efc3d6babbb94e3cac3530ac649
Binary files /dev/null and b/doc/fluid/images/fluid_module_1.png differ
diff --git a/doc/fluid/images/fluid_module_2.png b/doc/fluid/images/fluid_module_2.png
new file mode 100644
index 0000000000000000000000000000000000000000..4219efccbb1e87839adf6b5720fe46808b7d2fcf
Binary files /dev/null and b/doc/fluid/images/fluid_module_2.png differ
diff --git a/doc/fluid/images/layer.png b/doc/fluid/images/layer.png
new file mode 100644
index 0000000000000000000000000000000000000000..e46db4c9c6f5b65ff274b498b716b11de343a8b0
Binary files /dev/null and b/doc/fluid/images/layer.png differ
diff --git a/doc/fluid/images/op.dot b/doc/fluid/images/op.dot
new file mode 100644
index 0000000000000000000000000000000000000000..c8ad839cb88788e9b5906402257cc7bbc3ddcb54
--- /dev/null
+++ b/doc/fluid/images/op.dot
@@ -0,0 +1,4 @@
+digraph sample { 
+  graph [rankdir=TD]; node [shape=record];
+  op [label="{Operator| InferShape()=0\lRun()=0\l | map&#60;string, string[]&#62; inputs_\lmap&#60;string, string[]&#62; outputs_ \l AttributeMap attrs_\l}"]; 
+}
\ No newline at end of file
diff --git a/doc/fluid/images/op_op_with_kern_class_diagram.dot b/doc/fluid/images/op_op_with_kern_class_diagram.dot
new file mode 100644
index 0000000000000000000000000000000000000000..8f24e9ea83acf879c7008f2d97113c0a4cc111c3
--- /dev/null
+++ b/doc/fluid/images/op_op_with_kern_class_diagram.dot
@@ -0,0 +1,38 @@
+digraph sample { 
+  graph [rankdir=TD]; node [shape=record];
+  op [label="{Operator| InferShape()=0\lRun()=0\l | map&#60;string, string[]&#62; inputs_\lmap&#60;string, string[]&#62; outputs_ \l AttributeMap attrs_\l}"]; 
+  op_with_kern [label="{OpWithKernel | InferShape()=0\lRun()\l | map&#60;OpKernelKey,OpKernel&#62;kernels_ }"]
+  op_kernel [label="{OpKernel | Compute()=0}"]
+  op_kernel_key [label="{OpKernelKey| Place place\n...}"]
+
+  op -> op_with_kern [dir=back, arrowtail=onormal]
+  op_with_kern -> op_kernel [arrowhead=vee, label="contains many"]
+
+  {
+    rank=same;
+    op_with_kern
+    op_kernel
+  }
+
+  op_kernel -> op_kernel_key [style=invis]
+
+  {
+    rank=same;
+    op_kernel
+    op_kernel_key
+  }
+
+  op_with_kern -> op_kernel_key [arrowhead=vee, label ="\nas map key"]
+
+  mul_op [label="MulOp"]
+  op_with_kern -> mul_op [dir=back, arrowtail=onormal]
+  mul_kernel [label="template &#60;typename Place&#62;\lclass MulOpKernel\l"]
+  op_kernel -> mul_kernel [dir=back, arrowtail=onormal]
+  mul_op -> mul_kernel [arrowhead=vee, label="register many"]
+  
+  {
+    rank=same;
+    mul_op;
+    mul_kernel;
+  }
+}
\ No newline at end of file
diff --git a/doc/fluid/images/op_with_kernel.dot b/doc/fluid/images/op_with_kernel.dot
new file mode 100644
index 0000000000000000000000000000000000000000..4f5af4f7b5f5a69693a058c99eb658900136077a
--- /dev/null
+++ b/doc/fluid/images/op_with_kernel.dot
@@ -0,0 +1,26 @@
+digraph sample { 
+  graph [rankdir=TD]; node [shape=record];
+  op [label="{Operator}"]; 
+  op_with_kern [label="{OpWithKernel | InferShape()=0\lRun()\l | map&#60;OpKernelKey,OpKernel&#62;kernels_ }"]
+  op_kernel [label="{OpKernel | Compute()=0}"]
+  op_kernel_key [label="{OpKernelKey| Place place\n...}"]
+
+  op -> op_with_kern [dir=back, arrowtail=onormal]
+  op_with_kern -> op_kernel [arrowhead=vee, label="contains many"]
+
+  {
+    rank=same;
+    op_with_kern
+    op_kernel
+  }
+
+  op_kernel -> op_kernel_key [style=invis]
+
+  {
+    rank=same;
+    op_kernel
+    op_kernel_key
+  }
+
+  op_with_kern -> op_kernel_key [arrowhead=vee, label ="\nas map key"]
+}
\ No newline at end of file
diff --git a/doc/fluid/images/operator1.png b/doc/fluid/images/operator1.png
new file mode 100644
index 0000000000000000000000000000000000000000..3975b06f615b7a88dfc11e71b6451fdf4ce42d60
Binary files /dev/null and b/doc/fluid/images/operator1.png differ
diff --git a/doc/fluid/images/operator2.png b/doc/fluid/images/operator2.png
new file mode 100644
index 0000000000000000000000000000000000000000..b7bb1fae2050d3a70797517bc20dbbdef3dfcb7c
Binary files /dev/null and b/doc/fluid/images/operator2.png differ
diff --git a/doc/fluid/images/place.png b/doc/fluid/images/place.png
new file mode 100644
index 0000000000000000000000000000000000000000..14e77511d639af155e5a3725cde05323e0cc94f2
Binary files /dev/null and b/doc/fluid/images/place.png differ
diff --git a/doc/fluid/images/print_fluid_program.png b/doc/fluid/images/print_fluid_program.png
new file mode 100644
index 0000000000000000000000000000000000000000..e8e459e1b3d5c8706b3caa05dc371db8d46df4a5
Binary files /dev/null and b/doc/fluid/images/print_fluid_program.png differ
diff --git a/doc/fluid/images/program_desc1.png b/doc/fluid/images/program_desc1.png
new file mode 100644
index 0000000000000000000000000000000000000000..0656336914ece957f2e5bb4d70ad337a63e31d88
Binary files /dev/null and b/doc/fluid/images/program_desc1.png differ
diff --git a/doc/fluid/images/program_desc2.png b/doc/fluid/images/program_desc2.png
new file mode 100644
index 0000000000000000000000000000000000000000..db5bfa1231345add8661b4f8ef0fc9d861f40d24
Binary files /dev/null and b/doc/fluid/images/program_desc2.png differ
diff --git a/doc/fluid/images/raw_input.png b/doc/fluid/images/raw_input.png
new file mode 100644
index 0000000000000000000000000000000000000000..0725f92d2b169c2b59ec7c68b402859c2a2dd1d8
Binary files /dev/null and b/doc/fluid/images/raw_input.png differ
diff --git a/doc/fluid/images/scope_variable_tensor.png b/doc/fluid/images/scope_variable_tensor.png
new file mode 100644
index 0000000000000000000000000000000000000000..59b0de6fb36f9f6b469227c05760a7612bb30b4d
Binary files /dev/null and b/doc/fluid/images/scope_variable_tensor.png differ
diff --git a/doc/fluid/images/sorted_input.png b/doc/fluid/images/sorted_input.png
new file mode 100644
index 0000000000000000000000000000000000000000..ff601128368ee179e3fd33e5e295a9ddd3dcbaeb
Binary files /dev/null and b/doc/fluid/images/sorted_input.png differ
diff --git a/doc/fluid/images/transpiler.png b/doc/fluid/images/transpiler.png
new file mode 100644
index 0000000000000000000000000000000000000000..422973c0dc7aa2b544d2fc86a97ace706388cb9e
Binary files /dev/null and b/doc/fluid/images/transpiler.png differ
diff --git a/doc/fluid/images/user_interface.png b/doc/fluid/images/user_interface.png
new file mode 100644
index 0000000000000000000000000000000000000000..ffc94e3d8945ec6291460afd90e8fcc600828390
Binary files /dev/null and b/doc/fluid/images/user_interface.png differ
diff --git a/doc/mobile/CMakeLists.txt b/doc/mobile/CMakeLists.txt
index b104a6318d474d6531670b8ac3569448774850c7..7b34ba8d0768427802b11614c6962f3c3f6ef4e3 100644
--- a/doc/mobile/CMakeLists.txt
+++ b/doc/mobile/CMakeLists.txt
@@ -15,6 +15,9 @@ set(SPHINX_CACHE_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_doctrees")
 # HTML output director
 set(SPHINX_HTML_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/html")
 
+set(IMPORT_PADDLE_STRING "")
+set(IMPORT_PADDLEV2_STRING "")
+
 configure_file(
         "${CMAKE_CURRENT_SOURCE_DIR}/../templates/conf.py.en.in"
         "${BINARY_BUILD_DIR_EN}/conf.py"
@@ -27,8 +30,6 @@ sphinx_add_target(paddle_mobile_docs
         ${CMAKE_CURRENT_SOURCE_DIR}
         ${SPHINX_HTML_DIR_EN})
 
-add_dependencies(paddle_mobile_docs gen_proto_py paddle_python)
-
 # configured documentation tools and intermediate build results
 set(BINARY_BUILD_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_build")
 
@@ -49,5 +50,3 @@ sphinx_add_target(paddle_mobile_docs_cn
         ${SPHINX_CACHE_DIR_CN}
         ${CMAKE_CURRENT_SOURCE_DIR}
         ${SPHINX_HTML_DIR_CN})
-
-add_dependencies(paddle_mobile_docs_cn gen_proto_py paddle_python)
diff --git a/doc/mobile/cross_compiling_for_android_cn.md b/doc/mobile/cross_compiling_for_android_cn.md
index cdd6917239371a660d0df05bb623f0b94f8f11a3..0607748b751e9f2d606236d9e98868335379b05c 100644
--- a/doc/mobile/cross_compiling_for_android_cn.md
+++ b/doc/mobile/cross_compiling_for_android_cn.md
@@ -63,16 +63,16 @@ Android的Docker开发镜像向用户提供两个可配置的参数：
 - 编译`armeabi-v7a`，`Android API 21`的PaddlePaddle库
 
 ```bash
-$ docker run -it --rm -v $PWD:/paddle -e "ANDROID_ABI=armeabi-v7a" -e "ANDROID_API=21" username/paddle-android:dev
+$ docker run -it --rm -v $PWD:/paddle -w /paddle -e "ANDROID_ABI=armeabi-v7a" -e "ANDROID_API=21" username/paddle-android:dev ./paddle/scripts/paddle_build.sh build_android
 ```
 
 - 编译`arm64-v8a`，`Android API 21`的PaddlePaddle库
 
 ```bash
-$ docker run -it --rm -v $PWD:/paddle -e "ANDROID_ABI=arm64-v8a" -e "ANDROID_API=21" username/paddle-android:dev
+$ docker run -it --rm -v $PWD:/paddle -w /paddle -e "ANDROID_ABI=arm64-v8a" -e "ANDROID_API=21" username/paddle-android:dev ./paddle/scripts/paddle_build.sh build_android
 ```
 
-执行上述`docker run`命令时，容器默认执行[paddle/scripts/docker/build_android.sh](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build_android.sh)脚本。该脚本中记录了交叉编译Android版PaddlePaddle库常用的CMake配置，并且会根据`ANDROID_ABI`和`ANDROID_API`自动构建独立工具链、进行编译和安装。由于arm64架构要求Android API不小于21。因此当`ANDROID_ABI=arm64-v8a`，`ANDROID_API<21`时，Docker容器中将默认使用`Android API 21`的编译工具链。用户可以参考下文[配置交叉编译参数](#配置交叉编译参数)章节，根据个人的需求修改定制Docker容器所执行的脚本。编译安装结束之后，PaddlePaddle的C-API库将被安装到`$PWD/install_android`目录，所依赖的第三方库同时也被安装到`$PWD/install_android/third_party`目录。
+执行上述`docker run`命令时，容器执行[paddle/scripts/paddle_build.sh build_android](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/paddle_build.sh)脚本。该脚本中记录了交叉编译Android版PaddlePaddle库常用的CMake配置，并且会根据`ANDROID_ABI`和`ANDROID_API`自动构建独立工具链、进行编译和安装。由于arm64架构要求Android API不小于21。因此当`ANDROID_ABI=arm64-v8a`，`ANDROID_API<21`时，Docker容器中将默认使用`Android API 21`的编译工具链。用户可以参考下文[配置交叉编译参数](#配置交叉编译参数)章节，根据个人的需求修改定制Docker容器所执行的脚本。编译安装结束之后，PaddlePaddle的C-API库将被安装到`$PWD/install_android`目录，所依赖的第三方库同时也被安装到`$PWD/install_android/third_party`目录。
 
 ## 基于Linux交叉编译环境的编译方式
 本文档将以Linux x86-64平台为例，介绍交叉编译Android平台上适用的PaddlePaddle库的方法和步骤。
diff --git a/doc/mobile/cross_compiling_for_android_en.md b/doc/mobile/cross_compiling_for_android_en.md
index 6af16fc114a2310e364023ec43cc3c64149af8f7..572063e8012efee2d2e142eb57e459e0e8c6382c 100644
--- a/doc/mobile/cross_compiling_for_android_en.md
+++ b/doc/mobile/cross_compiling_for_android_en.md
@@ -36,7 +36,7 @@ $ docker pull docker.paddlepaddlehub.com/paddle:latest-dev-android
 We can run the Docker image we just created to build the inference library of PaddlePaddle for Android using the command below:
 
 ```bash
-$ docker run -it --rm -v $PWD:/paddle -e "ANDROID_ABI=armeabi-v7a" -e "ANDROID_API=21" paddle:dev-android
+$ docker run -it --rm -v $PWD:/paddle -w /paddle -e "ANDROID_ABI=armeabi-v7a" -e "ANDROID_API=21" paddle:dev-android ./paddle/scripts/paddle_build.sh build_android
 ```
 
 The Docker image accepts two arguments `ANDROID_ABI` and `ANDROID_API`:
@@ -70,7 +70,7 @@ The Docker image accepts two arguments `ANDROID_ABI` and `ANDROID_API`:
 
 The ARM-64 architecture (`arm64-v8a`) requires at least level 21 of Android API.
 
-The default entry-point of the Docker image, [`paddle/scripts/docker/build_android.sh`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build_android.sh) generates the [Android cross-compiling standalone toolchain](https://developer.android.com/ndk/guides/standalone_toolchain.html) based on the argument: `ANDROID_ABI` or `ANDROID_API`.  For information about other configuration arguments, please continue reading.
+The build command, [`paddle/scripts/paddle_build.sh build_android`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/paddle_build.sh) generates the [Android cross-compiling standalone toolchain](https://developer.android.com/ndk/guides/standalone_toolchain.html) based on the argument: `ANDROID_ABI` or `ANDROID_API`.  For information about other configuration arguments, please continue reading.
 
 The above command generates and outputs the inference library in `$PWD/install_android` and puts third-party libraries in `$PWD/install_android/third_party`.
 
diff --git a/doc/mobile/index_cn.rst b/doc/mobile/index_cn.rst
index 8297316e8fbb2b8f41954030293feadbcd81295e..56d1515005f6e40b084c6b2184c6a0b3e3a00496 100644
--- a/doc/mobile/index_cn.rst
+++ b/doc/mobile/index_cn.rst
@@ -1,9 +1,9 @@
 移动端
-=====
+======
 
 ..  toctree::
   :maxdepth: 1
 
   cross_compiling_for_android_cn.md
   cross_compiling_for_ios_cn.md
-  cross_compiling_for_raspberry_cn.md
\ No newline at end of file
+  cross_compiling_for_raspberry_cn.md
diff --git a/doc/templates/conf.py.cn.in b/doc/templates/conf.py.cn.in
index 76b82fd97f1ed642696c4414676b694ebda9ad81..890f70615538af23cd05b9ffd685e870a5644cdb 100644
--- a/doc/templates/conf.py.cn.in
+++ b/doc/templates/conf.py.cn.in
@@ -16,8 +16,8 @@ import os, subprocess
 sys.path.insert(0, os.path.abspath('@PADDLE_BINARY_DIR@/python'))
 import shlex
 from recommonmark import parser, transform
-import paddle
-import paddle.v2
+@IMPORT_PADDLE_STRING@
+@IMPORT_PADDLEV2_STRING@
 
 MarkdownParser = parser.CommonMarkParser
 AutoStructify = transform.AutoStructify
diff --git a/doc/templates/conf.py.en.in b/doc/templates/conf.py.en.in
index 5aa5c1381fa3fad4ebc181c7868da03ae0138016..5b09464cb991f96127edec40f7dbbc97a8d82582 100644
--- a/doc/templates/conf.py.en.in
+++ b/doc/templates/conf.py.en.in
@@ -16,8 +16,8 @@ import os, subprocess
 sys.path.insert(0, os.path.abspath('@PADDLE_BINARY_DIR@/python'))
 import shlex
 from recommonmark import parser, transform
-import paddle
-import paddle.v2
+@IMPORT_PADDLE_STRING@
+@IMPORT_PADDLEV2_STRING@
 
 
 MarkdownParser = parser.CommonMarkParser
diff --git a/doc/v2/CMakeLists.txt b/doc/v2/CMakeLists.txt
index be957d37b14c618e9346251b3bd3dbaf1541773f..d230a1b9217eea6740419822f350096e361a4435 100644
--- a/doc/v2/CMakeLists.txt
+++ b/doc/v2/CMakeLists.txt
@@ -15,6 +15,9 @@ set(SPHINX_CACHE_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_doctrees")
 # HTML output director
 set(SPHINX_HTML_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/html")
 
+set(IMPORT_PADDLE_STRING "")
+set(IMPORT_PADDLEV2_STRING "")
+
 configure_file(
     "${CMAKE_CURRENT_SOURCE_DIR}/../templates/conf.py.en.in"
     "${BINARY_BUILD_DIR_EN}/conf.py"
@@ -27,8 +30,6 @@ sphinx_add_target(paddle_v2_docs
                   ${CMAKE_CURRENT_SOURCE_DIR}
                   ${SPHINX_HTML_DIR_EN})
 
-add_dependencies(paddle_v2_docs gen_proto_py paddle_python)
-
 # configured documentation tools and intermediate build results
 set(BINARY_BUILD_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_build")
 
@@ -50,6 +51,4 @@ sphinx_add_target(paddle_v2_docs_cn
                   ${CMAKE_CURRENT_SOURCE_DIR}
                   ${SPHINX_HTML_DIR_CN})
 
-add_dependencies(paddle_v2_docs_cn gen_proto_py paddle_python)
-
 add_subdirectory(api)
diff --git a/doc/v2/api/CMakeLists.txt b/doc/v2/api/CMakeLists.txt
index 2670a21a227546ffcee4f10f395feef3c58df9b4..0c74522cb089b17c8419e9058f76631b0fe0df93 100644
--- a/doc/v2/api/CMakeLists.txt
+++ b/doc/v2/api/CMakeLists.txt
@@ -7,6 +7,9 @@ set(SPHINX_CACHE_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_doctrees")
 # HTML output director
 set(SPHINX_HTML_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/html")
 
+set(IMPORT_PADDLE_STRING "import paddle")
+set(IMPORT_PADDLEV2_STRING "import paddle.v2")
+
 configure_file(
     "${CMAKE_CURRENT_SOURCE_DIR}/../../templates/conf.py.en.in"
     "${BINARY_BUILD_DIR_EN}/conf.py"
diff --git a/doc/v2/api/config/layer.rst b/doc/v2/api/config/layer.rst
index 29388f5005bf779a1bfa63c0d46d35996c0c792d..1a6496968cae1fef88142ba9ca3f9e63a81b196d 100644
--- a/doc/v2/api/config/layer.rst
+++ b/doc/v2/api/config/layer.rst
@@ -142,7 +142,7 @@ gated_unit
 -----------
 ..  autoclass:: paddle.v2.layer.gated_unit
     :noindex:
-    
+
 Recurrent Layer Group
 =====================
 
@@ -354,7 +354,7 @@ dropout
 --------
 ..  autoclass:: paddle.v2.layer.dropout
     :noindex:
-    
+
 dot_prod
 ---------
 .. autoclass:: paddle.v2.layer.dot_prod
@@ -460,6 +460,11 @@ multi_binary_label_cross_entropy_cost
 ..  autoclass:: paddle.v2.layer.multi_binary_label_cross_entropy_cost
     :noindex:
 
+classification_cost
+-------------------
+.. autoclass:: paddle.v2.layer.classification_cost
+   :noindex:
+
 huber_regression_cost
 -------------------------
 ..  autoclass:: paddle.v2.layer.huber_regression_cost
@@ -534,7 +539,7 @@ detection_output
 ----------------
 ..  autoclass:: paddle.v2.layer.detection_output
     :noindex:
-    
+
 Check Layer
 ============
 
diff --git a/doc/v2/build_and_install/build_from_source_cn.rst b/doc/v2/build_and_install/build_from_source_cn.rst
index 115b92a33888abf1e1be400e1abbb58b632a2976..6421c5308271c2508597d849c79709255caf349a 100644
--- a/doc/v2/build_and_install/build_from_source_cn.rst
+++ b/doc/v2/build_and_install/build_from_source_cn.rst
@@ -19,10 +19,11 @@
 ----------------
 
 PaddlePaddle需要使用Docker环境完成编译，这样可以免去单独安装编译依赖的步骤，可选的不同编译环境Docker镜像
-可以在 `这里 <https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/>`_ 找到。或者
-参考下述可选步骤，从源码中构建用于编译PaddlePaddle的Docker镜像。
+可以在 `这里 <https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/>`__ 找到，您也可以
+在 `这里 <https://github.com/PaddlePaddle/Paddle/tree/develop/tools/manylinux1/>`__ 找到 paddle_manylinux_devel
+镜像的编译以及使用方法。或者参考下述可选步骤，从源码中构建用于编译PaddlePaddle的Docker镜像。
 
-如果您选择不使用Docker镜像，则需要在本机安装下面章节列出的 `编译依赖`_ 之后才能开始编译的步骤。
+如果您选择不使用Docker镜像，则需要在本机安装下面章节列出的 :ref:`编译依赖 <_compile_deps>` 之后才能开始编译的步骤。
 
 编译PaddlePaddle，需要执行：
 
@@ -34,13 +35,11 @@ PaddlePaddle需要使用Docker环境完成编译，这样可以免去单独安
    # 2. 可选步骤：源码中构建用于编译PaddlePaddle的Docker镜像
    docker build -t paddle:dev .
    # 3. 执行下面的命令编译CPU-Only的二进制
-   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/docker/build.sh
+   docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh build
    # 4. 或者也可以使用为上述可选步骤构建的镜像（必须先执行第2步）
-   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddle:dev
+   docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddle:dev ./paddle/scripts/paddle_build.sh build
 
-注：上述命令把当前目录（源码树根目录）映射为 container 里的 :code:`/paddle` 目录。如果使用自行
-构建的镜像（上述第4步）会执行 :code:`Dockerfile` 描述的默认入口程序 :code:`build.sh` 可以省略步骤3中
-最后的执行脚本的命令。
+注：上述命令把当前目录（源码树根目录）映射为 container 里的 :code:`/paddle` 目录。
 
 编译完成后会在build/python/dist目录下生成输出的whl包，可以选在在当前机器安装也可以拷贝到目标机器安装：
 
@@ -71,15 +70,15 @@ PaddlePaddle需要使用Docker环境完成编译，这样可以免去单独安
 
 .. code-block:: bash
 
-   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/docker/build.sh
+   docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh test
 
 如果期望执行其中一个单元测试，（比如 :code:`test_sum_op` ）：
 
 .. code-block:: bash
 
-   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 /bin/bash
-   bash /paddle/paddle/scripts/docker/build.sh
-   cd /paddle/build
+   docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 /bin/bash
+   ./paddle/scripts/paddle_build.sh build
+   cd build
    ctest -R test_sum_op -V
 
 .. _faq_docker:
@@ -107,7 +106,7 @@ PaddlePaddle需要使用Docker环境完成编译，这样可以免去单独安
 
 - 学习 Docker 有多难？
 
-  理解 Docker 并不难，大概花十分钟看一下[这篇文章](https://zhuanlan.zhihu.com/p/19902938)。这可以帮您省掉花一小时安装和配置各种开发工具，以及切换机器时需要新安装的辛苦。别忘了 PaddlePaddle 更新可能导致需要新的开发工具。更别提简化问题复现带来的好处了。
+  理解 Docker 并不难，大概花十分钟看一下 `如何使用Docker <https://zhuanlan.zhihu.com/p/19902938>`_ 。这可以帮您省掉花一小时安装和配置各种开发工具，以及切换机器时需要新安装的辛苦。别忘了 PaddlePaddle 更新可能导致需要新的开发工具。更别提简化问题复现带来的好处了。
 
 - 我可以用 IDE 吗？
 
@@ -115,17 +114,16 @@ PaddlePaddle需要使用Docker环境完成编译，这样可以免去单独安
 
   很多 PaddlePaddle 开发者使用 Emacs。他们在自己的 `~/.emacs` 配置文件里加两行
 
-  ```emacs
-  (global-set-key "\C-cc" 'compile)
-  (setq compile-command
-   "docker run --rm -it -v $(git rev-parse --show-toplevel):/paddle paddle:dev")
-  ```
+  .. code-block:: emacs
+
+    (global-set-key "\C-cc" 'compile)
+    (setq compile-command "docker run --rm -it -v $(git rev-parse --show-toplevel):/paddle paddle:dev")
 
   就可以按 `Ctrl-C` 和 `c` 键来启动编译了。
 
 - 可以并行编译吗？
 
-  是的。我们的 Docker image 运行一个 [Bash 脚本](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build.sh)。这个脚本调用 `make -j$(nproc)` 来启动和 CPU 核一样多的进程来并行编译。
+  是的。我们的 Docker image 运行一个 `Paddle编译Bash脚本 <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build.sh>`_ 。这个脚本调用 `make -j$(nproc)` 来启动和 CPU 核一样多的进程来并行编译。
 
 - Docker 需要 sudo
 
@@ -133,11 +131,11 @@ PaddlePaddle需要使用Docker环境完成编译，这样可以免去单独安
 
 - 在 Windows/MacOS 上编译很慢
 
-  Docker 在 Windows 和 MacOS 都可以运行。不过实际上是运行在一个 Linux 虚拟机上。可能需要注意给这个虚拟机多分配一些 CPU 和内存，以保证编译高效。具体做法请参考[这个issue](https://github.com/PaddlePaddle/Paddle/issues/627)。
+  Docker 在 Windows 和 MacOS 都可以运行。不过实际上是运行在一个 Linux 虚拟机上。可能需要注意给这个虚拟机多分配一些 CPU 和内存，以保证编译高效。具体做法请参考 `如何为Windows/Mac计算机上的Docker增加内存和虚拟机 <https://github.com/PaddlePaddle/Paddle/issues/627>`_ 。
 
 - 磁盘不够
 
-  本文中的例子里，`docker run` 命令里都用了 `--rm` 参数，这样保证运行结束之后的 containers 不会保留在磁盘上。可以用 `docker ps -a` 命令看到停止后但是没有删除的 containers。`docker build` 命令有时候会产生一些中间结果，是没有名字的 images，也会占用磁盘。可以参考[这篇文章](https://zaiste.net/posts/removing_docker_containers/)来清理这些内容。
+  本文中的例子里，`docker run` 命令里都用了 `--rm` 参数，这样保证运行结束之后的 containers 不会保留在磁盘上。可以用 `docker ps -a` 命令看到停止后但是没有删除的 containers。`docker build` 命令有时候会产生一些中间结果，是没有名字的 images，也会占用磁盘。可以参考 `如何删除Docker Container <https://zaiste.net/posts/removing_docker_containers/>`_ 来清理这些内容。
 
 
 .. _compile_deps:
@@ -197,7 +195,7 @@ BLAS
 
 PaddlePaddle支持 `MKL <https://software.intel.com/en-us/intel-mkl>`_ 和
 `OpenBlAS <http://www.openblas.net/>`_ 两种BLAS库。默认使用MKL。如果使用MKL并且机器含有AVX2指令集，
-还会下载MKL-DNN数学库，详细参考 `这里 <https://github.com/PaddlePaddle/Paddle/tree/develop/doc/design/mkldnn#cmake>`_ 。
+还会下载MKL-DNN数学库，详细参考 `mkldnn设计文档 <https://github.com/PaddlePaddle/Paddle/tree/develop/doc/design/mkldnn#cmake>`_ 。
 
 如果关闭MKL，则会使用OpenBLAS作为BLAS库。
 
@@ -213,7 +211,7 @@ PaddlePaddle可以使用cuDNN v5.1之后的任何一个版本来编译运行，
 编译选项的设置
 ++++++++++++++
 
-PaddePaddle通过编译时指定路径来实现引用各种BLAS/CUDA/cuDNN库。cmake编译时，首先在系统路径（ :code:`/usr/lib:/usr/local/lib` ）中搜索这几个库，同时也会读取相关路径变量来进行搜索。 通过使用 ``-D`` 命令可以设置，例如 
+PaddePaddle通过编译时指定路径来实现引用各种BLAS/CUDA/cuDNN库。cmake编译时，首先在系统路径（ :code:`/usr/lib:/usr/local/lib` ）中搜索这几个库，同时也会读取相关路径变量来进行搜索。 通过使用 ``-D`` 命令可以设置，例如
 
 ..  code-block:: bash
 
diff --git a/doc/v2/build_and_install/build_from_source_en.rst b/doc/v2/build_and_install/build_from_source_en.rst
index 8fef9e7347e8d924026999bfda985381750c6b51..b08b45d43ec7f1deb2889832079a731ee724a44c 100644
--- a/doc/v2/build_and_install/build_from_source_en.rst
+++ b/doc/v2/build_and_install/build_from_source_en.rst
@@ -11,7 +11,7 @@ To build PaddlePaddle, you need
 1. A computer -- Linux, Windows, MacOS.
 2. Docker.
 
-Nothing else.  Not even Python and GCC, because you can install all build tools into a Docker image. 
+Nothing else.  Not even Python and GCC, because you can install all build tools into a Docker image.
 We run all the tools by running this image.
 
 .. _build_step:
@@ -22,8 +22,12 @@ How To Build
 You need to use Docker to build PaddlePaddle
 to avoid installing dependencies by yourself. We have several pre-built
 Docker images `here <https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/>`_ ,
+you can also find how to build and use paddle_manylinux_devel Docker image from
+`here <https://github.com/PaddlePaddle/Paddle/tree/develop/tools/manylinux1/>`__
 Or you can build your own image from source as the optional step below:
 
+If you don't wish to use docker，you need to install several compile dependencies manually as :ref:`Compile Dependencies <_compile_deps>` shows to start compilation.
+
 .. code-block:: bash
 
    # 1. clone the source code
@@ -32,14 +36,12 @@ Or you can build your own image from source as the optional step below:
    # 2. Optional: build development docker image from source
    docker build -t paddle:dev .
    # 3. Run the following command to build a CPU-Only binaries
-   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/docker/build.sh
+   docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh build
    # 4. Or, use your built Docker image to build PaddlePaddle (must run step 2)
-   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddle:dev
+   docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddle:dev ./paddle/scripts/paddle_build.sh build
 
 NOTE: The above command try to mount the current working directory (root directory of source code)
-into :code:`/paddle` directory inside docker container. If you are using your own image
-(Step 4) it will run default entry-point :code:`build.sh` , so you could omit the last
-command in step 3.
+into :code:`/paddle` directory inside docker container.
 
 When the compile finishes, you can get the output whl package under
 build/python/dist, then you can choose to install the whl on local
@@ -72,21 +74,21 @@ Set :code:`WITH_GPU=ON` Can also run tests on GPU.
 
 .. code-block:: bash
 
-   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x paddle/paddle/scripts/docker/build.sh
+   docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh test
 
 If you wish to run only one unit test, like :code:`test_sum_op`:
 
 .. code-block:: bash
 
-   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 /bin/bash
-   bash /paddle/paddle/scripts/docker/build.sh
-   cd /paddle/build
+   docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 /bin/bash
+   ./paddle/scripts/paddle_build.sh build
+   cd build
    ctest -R test_sum_op -V
 
 .. _faq_docker:
 
 Frequently Asked Questions
-----------------
+---------------------------
 
 - What is Docker?
 
@@ -108,7 +110,7 @@ Frequently Asked Questions
 
 - How difficult is it to learn Docker?
 
-    It takes you ten minutes to read [an introductory article](https://docs.docker.com/get-started) and saves you more than one hour to install all required build tools, configure them, especially when new versions of PaddlePaddle require some new tools.  Not even to mention the time saved when other people trying to reproduce the issue you have.
+    It takes you ten minutes to read `an introductory article <https://docs.docker.com/get-started>`_ and saves you more than one hour to install all required build tools, configure them, especially when new versions of PaddlePaddle require some new tools.  Not even to mention the time saved when other people trying to reproduce the issue you have.
 
 - Can I use my favorite IDE?
 
@@ -116,17 +118,16 @@ Frequently Asked Questions
 
   Many PaddlePaddle developers are using Emacs.  They add the following few lines into their `~/.emacs` configure file:
 
-  ```emacs
-  (global-set-key "\C-cc" 'compile)
-  (setq compile-command
-   "docker run --rm -it -v $(git rev-parse --show-toplevel):/paddle paddle:dev")
-  ```
+  .. code-block:: emacs
+
+    (global-set-key "\C-cc" 'compile)
+    (setq compile-command "docker run --rm -it -v $(git rev-parse --show-toplevel):/paddle paddle:dev")
 
   so they could type `Ctrl-C` and `c` to build PaddlePaddle from source.
 
 - Does Docker do parallel building?
 
-  Our building Docker image runs a [Bash script](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build.sh), which calls `make -j$(nproc)` to starts as many processes as the number of your CPU cores.
+  Our building Docker image runs a  `Bash script <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build.sh>`_ , which calls `make -j$(nproc)` to starts as many processes as the number of your CPU cores.
 
 - Docker requires sudo
 
@@ -134,16 +135,16 @@ Frequently Asked Questions
 
 - Docker on Windows/MacOS builds slowly
 
-  On Windows and MacOS, Docker containers run in a Linux VM.  You might want to give this VM some more memory and CPUs so to make the building efficient.  Please refer to [this issue](https://github.com/PaddlePaddle/Paddle/issues/627) for details.
+  On Windows and MacOS, Docker containers run in a Linux VM.  You might want to give this VM some more memory and CPUs so to make the building efficient.  Please refer to `this issue  <https://github.com/PaddlePaddle/Paddle/issues/627>`_ for details.
 
 - Not enough disk space
 
-  Examples in this article use option `--rm` with the `docker run` command.  This option ensures that stopped containers do not exist on hard disks.  We can use `docker ps -a` to list all containers, including stopped.  Sometimes `docker build` generates some intermediate dangling images, which also take disk space.  To clean them, please refer to [this article](https://zaiste.net/posts/removing_docker_containers/).
+  Examples in this article use option `--rm` with the `docker run` command.  This option ensures that stopped containers do not exist on hard disks.  We can use `docker ps -a` to list all containers, including stopped.  Sometimes `docker build` generates some intermediate dangling images, which also take disk space.  To clean them, please refer to `this article <https://zaiste.net/posts/removing_docker_containers/>`_ .
 
 .. _compile_deps:
 
 Appendix: Compile Dependencies
-----------------
+-------------------------------
 
 PaddlePaddle need the following dependencies when compiling, other dependencies
 will be downloaded automatically.
@@ -164,11 +165,11 @@ will be downloaded automatically.
 .. _build_options:
 
 Appendix: Build Options
-----------------
+-------------------------
 
 Build options include whether build binaries for CPU or GPU, which BLAS
 library to use etc. You may pass these settings when running cmake.
-For detailed cmake tutorial please refer to `here <https://cmake.org/cmake-tutorial>`_ 。
+For detailed cmake tutorial please refer to `here <https://cmake.org/cmake-tutorial>`__ 。
 
 
 You can add :code:`-D` argument to pass such options, like:
@@ -217,7 +218,7 @@ keep on with latest cuDNN versions. Be sure to run with the same version of cuDN
 you built.
 
 Pass Compile Options
-++++++++++++++
+++++++++++++++++++++++
 
 You can pass compile options to use intended BLAS/CUDA/Cudnn libraries.
 When running cmake command, it will search system paths like
diff --git a/doc/v2/build_and_install/docker_install_cn.rst b/doc/v2/build_and_install/docker_install_cn.rst
index 79d214635a069a739060e0b79424729f6ff90387..106c86bace075764c84bc2a7f7cb09d466fa8794 100644
--- a/doc/v2/build_and_install/docker_install_cn.rst
+++ b/doc/v2/build_and_install/docker_install_cn.rst
@@ -73,6 +73,7 @@
 当然，您也可以进入到Docker容器中，以交互式的方式执行或调试您的代码：
 
   .. code-block:: bash
+
      docker run -it -v $PWD:/work paddlepaddle/paddle /bin/bash
      cd /work
      python train.py
@@ -97,7 +98,7 @@ PaddlePaddle Book是为用户和开发者制作的一个交互式的Jupyter Note
 
 国内用户可以使用下面的镜像源来加速访问：
 
-  .. code-block: bash
+  .. code-block:: bash
 
     docker run -p 8888:8888 docker.paddlepaddlehub.com/book
 
diff --git a/doc/v2/build_and_install/docker_install_en.rst b/doc/v2/build_and_install/docker_install_en.rst
index e0e0559fb858a093db96a9b4ec1c5a45d6c71a38..25aecb8d0da9feb00006da6259b529b7011d91cb 100644
--- a/doc/v2/build_and_install/docker_install_en.rst
+++ b/doc/v2/build_and_install/docker_install_en.rst
@@ -80,6 +80,7 @@ Also, you can go into the container shell, run or debug your code
 interactively:
 
   .. code-block:: bash
+
      docker run -it -v $PWD:/work paddlepaddle/paddle /bin/bash
      cd /work
      python train.py
@@ -104,7 +105,7 @@ We provide a packaged book image, simply issue the command:
 
 For users in China, we provide a faster mirror:
 
-  .. code-block: bash
+  .. code-block:: bash
 
     docker run -p 8888:8888 docker.paddlepaddlehub.com/book
 
diff --git a/doc/v2/build_and_install/index_cn.rst b/doc/v2/build_and_install/index_cn.rst
index e079bb661f3a5141a09dfbc6893d1bf945697bc9..1a9305ac4b6578c14a962f223c647a71e3b8a72b 100644
--- a/doc/v2/build_and_install/index_cn.rst
+++ b/doc/v2/build_and_install/index_cn.rst
@@ -6,7 +6,7 @@
 PaddlePaddle针对不同的用户群体提供了多种安装方式。
 
 专注深度学习模型开发
------------------
+--------------------
 
 PaddlePaddle提供了多种python wheel包，可通过pip一键安装：
 
@@ -18,7 +18,7 @@ PaddlePaddle提供了多种python wheel包，可通过pip一键安装：
 这是最便捷的安装方式，请根据机器配置和系统选择对应的安装包。
 
 关注底层框架
-----------
+-------------
 
 PaddlePaddle提供了基于Docker的安装方式，请参照以下教程：
 
@@ -45,7 +45,7 @@ PaddlePaddle提供了基于Docker的安装方式，请参照以下教程：
 
 
 常见问题汇总
------------
+--------------
 
 如果在安装过程中遇到了问题，请先尝试在下面的页面寻找答案：
 
diff --git a/doc/v2/build_and_install/index_en.rst b/doc/v2/build_and_install/index_en.rst
index 5b3de0f8c3e5496060646b5ddb080d0d338a8bfa..7990bacbd6966e88e8763e9c5709e410f7e9fed4 100644
--- a/doc/v2/build_and_install/index_en.rst
+++ b/doc/v2/build_and_install/index_en.rst
@@ -1,12 +1,12 @@
 install and Compile
-==========
+======================
 
 .. _install_steps:
 
 PaddlePaddle provides various methods of installation for many different users
 
 Focus on Deep Learning Model Development
------------------
+----------------------------------------
 
 PaddlePaddle provides lots of packages of python wheel , that pip can install:
 
@@ -18,7 +18,7 @@ PaddlePaddle provides lots of packages of python wheel , that pip can install:
 This is the most convenient way of installation. Please choose the right installation package with machine configure and system.
 
 Follow the Bottom Frame
-----------
+------------------------
 
 PaddlePaddle also supports installation using Docker. Please refer to the tutorial below:
 
diff --git a/doc/v2/build_and_install/pip_install_cn.rst b/doc/v2/build_and_install/pip_install_cn.rst
index b3d882743785e8ee301b71b696230531d2b7ba58..853bdb21bbcf07ae1742d2196dbcfe4668828b7b 100644
--- a/doc/v2/build_and_install/pip_install_cn.rst
+++ b/doc/v2/build_and_install/pip_install_cn.rst
@@ -10,20 +10,38 @@ PaddlePaddle可以使用常用的Python包管理工具
 使用pip安装
 ------------------------------
 
-
-执行下面的命令即可在当前机器上安装PaddlePaddle的运行时环境，并自动下载安装依赖软件，版本为cpu_avx_openblas。
+执行下面的命令即可在当前机器上安装PaddlePaddle的运行时环境，并自动下载安装依赖软件。
 
   .. code-block:: bash
 
      pip install paddlepaddle
 
+当前的默认版本为0.12.0，cpu_avx_openblas，您可以通过指定版本号来安装其它版本，例如:
+
+  .. code-block:: bash
+
+      pip install paddlepaddle==0.11.0
+
 
-如果需要安装支持GPU的版本（cuda7.5_cudnn5_avx_openblas），需要执行：
+如果需要安装支持GPU的版本（cuda8.0_cudnn5_avx_openblas），需要执行：
 
   .. code-block:: bash
 
      pip install paddlepaddle-gpu
 
+当前的默认版本也是0.12.0，PaddlePaddle针对不同需求提供了更多版本的安装包，部分列表如下：
+
+=================================   ========================================
+版本号                               版本说明
+=================================   ========================================
+paddlepaddle-gpu==0.12.0            使用CUDA 8.0和cuDNN 5编译的0.12.0版本
+paddlepaddle-gpu==0.11.0.post87     使用CUDA 8.0和cuDNN 7编译的0.11.0版本
+paddlepaddle-gpu==0.11.0.post8      使用CUDA 8.0和cuDNN 5编译的0.11.0版本
+paddlepaddle-gpu==0.11.0            使用CUDA 7.5和cuDNN 5编译的0.11.0版本
+=================================   ========================================
+
+您可以在 `Release History <https://pypi.org/project/paddlepaddle-gpu/#history>`_ 中找到paddlepaddle-gpu的各个发行版本。
+
 如果需要获取并安装最新的（开发分支）PaddlePaddle，可以从我们的CI系统中下载最新的whl安装包和c-api开发包并安装，
 您可以从下面的表格中找到需要的版本：
 
@@ -37,12 +55,11 @@ PaddlePaddle可以使用常用的Python包管理工具
     :header: "版本说明", "cp27-cp27mu", "cp27-cp27m"
     :widths: 1, 3, 3
 
-    "cpu_avx_mkl", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_"
-    "cpu_avx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_"
-    "cpu_noavx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_"
-    "cuda7.5_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_"
-    "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_"
-    "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_"
+    "cpu_avx_mkl", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl>`__"
+    "cpu_avx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl>`__"
+    "cpu_noavx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl>`_"
+    "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl>`__"
+    "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl>`__"
 
 .. _pip_dependency:
 
@@ -69,7 +86,7 @@ PaddlePaddle发布的安装包会尽量对齐 `manylinux1 <https://www.python.or
 ------------------------------
 
 - paddlepaddle*.whl is not a supported wheel on this platform.
-  
+
   出现这个问题的主要原因是，没有找到和当前系统匹配的paddlepaddle安装包。请检查Python版本是否为2.7系列。另外最新的pip官方源中的安装包默认是manylinux1标准，需要使用最新的pip (>9.0.0) 才可以安装。可以使用下面的命令更新您的pip：
 
     .. code-block:: bash
diff --git a/doc/v2/build_and_install/pip_install_en.rst b/doc/v2/build_and_install/pip_install_en.rst
index 1e409d86b9775094998f72f92954f4bbc1013ea1..fecf6d3712feac3265100a6121901ba784f7d5cc 100644
--- a/doc/v2/build_and_install/pip_install_en.rst
+++ b/doc/v2/build_and_install/pip_install_en.rst
@@ -12,20 +12,38 @@ Install using pip
 ------------------------------
 
 Run the following command to install PaddlePaddle on the current
-machine, it will also download requirements, the version is cpu_avx_openblas.
+machine, it will also download requirements.
 
   .. code-block:: bash
 
      pip install paddlepaddle
 
+the default version is 0.12.0, cpu_avx_openblas, you can specify the versions to satisfy your demands, like:
 
-If you wish to install GPU version (cuda7.5_cudnn5_avx_openblas), just run:
+  .. code-block:: bash
+
+      pip install paddlepaddle==0.11.0
+
+If you need to install a GPU-enabled version (cuda8.0_cudnn5_avx_openblas), you need to run:
 
   .. code-block:: bash
 
      pip install paddlepaddle-gpu
 
-If you wish to install the latest develop branch PaddlePaddle, 
+The default version is also 0.12.0, PaddlePaddle provides several versions of packages for different needs, as shown in the table:
+
+=================================   ========================================
+版本号                               版本说明
+=================================   ========================================
+paddlepaddle-gpu==0.12.0            0.12.0 built with CUDA 8.0 and cuDNN 5
+paddlepaddle-gpu==0.11.0.post87     0.11.0 built with CUDA 8.0 and cuDNN 7
+paddlepaddle-gpu==0.11.0.post8      0.11.0 built with CUDA 8.0 and cuDNN 5
+paddlepaddle-gpu==0.11.0            0.11.0 built with CUDA 7.5 and cuDNN 5
+=================================   ========================================
+
+You can find all versions released of paddlepaddle-gpu in `Release History <https://pypi.org/project/paddlepaddle-gpu/#history>`_ .
+
+If you wish to install the latest develop branch PaddlePaddle,
 you can download the latest whl package from our CI system. Access
 the below links, log in as guest, then click at the "Artifact"
 tab, you'll find the download link of whl packages.
@@ -40,12 +58,11 @@ If the links below shows up the login form, just click "Log in as guest" to star
     :header: "version", "cp27-cp27mu", "cp27-cp27m"
     :widths: 1, 3, 3
 
-    "cpu_avx_mkl", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_"
-    "cpu_avx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_"
-    "cpu_noavx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_"
-    "cuda7.5_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_"
-    "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_"
-    "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_"
+    "cpu_avx_mkl", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl>`__"
+    "cpu_avx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl>`__"
+    "cpu_noavx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl>`__"
+    "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl>`__"
+    "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl>`__"
 
 .. _pip_dependency:
 
@@ -79,7 +96,7 @@ FAQ
 ------------------------------
 
 - paddlepaddle*.whl is not a supported wheel on this platform.
-  
+
   The main cause of this issue is that your current platform is
   not supported. Please check that you are using Python 2.7 series.
   Besides, pypi only supports manylinux1 standard, you'll need to
diff --git a/doc/v2/design/mkl/mkldnn.md b/doc/v2/design/mkl/mkldnn.md
index 1bd2e7bc34ee79eb753b3520d97e5e7beca89b0b..bd5bcf6f67168c21cebb046a629b948d1661e75c 100644
--- a/doc/v2/design/mkl/mkldnn.md
+++ b/doc/v2/design/mkl/mkldnn.md
@@ -5,7 +5,7 @@
 充分展现英特尔平台的优势，有效提升PaddlePaddle在英特尔架构上的性能。
 
 <div align="center">
-<img src="image/overview.png"><br/>
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/v2/images/overview.png"><br/>
 Figure 1. PaddlePaddle on IA
 </div>
 
@@ -42,16 +42,43 @@ Figure 1. PaddlePaddle on IA
 
 MKL，MKLML以及MKL-DNN三者关系如下表：
 
-| Name        |  Open Source     | License     | Descriptions  |
-| :---------- | :--------------- | :---------- | :------------ |
-|   MKL       |     No           | Proprietary | Accelerate math processing routines |
-|   MKLML     |     No           | Proprietary | Small package of MKL, especially for Machine Learning |
-|   MKL-DNN   |     Yes          | Apache 2.0  | Accelerate primitives processing routines especially for Deep Neural Networks  |
+<table>
+<thead>
+<tr>
+<th>Name</th>
+<th>Open Source</th>
+<th>License</th>
+<th>Descriptions</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>MKL</td>
+<td>No</td>
+<td>Proprietary</td>
+<td>Accelerate math processing routines</td>
+</tr>
+<tr>
+<td>MKLML</td>
+<td>No</td>
+<td>Proprietary</td>
+<td>Small package of MKL, especially for Machine Learning</td>
+</tr>
+
+<tr>
+<td>MKL-DNN</td>
+<td>Yes</td>
+<td>Apache 2.0</td>
+<td>Accelerate primitives processing routines especially for Deep Neural Networks</td>
+</tr>
+
+</tbody>
+</table>
 
 MKLML可以与MKL-DNN共同使用，以此达到最好的性能。
 
 <div align="center">
-<img src="image/engine.png"><br/>
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/v2/images/engine.png"><br/>
 Figure 2. PaddlePaddle with MKL Engines
 </div>
 
@@ -103,7 +130,7 @@ MKL-DNN的库目前只有动态库`libmkldnn.so`。
 所以我们定义了一个`MKLDNNMatrix`用于管理MKL-DNN数据的不同格式以及相互之间的转换。
 
 <div align="center">
-<img src="image/matrix.png"><br/>
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/v2/images/matrix.png"><br/>
 Figure 3. MKLDNNMatrix
 </div>
 
@@ -113,7 +140,7 @@ Figure 3. MKLDNNMatrix
 子类只需要使用定义好的接口，实现具体的函数功能即可。
 
 <div align="center">
-<img src="image/layers.png"><br/>
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/v2/images/layers.png"><br/>
 Figure 4. MKLDNNLayer
 </div>
 
@@ -150,7 +177,7 @@ Figure 4. MKLDNNLayer
 所以整体上，在实现每个子类的时候就不需要关心分支的事情了。
 
 <div align="center">
-<img src="image/gradients.png"><br/>
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/v2/images/gradients.png"><br/>
 Figure 5. Merge Gradients
 </div>
 
diff --git a/doc/v2/dev/contribute_to_paddle_cn.md b/doc/v2/dev/contribute_to_paddle_cn.md
index d8bf093e09b53b302225739fa67146adc7976e4b..add06e42f1bbd221b48eb83e4e84d4a7c89e7483 100644
--- a/doc/v2/dev/contribute_to_paddle_cn.md
+++ b/doc/v2/dev/contribute_to_paddle_cn.md
@@ -51,6 +51,8 @@ Paddle 开发人员使用 [pre-commit](http://pre-commit.com/) 工具来管理 G
 
 Paddle 使用 `clang-format` 来调整 C/C++ 源代码格式，请确保 `clang-format` 版本在 3.8 以上。
 
+注：通过`pip install pre-commit`和`conda install -c conda-forge pre-commit`安装的`yapf`稍有不同的，Paddle 开发人员使用的是`pip install pre-commit`。
+
 ## 开始开发
 
 在本例中，我删除了 README.md 中的一行，并创建了一个新文件。
diff --git a/doc/v2/howto/capi/workflow_of_capi_cn.md b/doc/v2/howto/capi/workflow_of_capi_cn.md
index 1968c1099ac5734cd68b437f2f7aa428d7b5265e..3acdbae28e9b35f8a9104a89c9a5799f8c892334 100644
--- a/doc/v2/howto/capi/workflow_of_capi_cn.md
+++ b/doc/v2/howto/capi/workflow_of_capi_cn.md
@@ -59,7 +59,7 @@
     代码示例如下：
 
     ```python
-    from paddle.utils.merge_model import merge_v2_modelss
+    from paddle.utils.merge_model import merge_v2_model
     from mnist_v2 import network
 
     net = network(is_infer=True)
diff --git a/doc/v2/howto/cluster/multi_cluster/k8s_distributed_en.md b/doc/v2/howto/cluster/multi_cluster/k8s_distributed_en.md
index dee1b7554f97af17989c3f7739d8feea3b6b8e3f..b2dc4da8451af317df76c5b3df328b6f58429610 100644
--- a/doc/v2/howto/cluster/multi_cluster/k8s_distributed_en.md
+++ b/doc/v2/howto/cluster/multi_cluster/k8s_distributed_en.md
@@ -41,7 +41,7 @@ Training docker image needs to package the paddle pserver and paddle trainer run
 - Generating the initialization arguments for `Paddle PServer` and `Paddle Training` processes.
 
 Since the paddlepaddle official docker image already has the runtimes we need, we'll take it as the base image and pack some additional scripts for the processes mentioned above to build our training image. for more detail, please find from the following link:
-- https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/usage/cluster/src/k8s_train/Dockerfile
+- https://github.com/PaddlePaddle/Paddle/tree/develop/doc/v2/howto/cluster/multi_cluster/src/k8s_train/Dockerfile
 
 
 ```bash
@@ -62,7 +62,7 @@ represent the Docker Image which built in this step.
 ### Prepare Training Data
 
 We can download and split the training job by creating a Kubernetes Job, or custom your image
-by editing [k8s_train](./src/k8s_train/).
+by editing [k8s_train](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/v2/howto/cluster/multi_cluster/src/k8s_train).
 
 Before creating a Job, we need to bind a [persistenVolumeClaim](https://kubernetes.io/docs/user-guide/persistent-volumes) by the different type of
 the different file system, the generated dataset would be saved on this volume.
diff --git a/doc/v2/images/FullyConnected.jpg b/doc/v2/images/FullyConnected.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..b2241f401434e527f95ee4e0e541a3f2ff78fd1e
Binary files /dev/null and b/doc/v2/images/FullyConnected.jpg differ
diff --git a/doc/v2/images/add_security_group.png b/doc/v2/images/add_security_group.png
new file mode 100644
index 0000000000000000000000000000000000000000..bd34f46c9b0ada7027fd53e553e7d033255d25fc
Binary files /dev/null and b/doc/v2/images/add_security_group.png differ
diff --git a/doc/v2/images/bi_lstm.jpg b/doc/v2/images/bi_lstm.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..adec1606d64d6e35ffe7e62abfa9a09309b05c84
Binary files /dev/null and b/doc/v2/images/bi_lstm.jpg differ
diff --git a/doc/v2/images/checkpointing.png b/doc/v2/images/checkpointing.png
new file mode 100644
index 0000000000000000000000000000000000000000..c221e8474f90f37e31416cbb19c9452207a0d14c
Binary files /dev/null and b/doc/v2/images/checkpointing.png differ
diff --git a/doc/v2/images/create_efs.png b/doc/v2/images/create_efs.png
new file mode 100644
index 0000000000000000000000000000000000000000..e5f1526033d1daf401700989af1d25919bcb7675
Binary files /dev/null and b/doc/v2/images/create_efs.png differ
diff --git a/doc/v2/images/csr.png b/doc/v2/images/csr.png
new file mode 100644
index 0000000000000000000000000000000000000000..3dc10b8de4f6d3f517624956b1694b689405a031
Binary files /dev/null and b/doc/v2/images/csr.png differ
diff --git a/doc/v2/images/data_dispatch.png b/doc/v2/images/data_dispatch.png
new file mode 100644
index 0000000000000000000000000000000000000000..5bdcc24d6a6d193cb014f8c38b362451fded5e54
Binary files /dev/null and b/doc/v2/images/data_dispatch.png differ
diff --git a/doc/v2/images/dataset.graffle b/doc/v2/images/dataset.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..c10a423ed16a23229a9ee33d11bfc82bb59646c8
Binary files /dev/null and b/doc/v2/images/dataset.graffle differ
diff --git a/doc/v2/images/dataset.png b/doc/v2/images/dataset.png
new file mode 100644
index 0000000000000000000000000000000000000000..2fb7f1cce3b6dd21489392557826e95a9f207c34
Binary files /dev/null and b/doc/v2/images/dataset.png differ
diff --git a/doc/v2/images/doc_en.png b/doc/v2/images/doc_en.png
new file mode 100644
index 0000000000000000000000000000000000000000..ed6b9178fba91a3bdf45ae797a9924f84146fbc8
Binary files /dev/null and b/doc/v2/images/doc_en.png differ
diff --git a/doc/v2/images/efs_mount.png b/doc/v2/images/efs_mount.png
new file mode 100644
index 0000000000000000000000000000000000000000..0f9e3cab98445707e5e9baa18ddabe15cdf04576
Binary files /dev/null and b/doc/v2/images/efs_mount.png differ
diff --git a/doc/v2/images/encoder-decoder-attention-model.png b/doc/v2/images/encoder-decoder-attention-model.png
new file mode 100644
index 0000000000000000000000000000000000000000..79f911d4ba12ac0c0d1a936c9df639c302786914
Binary files /dev/null and b/doc/v2/images/encoder-decoder-attention-model.png differ
diff --git a/doc/v2/images/engine.png b/doc/v2/images/engine.png
new file mode 100644
index 0000000000000000000000000000000000000000..1f5f65c2cc765a514a3ba9e7b7f468e1dc4b0c3b
Binary files /dev/null and b/doc/v2/images/engine.png differ
diff --git a/doc/v2/images/file_storage.graffle b/doc/v2/images/file_storage.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..50a17e70fa255495337c529a3bf12a5c0024a5be
Binary files /dev/null and b/doc/v2/images/file_storage.graffle differ
diff --git a/doc/v2/images/file_storage.png b/doc/v2/images/file_storage.png
new file mode 100644
index 0000000000000000000000000000000000000000..fccb4e3e7e738224c7f1584326bd5f351ce799aa
Binary files /dev/null and b/doc/v2/images/file_storage.png differ
diff --git a/doc/v2/images/glossary_rnn.dot b/doc/v2/images/glossary_rnn.dot
new file mode 100644
index 0000000000000000000000000000000000000000..2cd0fb1820c44b0e8e0b869f9d39fcad27efa758
--- /dev/null
+++ b/doc/v2/images/glossary_rnn.dot
@@ -0,0 +1,42 @@
+digraph G{
+	subgraph cluster_timestep0 {
+		label="recurrent timestep i-1"
+		bgcolor=lightgray
+		node [style=filled,color=white]
+		fc0_0 [label="fc 0"]
+		fc0_1 [label="fc 1"]
+		fc0_2 [label="fc 2"]
+
+		fc0_0 -> fc0_1
+		fc0_1 -> fc0_2
+	}
+
+	subgraph cluster_timestep1 {
+		label="recurrent timestep i"
+		node [style=filled];
+		fc1_0 [label="fc 0"]
+		fc1_1 [label="fc 1"]
+		fc1_2 [label="fc 2"]
+		color=blue
+
+		fc1_0 -> fc1_1
+		fc1_1 -> fc1_2
+	}
+
+	subgraph cluster_timestep2 {
+		label="recurrent timestep i+1"
+		bgcolor=lightgray
+		node [style=filled,color=white]
+		fc2_0 [label="fc 0"]
+		fc2_1 [label="fc 1"]
+		fc2_2 [label="fc 2"]
+
+		fc2_0 -> fc2_1
+		fc2_1 -> fc2_2
+	}
+	
+	
+	fc0_1 -> fc1_1 [style="dotted" constraint=false]
+	fc1_1 -> fc2_1 [style="dotted" constraint=false]
+
+}
\ No newline at end of file
diff --git a/doc/v2/images/glossary_rnn_with_memory.dot b/doc/v2/images/glossary_rnn_with_memory.dot
new file mode 100644
index 0000000000000000000000000000000000000000..0f101ec2d8f15aec76c57f328046b6b55cf0c7eb
--- /dev/null
+++ b/doc/v2/images/glossary_rnn_with_memory.dot
@@ -0,0 +1,48 @@
+digraph G{
+	subgraph cluster_timestep0 {
+		label="recurrent timestep i-1"
+		bgcolor=lightgray
+		node [style=filled,color=white]
+		fc0_0 [label="fc 0"]
+		fc0_1 [label="fc 1"]
+		fc0_2 [label="fc 2"]
+		m0 [label="memory"]
+		fc0_0 -> fc0_1
+		fc0_1 -> fc0_2
+		fc0_1 -> m0
+		m0 -> fc0_1
+	}
+
+	subgraph cluster_timestep1 {
+		label="recurrent timestep i"
+		node [style=filled];
+		fc1_0 [label="fc 0"]
+		fc1_1 [label="fc 1"]
+		fc1_2 [label="fc 2"]
+		m1 [label="memory"]
+		color=blue
+		fc1_0 -> fc1_1
+		fc1_1 -> fc1_2
+		fc1_1 -> m1
+		m1 -> fc1_1
+	}
+
+	subgraph cluster_timestep2 {
+		label="recurrent timestep i+1"
+		bgcolor=lightgray
+		node [style=filled,color=white]
+		fc2_0 [label="fc 0"]
+		fc2_1 [label="fc 1"]
+		fc2_2 [label="fc 2"]
+		m2 [label="memory"]
+		fc2_0 -> fc2_1
+		fc2_1 -> fc2_2
+		fc2_1 -> m2
+		m2 -> fc2_1
+	}
+	
+	
+	m0 -> m1 [style="dotted" constraint=false]
+	m1 -> m2 [style="dotted" constraint=false]
+
+}
\ No newline at end of file
diff --git a/doc/v2/images/gradients.png b/doc/v2/images/gradients.png
new file mode 100644
index 0000000000000000000000000000000000000000..f031bcf8e4cec14e63075b8b9d2c7bbd9f1b1a3c
Binary files /dev/null and b/doc/v2/images/gradients.png differ
diff --git a/doc/v2/images/init_lock.graffle b/doc/v2/images/init_lock.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..fa9149f21b1311eed48ef72ec55e556559d0fc94
Binary files /dev/null and b/doc/v2/images/init_lock.graffle differ
diff --git a/doc/v2/images/init_lock.png b/doc/v2/images/init_lock.png
new file mode 100644
index 0000000000000000000000000000000000000000..92404ee6d6c0f9a7727952bae3c869ba338ecd7f
Binary files /dev/null and b/doc/v2/images/init_lock.png differ
diff --git a/doc/v2/images/k8s-paddle-arch.png b/doc/v2/images/k8s-paddle-arch.png
new file mode 100644
index 0000000000000000000000000000000000000000..b3800c4fe81302d35e49f7dbacb9221c4dfa5cde
Binary files /dev/null and b/doc/v2/images/k8s-paddle-arch.png differ
diff --git a/doc/v2/images/layers.png b/doc/v2/images/layers.png
new file mode 100644
index 0000000000000000000000000000000000000000..306f79b7a844610915eb8944128f57d2b7a3065a
Binary files /dev/null and b/doc/v2/images/layers.png differ
diff --git a/doc/v2/images/managed_policy.png b/doc/v2/images/managed_policy.png
new file mode 100644
index 0000000000000000000000000000000000000000..c7ecda555b81d7750e9292a9ab72d2f517f76a2a
Binary files /dev/null and b/doc/v2/images/managed_policy.png differ
diff --git a/doc/v2/images/matrix.png b/doc/v2/images/matrix.png
new file mode 100644
index 0000000000000000000000000000000000000000..c33ce9cf0335e47cc8c1253304d0fe179186e6f2
Binary files /dev/null and b/doc/v2/images/matrix.png differ
diff --git a/doc/v2/images/nvvp1.png b/doc/v2/images/nvvp1.png
new file mode 100644
index 0000000000000000000000000000000000000000..1af23ac3c52929b2b0645d2f9fa4d4c6db1f6e77
Binary files /dev/null and b/doc/v2/images/nvvp1.png differ
diff --git a/doc/v2/images/nvvp2.png b/doc/v2/images/nvvp2.png
new file mode 100644
index 0000000000000000000000000000000000000000..177c9db708da6863d1075f3e615f5962dbe18b29
Binary files /dev/null and b/doc/v2/images/nvvp2.png differ
diff --git a/doc/v2/images/nvvp3.png b/doc/v2/images/nvvp3.png
new file mode 100644
index 0000000000000000000000000000000000000000..d8f393667d6569b6f1e61ffccac43fae5888b6db
Binary files /dev/null and b/doc/v2/images/nvvp3.png differ
diff --git a/doc/v2/images/nvvp4.png b/doc/v2/images/nvvp4.png
new file mode 100644
index 0000000000000000000000000000000000000000..51f2f3e183295de6cf8ddaf2b3b8a0862aa35f01
Binary files /dev/null and b/doc/v2/images/nvvp4.png differ
diff --git a/doc/v2/images/overview.png b/doc/v2/images/overview.png
new file mode 100644
index 0000000000000000000000000000000000000000..8fb7bbb9dd654bf363d701d0c8cd4a557043d188
Binary files /dev/null and b/doc/v2/images/overview.png differ
diff --git a/doc/v2/images/paddle-cloud-in-data-center.png b/doc/v2/images/paddle-cloud-in-data-center.png
new file mode 100644
index 0000000000000000000000000000000000000000..da5d1a77562480ad1d886f5f21dbd84001d3d508
Binary files /dev/null and b/doc/v2/images/paddle-cloud-in-data-center.png differ
diff --git a/doc/v2/images/paddle-etcd.graffle b/doc/v2/images/paddle-etcd.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..f973dc9b9dbf72e9bc31e2d32822916cd281f8d9
Binary files /dev/null and b/doc/v2/images/paddle-etcd.graffle differ
diff --git a/doc/v2/images/paddle-etcd.png b/doc/v2/images/paddle-etcd.png
new file mode 100644
index 0000000000000000000000000000000000000000..57981ceb4b94f0f7d6dfa63f3d28c0402bf9cc31
Binary files /dev/null and b/doc/v2/images/paddle-etcd.png differ
diff --git a/doc/v2/images/paddle-model-sharding.graffle b/doc/v2/images/paddle-model-sharding.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..fba30f0ca2b47f0d202a432821d95e55aac37ec8
Binary files /dev/null and b/doc/v2/images/paddle-model-sharding.graffle differ
diff --git a/doc/v2/images/paddle-model-sharding.png b/doc/v2/images/paddle-model-sharding.png
new file mode 100644
index 0000000000000000000000000000000000000000..8c3f6724ef46c6527e63a4cd8cb0b50fe0167124
Binary files /dev/null and b/doc/v2/images/paddle-model-sharding.png differ
diff --git a/doc/v2/images/paddle-ps-0.png b/doc/v2/images/paddle-ps-0.png
new file mode 100644
index 0000000000000000000000000000000000000000..47ef32806f182cab003da77f1556823b3f6d1721
Binary files /dev/null and b/doc/v2/images/paddle-ps-0.png differ
diff --git a/doc/v2/images/paddle-ps-1.png b/doc/v2/images/paddle-ps-1.png
new file mode 100644
index 0000000000000000000000000000000000000000..f3125db73096c52bac6e7c60e1675552857c0774
Binary files /dev/null and b/doc/v2/images/paddle-ps-1.png differ
diff --git a/doc/v2/images/paddle-ps.graffle b/doc/v2/images/paddle-ps.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..0e536ffdd91cd696008b4c01bad3cb53edebdc16
Binary files /dev/null and b/doc/v2/images/paddle-ps.graffle differ
diff --git a/doc/v2/images/paddle-task-queues.graffle b/doc/v2/images/paddle-task-queues.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..4263ed8bfd2ef0e55058828bf23f2fac3595e5fd
Binary files /dev/null and b/doc/v2/images/paddle-task-queues.graffle differ
diff --git a/doc/v2/images/paddle-task-queues.png b/doc/v2/images/paddle-task-queues.png
new file mode 100644
index 0000000000000000000000000000000000000000..5f980266795776752cebd0c346b85c4a75a47780
Binary files /dev/null and b/doc/v2/images/paddle-task-queues.png differ
diff --git a/doc/v2/images/paddle-task-states.graffle b/doc/v2/images/paddle-task-states.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..cf1a0b9246d9386a949d2dbb8c32fe84f72eea83
Binary files /dev/null and b/doc/v2/images/paddle-task-states.graffle differ
diff --git a/doc/v2/images/paddle-task-states.png b/doc/v2/images/paddle-task-states.png
new file mode 100644
index 0000000000000000000000000000000000000000..4ae43cb66c071aee9eb90d875e2373b29af9c3e0
Binary files /dev/null and b/doc/v2/images/paddle-task-states.png differ
diff --git a/doc/v2/images/ps_cn.png b/doc/v2/images/ps_cn.png
new file mode 100644
index 0000000000000000000000000000000000000000..f9525739cc8bc6506adde642aafa0a85ae3ebebc
Binary files /dev/null and b/doc/v2/images/ps_cn.png differ
diff --git a/doc/v2/images/ps_en.png b/doc/v2/images/ps_en.png
new file mode 100644
index 0000000000000000000000000000000000000000..6537d3d56589ca9f19a77a50a970e4b5275e6ce0
Binary files /dev/null and b/doc/v2/images/ps_en.png differ
diff --git a/doc/v2/images/pserver_and_trainer.png b/doc/v2/images/pserver_and_trainer.png
new file mode 100644
index 0000000000000000000000000000000000000000..f41fe48920590333ad332bb51eb18e03dc251541
Binary files /dev/null and b/doc/v2/images/pserver_and_trainer.png differ
diff --git a/doc/v2/images/pserver_init.graffle b/doc/v2/images/pserver_init.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..5f3f1f52be8aa7f9049a8fcd6b7c93c8560c1676
Binary files /dev/null and b/doc/v2/images/pserver_init.graffle differ
diff --git a/doc/v2/images/pserver_init.png b/doc/v2/images/pserver_init.png
new file mode 100644
index 0000000000000000000000000000000000000000..dfe491ff98dd7db1c336093c80964a260df2cd90
Binary files /dev/null and b/doc/v2/images/pserver_init.png differ
diff --git a/doc/v2/images/route53_create_recordset.png b/doc/v2/images/route53_create_recordset.png
new file mode 100644
index 0000000000000000000000000000000000000000..34e476c7beac30fcdde13fccc4cc8d08b4be3d35
Binary files /dev/null and b/doc/v2/images/route53_create_recordset.png differ
diff --git a/doc/v2/images/route53_create_zone.png b/doc/v2/images/route53_create_zone.png
new file mode 100644
index 0000000000000000000000000000000000000000..25b7ddb831c5cba97f4b2edddd27da3234d621af
Binary files /dev/null and b/doc/v2/images/route53_create_zone.png differ
diff --git a/doc/v2/images/sequence_data.png b/doc/v2/images/sequence_data.png
new file mode 100644
index 0000000000000000000000000000000000000000..6e47a46b8955dfe977e85898fe3c9f33ed28de7e
Binary files /dev/null and b/doc/v2/images/sequence_data.png differ
diff --git a/doc/v2/images/simple_full_hierarchical_recurrent.dot b/doc/v2/images/simple_full_hierarchical_recurrent.dot
new file mode 100644
index 0000000000000000000000000000000000000000..ff278a0323bb2c3ef07bf6f016a3a8df05783581
--- /dev/null
+++ b/doc/v2/images/simple_full_hierarchical_recurrent.dot
@@ -0,0 +1,30 @@
+digraph G {
+  rankdir=LR;
+
+  subgraph cluster_t0 {
+    a [label="4"]
+    b [label="5"]
+    c [label="2"]
+  }
+  
+  subgraph cluster_t1 {
+    d [label="0"]
+    e [label="9"]
+  }
+
+  subgraph cluster_t2 {
+    f [label="8"]
+    g [label="1"]
+    h [label="4"]
+  }
+
+  a -> b;
+  b -> c;
+  c -> d [constraint=false];
+
+  d -> e;
+  e -> f [constraint=false];
+  
+  f -> g;
+  g -> h;
+}
\ No newline at end of file
diff --git a/doc/v2/images/simple_full_recurrent.dot b/doc/v2/images/simple_full_recurrent.dot
new file mode 100644
index 0000000000000000000000000000000000000000..cee281fbac993afbd0cc3416570f95965cdf0a59
--- /dev/null
+++ b/doc/v2/images/simple_full_recurrent.dot
@@ -0,0 +1,19 @@
+digraph G {
+  rankdir=LR;
+  a [label="4"]
+  b [label="5"]
+  c [label="2"]
+  d [label="0"]
+  e [label="9"]
+  f [label="8"]
+  g [label="1"]
+  h [label="4"]
+
+  a -> b;
+  b -> c;
+  c -> d;
+  d -> e;
+  e -> f;
+  f -> g;
+  g -> h;
+}
\ No newline at end of file
diff --git a/doc/v2/images/submit-job.graffle b/doc/v2/images/submit-job.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..677cdfb6d9a32168bf71729eb841fa1ca0dd31d6
Binary files /dev/null and b/doc/v2/images/submit-job.graffle differ
diff --git a/doc/v2/images/submit-job.png b/doc/v2/images/submit-job.png
new file mode 100644
index 0000000000000000000000000000000000000000..3046a460a7ba708079e88a560debaa215a694680
Binary files /dev/null and b/doc/v2/images/submit-job.png differ
diff --git a/doc/v2/images/trainer.graffle b/doc/v2/images/trainer.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..43415ed8cf61a5acfa34f8e56b9577f338dbf254
Binary files /dev/null and b/doc/v2/images/trainer.graffle differ
diff --git a/doc/v2/images/trainer.png b/doc/v2/images/trainer.png
new file mode 100644
index 0000000000000000000000000000000000000000..6537d3d56589ca9f19a77a50a970e4b5275e6ce0
Binary files /dev/null and b/doc/v2/images/trainer.png differ
diff --git a/doc/v2/images/trainer_cn.png b/doc/v2/images/trainer_cn.png
new file mode 100644
index 0000000000000000000000000000000000000000..f9525739cc8bc6506adde642aafa0a85ae3ebebc
Binary files /dev/null and b/doc/v2/images/trainer_cn.png differ
diff --git a/doc/v2/images/worker_security_group.png b/doc/v2/images/worker_security_group.png
new file mode 100644
index 0000000000000000000000000000000000000000..57eb0265a34ad4223b69600d2a3dd355482e0bf5
Binary files /dev/null and b/doc/v2/images/worker_security_group.png differ
diff --git a/doc/v2/images/workflow_of_CAPI.png b/doc/v2/images/workflow_of_CAPI.png
new file mode 100644
index 0000000000000000000000000000000000000000..a4399ade048b3fe10d2d9c714bc34333ca068edb
Binary files /dev/null and b/doc/v2/images/workflow_of_CAPI.png differ
diff --git a/go/pserver/client/c/test/CMakeLists.txt b/go/pserver/client/c/test/CMakeLists.txt
index 411dc50332672143d7a1f7bd0556ae86dc37f6f3..4500b1f288372ed0e2d9d383234df97ae976c60b 100644
--- a/go/pserver/client/c/test/CMakeLists.txt
+++ b/go/pserver/client/c/test/CMakeLists.txt
@@ -13,4 +13,3 @@
 # limitations under the License.
 #
 cc_test(test_cclient SRCS test_cclient.c DEPS paddle_pserver_cclient paddle_go_optimizer)
-add_style_check_target(test_cclient test_cclient.c)
diff --git a/paddle/.gitignore b/paddle/.gitignore
index 1c1c0c2c829f088d7e3f52ca007fcb8f33a16a36..01904aa6ef2057afee95ddd6e30cde064b06c52e 100644
--- a/paddle/.gitignore
+++ b/paddle/.gitignore
@@ -11,7 +11,6 @@ GTAGS
 *.pb.cc
 *.pb.h
 *_pb2.py
-paddle_*
 output/
 google/
 Makefile
diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt
index 8b1ca5e16548334ed0c9a6d31b88e0805304579e..d722eec1892206ac44c49e7a12d92be0c54df8c0 100644
--- a/paddle/CMakeLists.txt
+++ b/paddle/CMakeLists.txt
@@ -24,6 +24,6 @@ if(NOT WITH_FLUID_ONLY)
 endif()
 
 add_subdirectory(testing)
-if(NOT MOBILE_INFERENCE AND NOT RPI)
+if(NOT MOBILE_INFERENCE AND NOT RPI AND NOT WITH_C_API)
   add_subdirectory(fluid)
 endif()
diff --git a/paddle/api/GradientMachine.cpp b/paddle/api/GradientMachine.cpp
index a3d6f0f080abcf1f45d9bc5fbdb39bb6b6ca1553..0d9ad30de9c1f3f8f58c856a748abdc050ff8740 100644
--- a/paddle/api/GradientMachine.cpp
+++ b/paddle/api/GradientMachine.cpp
@@ -94,7 +94,7 @@ void UpdateCallback::apply(Parameter* p) {
 }
 
 class UpdateCallbackWrapper {
-public:
+ public:
   explicit UpdateCallbackWrapper(const UpdateCallback& callback)
       : callback(const_cast<UpdateCallback&>(callback)) {}
 
@@ -105,7 +105,7 @@ public:
     delete p;
   }
 
-private:
+ private:
   UpdateCallback& callback;
 };
 
diff --git a/paddle/api/PaddleAPI.h b/paddle/api/PaddleAPI.h
index 67368d1a99d980b248789d24a2ea4f466255687a..7866122006a996cbe5201c661cab9c81aa82a219 100644
--- a/paddle/api/PaddleAPI.h
+++ b/paddle/api/PaddleAPI.h
@@ -59,9 +59,10 @@ class RangeError {};
 
 /// Not support Error, such as access GPU memory directly, etc.
 class UnsupportError : public std::runtime_error {
-public:
-  UnsupportError() : std::runtime_error(" "){};
-  UnsupportError(const std::string& message) : std::runtime_error(message){};
+ public:
+  UnsupportError() : std::runtime_error(" ") {}
+  explicit UnsupportError(const std::string& message)
+      : std::runtime_error(message) {}
 };
 
 /// This type will map to python's list of float.
@@ -105,7 +106,7 @@ class Matrix {
   DISABLE_COPY(Matrix);
   static Matrix* createByPaddleMatrixPtr(void* sharedPtr);
 
-public:
+ public:
   virtual ~Matrix();
 
   /**
@@ -231,7 +232,7 @@ public:
 
   bool isGpu() const;
 
-private:
+ private:
   void* getSharedPtr() const;
 
   MatrixPrivate* m;
@@ -248,7 +249,7 @@ class Vector {
 
   void* getSharedPtr();
 
-public:
+ public:
   ~Vector();
 
   /// Create Vector filled with zero.
@@ -310,10 +311,10 @@ public:
   /// __len__ in python
   size_t getSize() const;
 
-private:
+ private:
   VectorPrivate* m;
 
-private:
+ private:
   friend class Parameter;
   friend class ParameterOptimizer;
   friend struct ParameterTraverseCallbackPrivate;
@@ -325,7 +326,7 @@ class IVector {
   DISABLE_COPY(IVector);
   static IVector* createByPaddleVectorPtr(void* ptr);
 
-public:
+ public:
   /// Create IVector filled with zero
   static IVector* createZero(size_t sz, bool useGpu = isUsingGpu());
 
@@ -389,7 +390,7 @@ public:
   /// This method will map to python __len__();
   size_t getSize() const;
 
-private:
+ private:
   void* getSharedPtr() const;
 
   friend class Arguments;
@@ -400,11 +401,11 @@ struct ArgumentsPrivate;
 
 /// The Arguments is actual a std::vector<paddle::Argument> in paddle.
 class Arguments {
-private:
+ private:
   Arguments();  // Internal Create.
   DISABLE_COPY(Arguments);
 
-public:
+ public:
   /**
    * Create a arguments with size.
    * Note that it can be zero.
@@ -475,12 +476,12 @@ public:
 
   float sum() const;
 
-private:
+ private:
   static Arguments* createByPaddleArgumentVector(void* ptr);
   static Arguments* createByPaddleArgument(const void* ptr);
   void* getInternalArgumentsPtr() const;
 
-private:
+ private:
   ArgumentsPrivate* m;
   friend class Trainer;
   friend class GradientMachine;
@@ -507,7 +508,7 @@ class ParameterConfig {
   static ParameterConfig* createParameterConfigFromParameterPtr(void* ptr);
   void* getRawPtr();
 
-public:
+ public:
   ~ParameterConfig();
 
   /**
@@ -515,10 +516,10 @@ public:
    */
   std::string toProtoString() const;
 
-private:
+ private:
   ParameterConfigPrivate* m;
 
-private:
+ private:
   friend class Parameter;
   friend class ParameterOptimizer;
   friend struct ParameterTraverseCallbackPrivate;
@@ -529,7 +530,7 @@ class OptimizationConfig {
   DISABLE_COPY(OptimizationConfig);
   OptimizationConfig();
 
-public:
+ public:
   static OptimizationConfig* createFromProtoString(const std::string& str);
   ~OptimizationConfig();
 
@@ -538,7 +539,7 @@ public:
    */
   std::string toProtoString();
 
-private:
+ private:
   OptimizationConfigPrivate* m;
 
   friend class TrainerConfig;
@@ -549,11 +550,11 @@ private:
 
 struct ParameterPrivate;
 class Parameter {
-private:
+ private:
   Parameter();
   DISABLE_COPY(Parameter);
 
-public:
+ public:
   virtual ~Parameter();
 
   /**
@@ -580,11 +581,11 @@ public:
 
   size_t getSize() const;
 
-private:
+ private:
   static Parameter* createFromRawPtr(void* ptr);
   static Parameter* createFromSharedPtr(void* ptr);
 
-private:
+ private:
   ParameterPrivate* m;
   friend class UpdateCallbackWrapper;
   friend class GradientMachine;
@@ -598,14 +599,14 @@ struct ModelConfigPrivate;
  * It is used by GradientMachine.
  */
 class ModelConfig {
-private:
+ private:
   ModelConfig();
   DISABLE_COPY(ModelConfig);
 
-public:
+ public:
   virtual ~ModelConfig();
 
-private:
+ private:
   ModelConfigPrivate* m;
   friend class TrainerConfig;
   friend struct TrainerConfigPrivate;
@@ -619,11 +620,11 @@ struct TrainerConfigPrivate;
  * It is used by GradientMachine.
  */
 class TrainerConfig {
-private:
+ private:
   TrainerConfig();
   DISABLE_COPY(TrainerConfig);
 
-public:
+ public:
   virtual ~TrainerConfig();
 
   static TrainerConfig* createFromTrainerConfigFile(
@@ -634,7 +635,7 @@ public:
 
   OptimizationConfig* getOptimizationConfig() const;
 
-private:
+ private:
   TrainerConfigPrivate* m;
   friend class Trainer;
 };
@@ -654,7 +655,7 @@ private:
  * @endcode
  */
 class UpdateCallback {
-public:
+ public:
   virtual ~UpdateCallback();
   virtual void apply(Parameter* p);
 };
@@ -664,14 +665,14 @@ class ParameterTraverseCallback {
   DISABLE_COPY(ParameterTraverseCallback);
   ParameterTraverseCallback();
 
-public:
+ public:
   ~ParameterTraverseCallback();
 
   void apply(const std::vector<Vector*>& vecs,
              const ParameterConfig& config,
              size_t sparseId);
 
-private:
+ private:
   ParameterTraverseCallbackPrivate* m;
   friend class ParameterOptimizer;
 };
@@ -686,7 +687,7 @@ class ParameterOptimizer {
   DISABLE_COPY(ParameterOptimizer);
   ParameterOptimizer();
 
-public:
+ public:
   static ParameterOptimizer* create(OptimizationConfig* config);
 
   ~ParameterOptimizer();
@@ -710,7 +711,7 @@ public:
   ParameterTraverseCallback* needSpecialTraversal(
       const ParameterConfig& config) const;
 
-private:
+ private:
   ParameterOptimizerPrivate* m;
 };
 
@@ -718,11 +719,11 @@ class SequenceGenerator;
 class Evaluator;
 struct GradientMachinePrivate;
 class GradientMachine {
-private:
+ private:
   GradientMachine();
   DISABLE_COPY(GradientMachine);
 
-public:
+ public:
   virtual ~GradientMachine();
 
   /**
@@ -817,7 +818,7 @@ public:
 
   void eval(Evaluator* evaluator);
 
-private:
+ private:
   GradientMachinePrivate* m;
 
   static GradientMachine* createFromPaddleModelPtr(
@@ -833,10 +834,10 @@ private:
 
 struct ParameterUpdaterPrivate;
 class ParameterUpdater {
-private:
+ private:
   ParameterUpdater();
 
-public:
+ public:
   static ParameterUpdater* createLocalUpdater(OptimizationConfig* config);
   static ParameterUpdater* createRemoteUpdater(OptimizationConfig* config,
                                                int passCount,
@@ -911,17 +912,17 @@ public:
    */
   void catchUpWith();
 
-private:
+ private:
   ParameterUpdaterPrivate* m;
 };
 
 struct EvaluatorPrivate;
 class Evaluator {
-private:
+ private:
   Evaluator();
   DISABLE_COPY(Evaluator);
 
-public:
+ public:
   ~Evaluator();
 
   /**
@@ -945,7 +946,7 @@ public:
 
   double getValue(const std::string name) const;
 
-private:
+ private:
   EvaluatorPrivate* m;
 
   friend class GradientMachine;
@@ -953,13 +954,13 @@ private:
 
 struct TrainerPrivate;
 class Trainer {
-private:
+ private:
   TrainerPrivate* m;
   Trainer();
   Trainer(TrainerConfig* optConfig, GradientMachine* gm);
   DISABLE_COPY(Trainer);
 
-public:
+ public:
   virtual ~Trainer();
 
   /// Create A Trainer By TrainerConfig. using paddle command line.
@@ -1002,7 +1003,7 @@ public:
 
 /// the N-Best results generated from one input sequence.
 class ISequenceResults {
-public:
+ public:
   virtual ~ISequenceResults();
 
   /// Number of result.
@@ -1026,7 +1027,7 @@ class SequenceGenerator {
   DISABLE_COPY(SequenceGenerator);
   SequenceGenerator();
 
-public:
+ public:
   virtual ~SequenceGenerator();
 
   /**
@@ -1044,10 +1045,10 @@ public:
   void setMaxLength(size_t maxlength);
   void setBeamSize(size_t beamSize);
 
-private:
+ private:
   static SequenceGenerator* createByGradientMachineSharedPtr(void* ptr);
   friend class GradientMachine;
 
-private:
+ private:
   SequenceGeneratorPrivate* m;
 };
diff --git a/paddle/api/SequenceGenerator.cpp b/paddle/api/SequenceGenerator.cpp
index 1b30aec8f6b6b73764886a7c7274be67851e4815..1446c3084238859a759669f3a32c7efde67dcc2b 100644
--- a/paddle/api/SequenceGenerator.cpp
+++ b/paddle/api/SequenceGenerator.cpp
@@ -138,7 +138,7 @@ struct SequenceGeneratorPrivate {
         maxLength(0UL),
         feedback(__create_feedback__()) {}
 
-private:
+ private:
   static paddle::Argument __create_feedback__() {
     paddle::Argument feedback;
     feedback.ids = paddle::IVector::create(/* size= */ 1, FLAGS_use_gpu);
@@ -157,7 +157,7 @@ SequenceGenerator::~SequenceGenerator() { delete m; }
 
 class PathSequenceResults : public ISequenceResults {
   // ISequenceResults interface
-public:
+ public:
   PathSequenceResults(const std::shared_ptr<std::vector<Path>>& path,
                       const std::shared_ptr<std::vector<std::string>>& dict)
       : path_(path), dict_(dict) {}
@@ -196,7 +196,7 @@ public:
     }
   }
 
-private:
+ private:
   std::shared_ptr<std::vector<Path>> path_;
   std::shared_ptr<std::vector<std::string>> dict_;
 };
diff --git a/paddle/capi/CMakeLists.txt b/paddle/capi/CMakeLists.txt
index e06e9a2b363d1ffc6876b98bcb7304b0a54dbcaa..957b1a3e6b07b058a76605992da387b43657146a 100644
--- a/paddle/capi/CMakeLists.txt
+++ b/paddle/capi/CMakeLists.txt
@@ -33,9 +33,6 @@ add_library(paddle_capi STATIC ${CAPI_HEADERS} ${CAPI_PRIVATE_HEADER}
 
 target_include_directories(paddle_capi PUBLIC ${CMAKE_CURRENT_BINARY_DIR})
 
-add_style_check_target(paddle_capi ${CAPI_SOURCES} ${CAPI_HEADER}
-  ${CAPI_PRIVATE_HEADER})
-
 add_dependencies(paddle_capi paddle_proto paddle_gserver)
 
 # TODO: paddle_capi_whole will be removed.
diff --git a/paddle/capi/gradient_machine.cpp b/paddle/capi/gradient_machine.cpp
index ea9aab00e3d05f1e2ef0c91eab93b67e0a3d5f37..8c3f504e5a2d807c0cc664af486ebab4a82ddec3 100644
--- a/paddle/capi/gradient_machine.cpp
+++ b/paddle/capi/gradient_machine.cpp
@@ -26,7 +26,7 @@ enum GradientMatchineCreateMode {
 namespace paddle {
 
 class MyNeuralNetwork : public NeuralNetwork {
-public:
+ public:
   MyNeuralNetwork(const std::string& name, NeuralNetwork* network)
       : NeuralNetwork(name, network) {}
 };
diff --git a/paddle/contrib/CMakeLists.txt b/paddle/contrib/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..4b19256ef4533a09162edf907f6cd51146517e46
--- /dev/null
+++ b/paddle/contrib/CMakeLists.txt
@@ -0,0 +1,16 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+add_subdirectory(inference)
diff --git a/contrib/float16/.gitignore b/paddle/contrib/float16/.gitignore
similarity index 100%
rename from contrib/float16/.gitignore
rename to paddle/contrib/float16/.gitignore
diff --git a/contrib/float16/float16_inference_report.md b/paddle/contrib/float16/README.md
similarity index 50%
rename from contrib/float16/float16_inference_report.md
rename to paddle/contrib/float16/README.md
index 67623a4d8d58b2ab84031ac4b290931ab540973b..58b4a50666bfb622af8acbce29355f2a4a870a82 100644
--- a/contrib/float16/float16_inference_report.md
+++ b/paddle/contrib/float16/README.md
@@ -1,33 +1,40 @@
+# Float16 Inference in PaddlePaddle Fluid
+
+Kexin Zhao <zhaokexin01@baidu.com>
+
 ## Introduction
-Working with deep neural networks (DNN) is a two-stage process. First we train DNN using labeled examples of inputs and desired outputs to obtain the model parameters (weights), then we deploy DNN along with the trained weights to run inference on unknown inputs. Typically, these weights are in float data type and hence we run inference in float mode using these weights. This post focuses on the discussion of how to use low precision float16 data type to represent these trained weights and run inference in float16 mode as well as the advantages of float16 inference over its float counterpart by showing some experiment results. 
+Deep learning is usually a two-stage work: training and inference. The training stage estimates model parameters (weights) from data.  The inference stage loads the weights and uses them to interpret inputs. Typically, weights are 32-bit float values (float32).  Some new devices, including NVIDIA Volta GPUs, support higher speed computation using 16-bit float values (float16).
+
+This article explains our efforts with PaddlePaddle to train using float32 and to inference using float16. We describe a [*transpiler*](https://github.com/PaddlePaddle/Paddle/blob/a4d3de0071e1f3912230c3ab3f9ac74cf06b093a/doc/fluid/design/motivation/fluid_compiler.md), which converts a PaddlePaddle Fluid model, which, to be precise, should be called a [Fluid *program*](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/program.md), into the inference program, and converts the weights from float32 into float16.
+
 
 ## What is float16?
-float16 (or FP16) is a half-precision floating-point format that uses 16 bits in memory to represent a value. The advantage over 32-bit single-precision floating-point format (commonly known as float data type) is that it requires half the storage and bandwidth at the expense of precision and range. Fortunately, DNN inference has high tolerance against the loss of precision and range when using float16 to represent the weights and the inference accuracy will only be minimally affected in most cases. This gives us the opportunity to use float16 data type to speedup the inference.
+float16 (or FP16) is a half-precision floating-point format that uses 16 bits in memory to represent a value. The advantage over 32-bit single-precision floating-point format (commonly known as float or float32 data type) is that it requires half the storage and bandwidth at the expense of precision and range. Fortunately, DNN inference has a high tolerance for the loss of precision and range when using float16 to represent the weights, and the inference accuracy will only be minimally affected in most cases, which gives us the opportunity to use float16 data type to speed up the inference.
 
 Interested readers can refer to our [design doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/data_type/float16.md) and [code](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/platform/float16.h) for more details on how we implement the float16 data type.
 
 ## Why float16?
-The trend in today's deep learning community is to use bigger and deeper model. This translates to larger memory footprint, higher computation demands, and as a result higher energy consumption on computing devices. The advantages of float16 over float are correspondingly three-fold:
+The trend in today's deep learning community is to use bigger and deeper model, which translates to larger memory footprint, higher computation demands, and as a result higher energy consumption on computing devices. The advantages of float16 over float32 are correspondingly three-fold:
 
-1. We only need half the memory size to load the same model using float16 representations. Moreover, most of the intermediate results generated during float16 inference are also of float16 data type. This makes the whole memory footprint of float16 inference roughly about half of its float counterpart. This is especially useful when deploying inference on mobile devices with limited available memory. Also given the same available memory, the maximum batch size for float16 inference is about twice that for float inference.
+1. We only need half the memory size to load the same model using float16 representations. Moreover, most of the intermediate results generated during float16 inference are also of the float16 data type. As a result, the whole memory footprint of float16 inference is roughly half of its float counterpart, which is especially useful when deploying inference on mobile devices with limited available memory. Also given the same available memory, the maximum batch size for float16 inference is about twice that for float inference.
 
-2. Because float16 occupies less memory than float, in theory hardware devices can achieve much higher floating point operators per second (FLOPS) for float16 data than float data. Right now, an outstanding example of hardware devices that actually deliver such advantages is Nvidia's latest Volta architecture GPUs, including Tesla V100 and Titan V. Moreover float16 takes less time to read from or write to memory and hence float16 can make inference more efficient especially in memory-bound applications where the performance is largely affected by how fast it is to read and write data.
+2. Because float16 occupies less memory than float, in theory, hardware devices can achieve much higher floating point operators per second (FLOPS) for float16 data than float data. Right now, NVIDIA's latest Volta GPUs, including Tesla V100 and Titan V, can deliver significantly higher FLOPS for float16 using Tensor Cores. Moreover, float16 takes less time to read from or write to memory, and hence float16 can make inference more efficient especially in memory-bound applications where the performance is mostly affected by how fast it is to read and write data.
 
-3. From the energy efficiency perspective, the energy needed to read, write, and compute float16 data is much less that its float counterpart, which can significantly reduce the battery power consumption on mobile devices or the total cost of ownership (TCO) of data centers.
+3. From the energy efficiency perspective, the energy needed to read, write, and compute float16 data is much less than its float counterpart, which can significantly reduce the battery power consumption on mobile devices or the total cost of ownership (TCO) of data centers.
 
 ## Fluid implementation of float16 inference
 ### Overview
 Fluid use [Program](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#program) instead of computation graph to describe a neural network model and the optimization procedure. Fluid program is a python wrapper around a protobuf message called [ProgramDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/program.md). Similar to programming languages, the basic structure of a Fluid program is some nested [blocks](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#block), where each block consists of some [variable](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#variable) definitions and a sequence of [operators](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#operator). An [executor](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/executor.md) will run a given program by sequentially executing the operators in the entrance block. 
 
 ### Basic requirement
-When an operator is run by an executor, it uses a kernel to perform computations on tensors contained in the input variables, and then write the results to the tensors in the output variables. Each operator has multiple kernels for different combinations of data types, devices, and library types, respectively. The operator will select the appropriate kernel to run based on, among other things, the data type of the input tensors. By default, every Fluid operator has a kernel for float data type that takes float inputs and generates float outputs.
+When an executor runs an operator, it uses a kernel to perform computations on tensors contained in the input variables, and then writes the results to the tensors in the output variables. Each operator has multiple kernels for different combinations of data types, devices, and library types, respectively. The operator will select the appropriate kernel to run based on, among other things, the data type of the input tensors. By default, every Fluid operator has a kernel for float data type that takes float inputs and generates float outputs.
 
-This means that if we provide float input to the first operator in a program, then each operator will use float kernel to compute float output and send it as input to the next operator to trigger its float kernel. This chain effect will makes the program run in float mode and gives us a final output of float data type. 
+If we provide float input to the first operator in a program, then each operator will use float kernel to compute float output and send it as input to the next operator to trigger its float kernel. This chain effect will make the program run in float mode and gives us a final output of float data type. 
 
-The same principle applies if we want a program to run in float16 mode. We provide input variable of float16 data type to the first operator and every subsequent operator will invoke the float16 kernel until we get the final output in float16 data type. So the preliminary requirements for float16 inference is to add float16 kernels to operators that are needed in a specific kind of neural networks. Our current focus is on Convolutional Neural Networks (CNN) and hence we have added float16 kernels to the following operators: convolution, pooling, GEMM, elementwise addition, batch norm, dropout, various activations including relu and tanh, and softmax.
+The same principle applies if we want a program to run in float16 mode. We provide input variable of the float16 data type to the first operator, and every subsequent operator will invoke the float16 kernel until we get the final output in float16. So the preliminary requirements for float16 inference are to add float16 kernels to operators that are needed in a specific kind of neural networks. Our current focus is on Convolutional Neural Networks (CNN) and hence we have added float16 kernels to the following operators: convolution, pooling, GEMM, elementwise addition, batch norm, dropout, various activations including relu and tanh, and softmax.
 
 ### float16 transpiler
-Furthermore, we need a float16 transpiler to achieve the following usage code:
+Furthermore, we need a transpiler to write float16 inference code similar to the following:
 
 ```python
 # Get the float32 inference program and load the associated float32 weights
@@ -64,14 +71,15 @@ fluid.io.save_inference_model(fp16_save_dirname, feed_target_names,
                               float16_inference_program)
 ```
 
-In this scenario, we already have a float32 inference program and some associated float32 weights that can do float32 inference. We can easily use the `transpile` method of the `Float16Transpiler` class to do certain modifications to the existing program and weights so that we have a new float16 program and the associated float16 weights.
+In this scenario, we already have a float32 inference program and some associated float32 weights. We can simply use the `transpile` method of the `Float16Transpiler` class to do certain modifications to the existing program and weights so that we have a new float16 program and the associated float16 weights.
 
-We can then run various inference experiments in float16 mode and save the float16 program and weights on disk for future deployment. To enhance the code usability, we maintain a consistent API so that user can use the same float32 input data to run inference program in either float32 and float16 mode and obtain output data both of float32 data type. This requires us to add some cast operators in the program to convert between float16 tensor and float32 tensor.
+We can then run various inference experiments in float16 mode and save the float16 program and weights on disk for future deployment. To enhance the code usability, we maintain a consistent API so that user can use the same float32 input data to run inference program in either float32 and float16 mode and obtain output data both of float32 data type. Consequently, we need to add cast operators in the float16 inference program for conversions between the float16 tensor and float32 tensor.
 
 The float16 transpiler is implemented to fulfill the requirements mentioned above. The details of the float16 transpiler can be found [here](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/data_type/float16.md#float16-inference).
 
 ### Experiment results
-We provide demo codes that can be used to reproduce the experiment results by doing:
+Simply running the following commands to reproduce the experiment results presented in this section:
+
 ```bash
 git clone https://github.com/PaddlePaddle/Paddle.git
 cd Paddle
@@ -81,11 +89,11 @@ cd Paddle
 # to `FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04` and similarly for other configurations
 nvidia-docker build -t paddle:float16 .
 # After running this, different results will be written to different log files in Paddle/contrib/float16/
-nvidia-docker run -it -v $PWD:/paddle paddle:float16 /paddle/contrib/float16/run_float16_demo.sh
+nvidia-docker run -it -v $PWD:/paddle paddle:float16 /paddle/paddle/contrib/float16/run_float16_demo.sh
 ```
 
-#### Correctness
-As is mentioned before, DNN inference has been found to be tolerant against the loss of precision and range incured by float16 and we want to see how good this tolerance is.
+#### Accuracy
+As is mentioned before, DNN inference has been found to be tolerant against the loss of precision and range incurred by float16, and we want to see how good this tolerance is.
 
 We train a resnet32 model using cifar10 data set, save it when test set accuracy is above 60%, and then test the inference accuracy on the 10000 examples of the cifar10 test set in float16 and float32 mode, respectively.
 
@@ -105,18 +113,18 @@ We repeat the test ten times and get the following results:
 | #10    | 62.53%  | 62.48%   |
 | average| 62.63%  | 62.62%   |
 
-We can see that the accuracy of float16 inference is very close to that of float32 inference in every experiment (within 0.05% difference) and is overall 0.01% better than its float32 counterpart averaged over 10 tests. 
+We can see that the accuracy of float16 inference is very close to that of float32 inference in every experiment (within 0.05% difference) and is overall 0.01% better than its float32 counterpart averaged over ten tests. 
 
 #### Performance benchmark
-Currently, Fluid inference in float16 mode is only supported on Nvidia GPU device. There is no motivation to support float16 inference on non-ARM CPUs because float16 is not natively supported there and float16 calculation will only be slower than its float counterpart. 
+Currently, Fluid only supports float16 inference on NVIDIA GPUs. There is no motivation to support float16 inference on non-ARM CPUs where float16 is not natively supported, and float16 calculation will only be slower than its float32 counterpart. 
 
-Nvidia started to support its native float16 data type (which has the same internal memory representation as Fluid float16 class) on CUDA 7.5. Moreover, float16 speedups on common computational intensive tasks including GEMM (general matrix-matrix multiplication) and convolution are supported since cublas 7.5 and cuDNN 5.0.
+NVIDIA started to support its native float16 data type (which has the same internal memory representation as Fluid's float16 class) on CUDA 7.5. Moreover, float16 speedups on computationally intensive tasks including GEMM (general matrix-matrix multiplication) and convolution are supported since cuBLAS 7.5 and cuDNN 5.0.
 
-Recently, the introduction of [tensor core](https://devblogs.nvidia.com/programming-tensor-cores-cuda-9/) in volta architecture GPUs and the support of tensor core calculation in CUDA 9.0 and cuDNN 7 make float16 truly superior to float in certain deep learning applications.
+Recently, the introduction of [Tensor Core](https://devblogs.nvidia.com/programming-tensor-cores-cuda-9/) in Volta architecture GPUs and the support of Tensor Core computation in CUDA 9.0 and cuDNN 7 make float16 genuinely superior to float in some deep learning applications.
 
-We thus benchmark the float16 inference performance on a single Nvidia Tesla V100 GPU (volta architecture and with tensor cores) and compare it with its float32 counterpart. All the following results are in ms (millisecond) averaged over 1000 mini-batches with respective to different mini-batch(mb) sizes.
+We thus benchmark the float16 inference performance on a single NVIDIA Tesla V100 GPU (Volta architecture and with Tensor Cores) and compare it with its float32 counterpart. All the following results are in ms (millisecond) averaged over 1000 mini-batches with respective to different mini-batch(mb) sizes.
 
-Average inference time for one mini-batch on Vgg16 model tested on imagenet data set:
+Average inference time for one mini-batch on Vgg16 model tested on ImageNet dataset:
 
 | total | mb=1  | mb=2  | mb=4  | mb=8  | mb=16  | mb=32 | mb=64  |
 |-------|-----: |-----: |-----: |-----: |------: |------:|-------:|
@@ -124,7 +132,7 @@ Average inference time for one mini-batch on Vgg16 model tested on imagenet data
 |float16|  3.32 | 4.11  |  5.88 |  9.41 | 16.54  | 30.47 |  60.23 |
 |Speedup|  4.22 | 2.36  |  3.91 |  3.00 |  3.26  |  2.77 |   2.97 |
 
-We can see that float16 inference provides 2x ~ 4x speedup on different batch sizes. 
+We can see that float16 inference provides **2x ~ 4x** speedup on different batch sizes. 
 
 Convolution operation is ususally the computational bottleneck of CNN, so we also check the average time spent on the Fluid convolution operators for one mini-batch as follows:
 
@@ -134,9 +142,9 @@ Convolution operation is ususally the computational bottleneck of CNN, so we als
 |float16|  1.78 | 2.10  |  2.93 |  4.55 |  7.99  | 14.63 |  28.67 |
 |Speedup|  6.71 | 3.31  |  6.37 |  4.71 |  5.18  |  4.14 |   4.54 |
 
-Fluid convolution operator uses cuDNN 7 to implement the kernel and we can see that with the help of tensor core, float16 convolution is significantly faster than its float32 counterpart, which makes the overall float16 inference performance much better.
+Fluid convolution operator uses cuDNN 7 to implement the kernel, and we can see that with the help of Tensor Core, float16 convolution is significantly faster than its float32 counterpart, which makes the overall float16 inference performance much better.
 
-Similarly, we also list the benchmark results of Resnet50 model tested on imagenet data set:
+Similarly, we also list the benchmark results of Resnet50 model tested on the ImageNet dataset:
 
 | total | mb=1  | mb=2  | mb=4  | mb=8  | mb=16  | mb=32 | mb=64  | mb=128 |
 |-------|-----: |-----: |-----: |-----: |------: |------:|-------:|-------:|
@@ -150,14 +158,14 @@ Similarly, we also list the benchmark results of Resnet50 model tested on imagen
 |float16| 4.19  | 4.30  | 3.96  | 4.21  |  5.63  |  8.77 | 15.24  | 28.40  |
 |Speedup| 1.30  | 1.27  | 1.64  | 1.99  |  2.45  |  2.79 |  2.70  |  2.59  |
 
-We find that the speedup provided by float16 inference starts relatively small at 1.15x for batch size 1 and gradually increase to about 2x for larger batch sizes. Similar trend can be found for the time spent on the convolution operator. Note that right now the tensor core will only be utilized in the convolution operation when certain dimentional requirements are met for the input data and filter. The speedup by float16 inference for Resnet50 is smaller than the Vgg16 counterpart partially because the convolution operation in Resnet is much simpler than the Vgg counterpart and this makes the tensor core less utilized in Resnet than in Vgg.
+We find that the speedup provided by float16 inference starts relatively small at 1.15x for batch size 1 and gradually increases to about 2x for larger batch sizes. A similar trend can be found for the time spent on the convolution operator. Note that right now Tensor Cores will only be utilized in the convolution operation when the input data and filter meet specific dimensional requirements. The speedup by float16 inference for Resnet50 is smaller than the Vgg16 counterpart partially because the convolution operation in Resnet is much simpler than its Vgg counterpart and this makes the tensor core less utilized in Resnet than in Vgg.
 
-We also did the same benchmark on a Nvidia GeForce GTX 1080 Ti GPU that does not support tensor core. The results show that for Vgg16, float16 inference provides consistent small speedup (around 1.15x) for all mini-batch sizes, while for Resnet50, float16 inference is slower than its float32 counterpart in small batch sizes (mb = 1 and 2) and then deliver around 1.15x speedup for all larger batch sizes. By comparing the benchmarks on 1080 Ti and V100, we find that tensor core, which is specialized for float16 computations, is a critical component for high performance float16 inference.
+We also did the same benchmark on a single NVIDIA GeForce GTX 1080 Ti GPU that does not support Tensor Core. The results show that for Vgg16, float16 inference provides consistent small speedup (around 1.15x) for all mini-batch sizes, while for Resnet50, float16 inference is slower than its float32 counterpart in small batch sizes (mb = 1 and 2) and then delivers around 1.15x speedup for all larger batch sizes. By comparing the benchmarks on 1080 Ti and V100, we find that Tensor Core, which is specialized for float16 computations, is a critical component of high performance float16 inference.
 
-Please refer to [here](https://github.com/PaddlePaddle/Paddle/blob/develop/contrib/float16/float16_benchmark.md) for comprehensive benchmark results.
+Please refer to [here](https://github.com/PaddlePaddle/Paddle/blob/develop/contrib/float16/float16_benchmark.md) for complete benchmark results.
 
 ### Summary
 1. Fluid is now able to run inference in float16 mode via a float16 transpiler. We currently support CNN programs, including Vgg and Resnet, to run in float16 inference mode.
-2. The accuracy of float16 inference is verified to be almost identical to the float32 counterpart at least on CNNs.
-3. float16 inference provides significant speedup on large and computationally intensive Vgg16 network on image net data set. For the much smaller and simpler Resnet50, the speedup provided by float16 inference is less significant than on Vgg16 but still favorable especially for large batch size.
-4. We cannot achieve the superior float16 inference performance without the help of the newly introduced tensor cores on the Nvidia Volta architecture GPUs.
+2. The accuracy of float16 inference is verified to be almost identical to its float32 counterpart at least on CNN models.
+3. float16 inference provides a significant speedup on large and computationally intensive Vgg16 model on ImageNet dataset. For the much smaller and simpler Resnet50 model, the speedup provided by float16 inference is less significant than for Vgg16 model but still favorable, especially for large batch sizes.
+4. We cannot achieve the superior float16 inference performance without the help of the newly introduced Tensor Cores on NVIDIA Volta architecture GPUs.
diff --git a/contrib/float16/float16_benchmark.md b/paddle/contrib/float16/float16_benchmark.md
similarity index 100%
rename from contrib/float16/float16_benchmark.md
rename to paddle/contrib/float16/float16_benchmark.md
diff --git a/contrib/float16/float16_inference_demo.py b/paddle/contrib/float16/float16_inference_demo.py
similarity index 100%
rename from contrib/float16/float16_inference_demo.py
rename to paddle/contrib/float16/float16_inference_demo.py
diff --git a/contrib/float16/float16_transpiler.py b/paddle/contrib/float16/float16_transpiler.py
similarity index 100%
rename from contrib/float16/float16_transpiler.py
rename to paddle/contrib/float16/float16_transpiler.py
diff --git a/contrib/float16/run_float16_demo.sh b/paddle/contrib/float16/run_float16_demo.sh
similarity index 95%
rename from contrib/float16/run_float16_demo.sh
rename to paddle/contrib/float16/run_float16_demo.sh
index d8a34ee67b8fab214fa6e96104304689211f84da..031225a85dabb26e5d9ea06f58909c049e7f0c08 100755
--- a/contrib/float16/run_float16_demo.sh
+++ b/paddle/contrib/float16/run_float16_demo.sh
@@ -3,7 +3,7 @@
 BUILD_PATH=/paddle/fp16_build
 WHEEL_PATH=$BUILD_PATH/python/dist
 INFER_PATH=$BUILD_PATH/paddle/fluid/inference/tests/book
-DEMO_PATH=/paddle/contrib/float16
+DEMO_PATH=/paddle/paddle/contrib/float16
 
 # Use the single most powerful CUDA GPU on your machine
 export CUDA_VISIBLE_DEVICES=0
@@ -50,7 +50,6 @@ do
          --repeat=1 \
 
   $INFER_PATH/test_inference_image_classification_vgg \
-      --data_set=imagenet \
       --dirname=$DEMO_PATH/image_classification_imagenet_vgg.inference.model \
       --fp16_dirname=$DEMO_PATH/float16_image_classification_imagenet_vgg.inference.model \
       --repeat=$REPEAT \
@@ -68,7 +67,6 @@ do
          --repeat=1 \
 
   $INFER_PATH/test_inference_image_classification_resnet \
-      --data_set=imagenet \
       --dirname=$DEMO_PATH/image_classification_imagenet_resnet.inference.model \
       --fp16_dirname=$DEMO_PATH/float16_image_classification_imagenet_resnet.inference.model \
       --repeat=$REPEAT \
@@ -86,7 +84,6 @@ do
          --repeat=1 \
 
   $INFER_PATH/test_inference_image_classification_vgg \
-      --data_set=cifar10 \
       --dirname=$DEMO_PATH/image_classification_cifar10_vgg.inference.model \
       --fp16_dirname=$DEMO_PATH/float16_image_classification_cifar10_vgg.inference.model \
       --repeat=$REPEAT \
@@ -104,7 +101,6 @@ do
          --repeat=1 \
 
   $INFER_PATH/test_inference_image_classification_vgg \
-      --data_set=cifar10 \
       --dirname=$DEMO_PATH/image_classification_cifar10_resnet.inference.model \
       --fp16_dirname=$DEMO_PATH/float16_image_classification_cifar10_resnet.inference.model \
       --repeat=$REPEAT \
diff --git a/paddle/contrib/inference/CMakeLists.txt b/paddle/contrib/inference/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..f279020e9334323ebdf3125a8833044cd9eccae5
--- /dev/null
+++ b/paddle/contrib/inference/CMakeLists.txt
@@ -0,0 +1,99 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+if(APPLE)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=pessimizing-move")
+endif(APPLE)
+
+set(ANAKIN_INCLUDE "" CACHE STRING "root of Anakin header files")
+set(ANAKIN_LIBRARY "" CACHE STRING "path of Anakin library")
+
+
+set(inference_deps paddle_inference_api paddle_fluid_api)
+
+# if anakin is set enable anakin api implementation
+if(ANAKIN_INCLUDE AND ANAKIN_LIBRARY)
+    set(ANAKIN_FOUND ON)
+else()
+    set(ANAKIN_FOUND OFF)
+endif()
+
+function(fetch_include_recursively root_dir) 
+    if (IS_DIRECTORY ${root_dir}) 
+        include_directories(${root_dir})
+    endif()
+
+    file(GLOB ALL_SUB RELATIVE ${root_dir} ${root_dir}/*)
+    foreach(sub ${ALL_SUB})
+        if (IS_DIRECTORY ${root_dir}/${sub})
+            fetch_include_recursively(${root_dir}/${sub})
+        endif()
+    endforeach()
+endfunction()
+
+if (ANAKIN_FOUND)
+    # Anakin's code style doesn't follow google c style.
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=unused-variable -Wno-error=format-extra-args -Wno-error=comment -Wno-error=format -Wno-error=switch -Wno-error=return-type -Wno-error=non-virtual-dtor -Wno-reorder -Wno-error=cpp")
+
+    message(STATUS "Anakin for inference is enabled")
+    message(STATUS "Anakin is set INCLUDE:${ANAKIN_INCLUDE} LIBRARY:${ANAKIN_LIBRARY}")
+    fetch_include_recursively(${ANAKIN_INCLUDE})
+
+    link_directories(${ANAKIN_LIBRARY})
+
+    nv_library(inference_anakin_api SHARED SRCS paddle_inference_api.cc paddle_inference_api_anakin_engine.cc)
+    target_link_libraries(inference_anakin_api anakin anakin_saber_common)
+    list(APPEND inference_deps inference_anakin_api)
+endif()
+
+
+function(inference_api_test TARGET_NAME)
+    if (WITH_TESTING)
+        set(options "")
+        set(oneValueArgs "")
+        set(multiValueArgs ARGS)
+        cmake_parse_arguments(inference_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+
+        set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests)
+        cc_test(${TARGET_NAME}
+                SRCS ${TARGET_NAME}.cc
+                DEPS "${inference_deps}"
+                ARGS --dirname=${PYTHON_TESTS_DIR}/book/)
+        if(inference_test_ARGS)
+            set_tests_properties(${TARGET_NAME}
+                    PROPERTIES DEPENDS "${inference_test_ARGS}")
+        endif()
+    endif(WITH_TESTING)
+endfunction(inference_api_test)
+
+cc_library(paddle_inference_api
+    SRCS paddle_inference_api.cc paddle_inference_api_impl.cc 
+    DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
+
+cc_test(test_paddle_inference_api
+        SRCS test_paddle_inference_api.cc
+        DEPS paddle_inference_api)
+
+inference_api_test(test_paddle_inference_api_impl
+                    ARGS test_word2vec test_image_classification)
+
+if (ANAKIN_FOUND)
+    cc_test(inference_anakin_test SRCS paddle_inference_api_anakin_engine_tester.cc
+    DEPS ${inference_deps})
+endif()
+
+if(WITH_TESTING)
+    add_subdirectory(demo)
+endif()
diff --git a/paddle/contrib/inference/README.md b/paddle/contrib/inference/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..20969fac6c8f894ffb4a02b48f795e2a0dcbd096
--- /dev/null
+++ b/paddle/contrib/inference/README.md
@@ -0,0 +1,27 @@
+# Embed Paddle Inference in Your Application
+
+Paddle inference offers the APIs in `C` and `C++` languages.
+
+One can easily deploy a model trained by Paddle following the steps as below:
+
+1. Optimize the native model;
+2. Write some codes for deployment.
+
+
+Let's explain the steps in detail.
+
+## Optimize the native Fluid Model
+
+The native model that get from the training phase needs to be optimized for that.
+
+- Clean the noise such as the cost operators that do not need inference;
+- Prune unnecessary computation fork that has nothing to do with the output;
+- Remove extraneous variables;
+- Memory reuse for native Fluid executor;
+- Translate the model storage format to some third-party engine's, so that the inference API can utilize the engine for acceleration;
+
+We have an official tool to do the optimization, call `paddle_inference_optimize --help` for more information.
+
+## Write some codes
+
+Read `paddle_inference_api.h` for more information.
diff --git a/paddle/contrib/inference/demo/CMakeLists.txt b/paddle/contrib/inference/demo/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..7b0fa77ad13c19f177e5b2446bcda6551471e45f
--- /dev/null
+++ b/paddle/contrib/inference/demo/CMakeLists.txt
@@ -0,0 +1,16 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+inference_api_test(simple_on_word2vec ARGS test_word2vec)
diff --git a/paddle/contrib/inference/demo/simple_on_word2vec.cc b/paddle/contrib/inference/demo/simple_on_word2vec.cc
new file mode 100644
index 0000000000000000000000000000000000000000..192a6414260ce06048b8c765402d89882cabc51b
--- /dev/null
+++ b/paddle/contrib/inference/demo/simple_on_word2vec.cc
@@ -0,0 +1,128 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+/*
+ * This file contains a simple demo for how to take a model for inference.
+ */
+
+#include <glog/logging.h>
+#include <gtest/gtest.h>
+#include <memory>
+#include <thread>
+#include "paddle/contrib/inference/paddle_inference_api.h"
+namespace paddle {
+namespace demo {
+
+DEFINE_string(dirname, "", "Directory of the inference model.");
+
+void Main(bool use_gpu) {
+  //# 1. Create PaddlePredictor with a config.
+  NativeConfig config;
+  config.model_dir = FLAGS_dirname + "word2vec.inference.model";
+  config.use_gpu = use_gpu;
+  config.fraction_of_gpu_memory = 0.15;
+  config.device = 0;
+  auto predictor =
+      CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
+
+  for (int batch_id = 0; batch_id < 3; batch_id++) {
+    //# 2. Prepare input.
+    int64_t data[4] = {1, 2, 3, 4};
+
+    PaddleBuf buf{.data = data, .length = sizeof(data)};
+    PaddleTensor tensor{.name = "",
+                        .shape = std::vector<int>({4, 1}),
+                        .data = buf,
+                        .dtype = PaddleDType::INT64};
+
+    // For simplicity, we set all the slots with the same data.
+    std::vector<PaddleTensor> slots(4, tensor);
+
+    //# 3. Run
+    std::vector<PaddleTensor> outputs;
+    CHECK(predictor->Run(slots, &outputs));
+
+    //# 4. Get output.
+    ASSERT_EQ(outputs.size(), 1UL);
+    LOG(INFO) << "output buffer size: " << outputs.front().data.length;
+    const size_t num_elements = outputs.front().data.length / sizeof(float);
+    // The outputs' buffers are in CPU memory.
+    for (size_t i = 0; i < std::min(5UL, num_elements); i++) {
+      LOG(INFO) << static_cast<float*>(outputs.front().data.data)[i];
+    }
+    // TODO(Superjomn): this is should be free automatically
+    free(outputs[0].data.data);
+  }
+}
+
+void MainThreads(int num_threads, bool use_gpu) {
+  // Multi-threads only support on CPU
+  // 0. Create PaddlePredictor with a config.
+  NativeConfig config;
+  config.model_dir = FLAGS_dirname + "word2vec.inference.model";
+  config.use_gpu = use_gpu;
+  config.fraction_of_gpu_memory = 0.15;
+  config.device = 0;
+  auto main_predictor =
+      CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
+
+  std::vector<std::thread> threads;
+  for (int tid = 0; tid < num_threads; ++tid) {
+    threads.emplace_back([&, tid]() {
+      // 1. clone a predictor which shares the same parameters
+      auto predictor = main_predictor->Clone();
+      constexpr int num_batches = 3;
+      for (int batch_id = 0; batch_id < num_batches; ++batch_id) {
+        // 2. Dummy Input Data
+        int64_t data[4] = {1, 2, 3, 4};
+        PaddleBuf buf{.data = data, .length = sizeof(data)};
+        PaddleTensor tensor{.name = "",
+                            .shape = std::vector<int>({4, 1}),
+                            .data = buf,
+                            .dtype = PaddleDType::INT64};
+        std::vector<PaddleTensor> inputs(4, tensor);
+        std::vector<PaddleTensor> outputs;
+        // 3. Run
+        CHECK(predictor->Run(inputs, &outputs));
+
+        // 4. Get output.
+        ASSERT_EQ(outputs.size(), 1UL);
+        LOG(INFO) << "TID: " << tid << ", "
+                  << "output buffer size: " << outputs.front().data.length;
+        const size_t num_elements = outputs.front().data.length / sizeof(float);
+        // The outputs' buffers are in CPU memory.
+        for (size_t i = 0; i < std::min(5UL, num_elements); i++) {
+          LOG(INFO) << static_cast<float*>(outputs.front().data.data)[i];
+        }
+        free(outputs[0].data.data);
+      }
+    });
+  }
+  for (int i = 0; i < num_threads; ++i) {
+    threads[i].join();
+  }
+}
+
+TEST(demo, word2vec_cpu) { Main(false /*use_gpu*/); }
+TEST(demo_multi_threads, word2vec_cpu_1) { MainThreads(1, false /*use_gpu*/); }
+TEST(demo_multi_threads, word2vec_cpu_4) { MainThreads(4, false /*use_gpu*/); }
+
+#ifdef PADDLE_WITH_CUDA
+TEST(demo, word2vec_gpu) { Main(true /*use_gpu*/); }
+TEST(demo_multi_threads, word2vec_gpu_1) { MainThreads(1, true /*use_gpu*/); }
+TEST(demo_multi_threads, word2vec_gpu_4) { MainThreads(4, true /*use_gpu*/); }
+#endif
+
+}  // namespace demo
+}  // namespace paddle
diff --git a/paddle/contrib/inference/paddle_inference_api.cc b/paddle/contrib/inference/paddle_inference_api.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d67e1e7667800d6dd00cb8915b0d6dc7c664970b
--- /dev/null
+++ b/paddle/contrib/inference/paddle_inference_api.cc
@@ -0,0 +1,15 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/contrib/inference/paddle_inference_api.h"
diff --git a/paddle/contrib/inference/paddle_inference_api.h b/paddle/contrib/inference/paddle_inference_api.h
new file mode 100644
index 0000000000000000000000000000000000000000..77e2d77b6b7fe3eeed865c8de0818d059cfa6c6e
--- /dev/null
+++ b/paddle/contrib/inference/paddle_inference_api.h
@@ -0,0 +1,116 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+/*
+ * This file contains the definition of a simple Inference API for Paddle.
+ *
+ * ATTENTION: It requires some C++ features, for lower version C++ or C, we
+ * might release another API.
+ */
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace paddle {
+
+enum PaddleDType {
+  FLOAT32,
+  INT64,
+};
+
+struct PaddleBuf {
+  void* data;     // pointer to the data memory.
+  size_t length;  // number of memory bytes.
+};
+
+struct PaddleTensor {
+  std::string name;  // variable name.
+  std::vector<int> shape;
+  // TODO(Superjomn) for LoD support, add a vector<vector<int>> field if needed.
+  PaddleBuf data;  // blob of data.
+  PaddleDType dtype;
+};
+
+enum class PaddleEngineKind {
+  kNative = 0,  // Use the native Fluid facility.
+  kAnakin,      // Use Anakin for inference.
+  // TODO(Superjomn) support following engines latter.
+  // kTensorRT,           // Use TensorRT for inference.
+  // kAutoMixedAnakin,    // Automatically mix Fluid with Anakin.
+  // kAutoMixedTensorRT,  // Automatically mix Fluid with TensorRT.
+};
+
+/*
+ * A simple Inference API for Paddle. Currently this API can be used by
+ * non-sequence scenerios.
+ */
+class PaddlePredictor {
+ public:
+  struct Config;
+  PaddlePredictor() = default;
+  PaddlePredictor(const PaddlePredictor&) = delete;
+  PaddlePredictor& operator=(const PaddlePredictor&) = delete;
+
+  // Predict an record.
+  // The caller should be responsible for allocating and releasing the memory of
+  // `inputs`. `inputs` should be alive until Run returns. caller should be
+  // responsible for releasing the memory of `output_data`.
+  virtual bool Run(const std::vector<PaddleTensor>& inputs,
+                   std::vector<PaddleTensor>* output_data) = 0;
+
+  // Clone a predictor that share the model weights, the Cloned predictor should
+  // be thread-safe.
+  virtual std::unique_ptr<PaddlePredictor> Clone() = 0;
+
+  // Destroy the Predictor.
+  virtual ~PaddlePredictor() = default;
+
+  // The common configs for all the predictors.
+  struct Config {
+    std::string model_dir;      // path to the model directory.
+    bool enable_engine{false};  // Enable to execute (part of) the model on
+  };
+};
+
+struct NativeConfig : public PaddlePredictor::Config {
+  // GPU related fields.
+  bool use_gpu{false};
+  int device{0};
+  float fraction_of_gpu_memory{-1.f};  // Negative to notify initialization.
+
+  std::string prog_file;
+  std::string param_file;
+};
+
+// Configurations for Anakin engine.
+struct AnakinConfig : public PaddlePredictor::Config {
+  int device;
+  std::string model_file;
+  int max_batch_size{-1};
+};
+
+// A factory to help create different predictors.
+//
+// FOR EXTENSION DEVELOPER:
+// Different predictors are designated by config type and engine kind. Similar
+// configs can be merged, but there shouldn't be a huge config containing
+// different fields for more than one kind of predictors.
+//
+// Similarly, each engine kind should map to a unique predictor implementation.
+template <typename ConfigT, PaddleEngineKind engine = PaddleEngineKind::kNative>
+std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT& config);
+}  // namespace paddle
diff --git a/paddle/contrib/inference/paddle_inference_api_anakin_engine.cc b/paddle/contrib/inference/paddle_inference_api_anakin_engine.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ea7781f691da81befd5d11c226c35e1da79baaaa
--- /dev/null
+++ b/paddle/contrib/inference/paddle_inference_api_anakin_engine.cc
@@ -0,0 +1,114 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <cuda.h>
+
+#include "paddle/contrib/inference/paddle_inference_api_anakin_engine.h"
+
+namespace paddle {
+
+PaddleInferenceAnakinPredictor::PaddleInferenceAnakinPredictor(
+    const AnakinConfig &config) {
+  CHECK(Init(config));
+}
+
+bool PaddleInferenceAnakinPredictor::Init(const AnakinConfig &config) {
+  if (!(graph_.load(config.model_file))) {
+    return false;
+  }
+  graph_.ResetBatchSize("input_0", config.max_batch_size);
+  // optimization for graph
+  if (!(graph_.Optimize())) {
+    return false;
+  }
+  // construct executer
+  executor_.init(graph_);
+  return true;
+}
+
+bool PaddleInferenceAnakinPredictor::Run(
+    const std::vector<PaddleTensor> &inputs,
+    std::vector<PaddleTensor> *output_data) {
+  for (const auto &input : inputs) {
+    if (input.dtype != PaddleDType::FLOAT32) {
+      LOG(ERROR) << "Only support float type inputs. " << input.name
+                 << "'s type is not float";
+      return false;
+    }
+    auto d_tensor_in_p = executor_.get_in(input.name);
+    float *d_data_p = d_tensor_in_p->mutable_data();
+    if (cudaMemcpy(d_data_p,
+                   static_cast<float *>(input.data.data),
+                   d_tensor_in_p->valid_size() * sizeof(float),
+                   cudaMemcpyHostToDevice) != 0) {
+      LOG(ERROR) << "copy data from CPU to GPU error";
+      return false;
+    }
+  }
+
+  executor_.prediction();
+
+  if (output_data->empty()) {
+    LOG(ERROR) << "At least one output should be set with tensors' names.";
+    return false;
+  }
+  for (auto &output : *output_data) {
+    auto *tensor = executor_.get_out(output.name);
+    output.shape = tensor->shape();
+    // Copy data from GPU -> CPU
+    if (cudaMemcpy(output.data.data,
+                   tensor->mutable_data(),
+                   tensor->valid_size() * sizeof(float),
+                   cudaMemcpyDeviceToHost) != 0) {
+      LOG(ERROR) << "copy data from GPU to CPU error";
+      return false;
+    }
+  }
+  return true;
+}
+
+anakin::Net<anakin::NV, anakin::saber::AK_FLOAT, anakin::Precision::FP32>
+    &PaddleInferenceAnakinPredictor::get_executer() {
+  return executor_;
+}
+
+// the cloned new Predictor of anakin share the same net weights from original
+// Predictor
+std::unique_ptr<PaddlePredictor> PaddleInferenceAnakinPredictor::Clone() {
+  VLOG(3) << "Anakin Predictor::clone";
+  std::unique_ptr<PaddlePredictor> cls(new PaddleInferenceAnakinPredictor());
+  // construct executer from other graph
+  auto anakin_predictor_p =
+      dynamic_cast<PaddleInferenceAnakinPredictor *>(cls.get());
+  if (!anakin_predictor_p) {
+    LOG(ERROR) << "fail to call Init";
+    return nullptr;
+  }
+  anakin_predictor_p->get_executer().init(graph_);
+
+  return std::move(cls);
+}
+
+// A factory to help create difference predictor.
+template <>
+std::unique_ptr<PaddlePredictor>
+CreatePaddlePredictor<AnakinConfig, PaddleEngineKind::kAnakin>(
+    const AnakinConfig &config) {
+  VLOG(3) << "Anakin Predictor create.";
+  std::unique_ptr<PaddlePredictor> x(
+      new PaddleInferenceAnakinPredictor(config));
+  return x;
+};
+
+}  // namespace paddle
diff --git a/paddle/contrib/inference/paddle_inference_api_anakin_engine.h b/paddle/contrib/inference/paddle_inference_api_anakin_engine.h
new file mode 100644
index 0000000000000000000000000000000000000000..181784cbdf91fe2f50e20f4d447448a42a18d301
--- /dev/null
+++ b/paddle/contrib/inference/paddle_inference_api_anakin_engine.h
@@ -0,0 +1,61 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+/*
+ * This file contains the implementation of inference API with Anakin engine
+ * embeded, this API can only support Anakin models.
+ */
+
+#pragma once
+
+// NOTE This header file do not have namespace.
+//#include <test/framework/net/paddle_api.h>
+#include "paddle/contrib/inference/paddle_inference_api.h"
+
+#include "framework/core/net/net.h"
+#include "saber/saber_types.h"
+
+namespace paddle {
+
+class PaddleInferenceAnakinPredictor : public PaddlePredictor {
+ public:
+  PaddleInferenceAnakinPredictor() {}
+
+  PaddleInferenceAnakinPredictor(const AnakinConfig& config);
+
+  // NOTE Unlike the native engine, the buffers of anakin engine's output_data
+  // should be allocated first.
+  bool Run(const std::vector<PaddleTensor>& inputs,
+           std::vector<PaddleTensor>* output_data) override;
+
+  std::unique_ptr<PaddlePredictor> Clone() override;
+
+  anakin::Net<anakin::NV, anakin::saber::AK_FLOAT, anakin::Precision::FP32>&
+  get_executer();
+
+  ~PaddleInferenceAnakinPredictor() override{};
+
+ private:
+  bool Init(const AnakinConfig& config);
+
+  anakin::graph::Graph<anakin::NV,
+                       anakin::saber::AK_FLOAT,
+                       anakin::Precision::FP32>
+      graph_;
+  anakin::Net<anakin::NV, anakin::saber::AK_FLOAT, anakin::Precision::FP32>
+      executor_;
+  AnakinConfig config_;
+};
+
+}  // namespace paddle
diff --git a/paddle/contrib/inference/paddle_inference_api_anakin_engine_tester.cc b/paddle/contrib/inference/paddle_inference_api_anakin_engine_tester.cc
new file mode 100644
index 0000000000000000000000000000000000000000..47b9c6fa285b623d2b08f45917cb3474dbc2ab83
--- /dev/null
+++ b/paddle/contrib/inference/paddle_inference_api_anakin_engine_tester.cc
@@ -0,0 +1,65 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <glog/logging.h>
+#include <gtest/gtest.h>
+
+#include "gflags/gflags.h"
+#include "paddle/contrib/inference/paddle_inference_api.h"
+
+namespace paddle {
+
+AnakinConfig GetConfig() {
+  AnakinConfig config;
+  config.model_file = "./mobilenet_v2.anakin.bin";
+  config.device = 0;
+  config.max_batch_size = 1;
+  return config;
+}
+
+TEST(inference, anakin) {
+  AnakinConfig config = GetConfig();
+  auto predictor =
+      CreatePaddlePredictor<AnakinConfig, PaddleEngineKind::kAnakin>(config);
+
+  float data[1 * 3 * 224 * 224] = {1.0f};
+
+  PaddleBuf buf{.data = data, .length = sizeof(data)};
+  PaddleTensor tensor{.name = "input_0",
+                      .shape = std::vector<int>({1, 3, 224, 224}),
+                      .data = buf,
+                      .dtype = PaddleDType::FLOAT32};
+
+  // For simplicity, we set all the slots with the same data.
+  std::vector<PaddleTensor> paddle_tensor_feeds(1, tensor);
+
+  float data_out[1000];
+
+  PaddleBuf buf_out{.data = data_out, .length = sizeof(data)};
+  PaddleTensor tensor_out{.name = "prob_out",
+                          .shape = std::vector<int>({1000, 1}),
+                          .data = buf_out,
+                          .dtype = PaddleDType::FLOAT32};
+
+  std::vector<PaddleTensor> outputs(1, tensor_out);
+
+  ASSERT_TRUE(predictor->Run(paddle_tensor_feeds, &outputs));
+
+  float* data_o = static_cast<float*>(outputs[0].data.data);
+  for (size_t j = 0; j < 1000; ++j) {
+    LOG(INFO) << "output[" << j << "]: " << data_o[j];
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/contrib/inference/paddle_inference_api_impl.cc b/paddle/contrib/inference/paddle_inference_api_impl.cc
new file mode 100644
index 0000000000000000000000000000000000000000..bda2981a14482e2c4a29773d37b074506cc344b1
--- /dev/null
+++ b/paddle/contrib/inference/paddle_inference_api_impl.cc
@@ -0,0 +1,285 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <sys/time.h>
+#include <algorithm>
+#include <map>
+#include <set>
+#include <sstream>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "paddle/contrib/inference/paddle_inference_api_impl.h"
+
+namespace paddle {
+namespace {
+
+// Timer for timer
+class Timer {
+ public:
+  double start;
+  double startu;
+  void tic() {
+    struct timeval tp;
+    gettimeofday(&tp, NULL);
+    start = tp.tv_sec;
+    startu = tp.tv_usec;
+  }
+  double toc() {
+    struct timeval tp;
+    gettimeofday(&tp, NULL);
+    double used_time_ms =
+        (tp.tv_sec - start) * 1000.0 + (tp.tv_usec - startu) / 1000.0;
+    return used_time_ms;
+  }
+};
+
+template <class T>
+std::string num2str(T a) {
+  std::stringstream istr;
+  istr << a;
+  return istr.str();
+}
+}  // namespace
+
+bool NativePaddlePredictor::Init(
+    std::shared_ptr<framework::Scope> parent_scope) {
+  VLOG(3) << "Predictor::init()";
+
+  if (config_.use_gpu) {
+    place_ = paddle::platform::CUDAPlace(config_.device);
+  } else {
+    place_ = paddle::platform::CPUPlace();
+  }
+  if (parent_scope) {
+    scope_ = parent_scope;
+    sub_scope_ = &(parent_scope->NewScope());
+  } else {
+    paddle::framework::InitDevices(false);
+    scope_.reset(new paddle::framework::Scope());
+  }
+
+  executor_.reset(new paddle::framework::Executor(place_));
+
+  // Initialize the inference program
+  if (!config_.model_dir.empty()) {
+    // Parameters are saved in separate files sited in
+    // the specified `dirname`.
+    inference_program_ = paddle::inference::Load(
+        executor_.get(), scope_.get(), config_.model_dir);
+  } else if (!config_.prog_file.empty() && !config_.param_file.empty()) {
+    // All parameters are saved in a single file.
+    // The file names should be consistent with that used
+    // in Python API `fluid.io.save_inference_model`.
+    inference_program_ = paddle::inference::Load(
+        executor_.get(), scope_.get(), config_.prog_file, config_.param_file);
+  } else {
+    LOG(ERROR) << "fail to load inference model.";
+    return false;
+  }
+  ctx_ = executor_->Prepare(*inference_program_, 0);
+  executor_->CreateVariables(
+      *inference_program_, sub_scope_ ? sub_scope_ : scope_.get(), 0);
+
+  // Get the feed_target_names and fetch_target_names
+  feed_target_names_ = inference_program_->GetFeedTargetNames();
+  fetch_target_names_ = inference_program_->GetFetchTargetNames();
+  return true;
+}
+
+NativePaddlePredictor::~NativePaddlePredictor() {
+  if (sub_scope_) {
+    PADDLE_ENFORCE_NOT_NULL(scope_, "Should have parent scope!");
+    scope_->DeleteScope(sub_scope_);
+  }
+};
+
+bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs,
+                                std::vector<PaddleTensor> *output_data) {
+  VLOG(3) << "Predictor::predict";
+  Timer timer;
+  timer.tic();
+  // set feed variable
+  std::map<std::string, const framework::LoDTensor *> feed_targets;
+  std::vector<framework::LoDTensor> feeds;
+  if (!SetFeed(inputs, &feeds)) {
+    LOG(ERROR) << "fail to set feed";
+    return false;
+  }
+  for (size_t i = 0; i < feed_target_names_.size(); ++i) {
+    feed_targets[feed_target_names_[i]] = &feeds[i];
+  }
+  // get fetch variable
+  std::map<std::string, framework::LoDTensor *> fetch_targets;
+  std::vector<framework::LoDTensor> fetchs;
+  fetchs.resize(fetch_target_names_.size());
+  for (size_t i = 0; i < fetch_target_names_.size(); ++i) {
+    fetch_targets[fetch_target_names_[i]] = &fetchs[i];
+  }
+  // Run the inference program
+  // if share variables, we need not create variables
+  executor_->RunPreparedContext(
+      ctx_.get(),
+      sub_scope_ != nullptr ? sub_scope_ : scope_.get(),
+      &feed_targets,
+      &fetch_targets,
+      false /* don't create variable eatch time */);
+  if (!GetFetch(fetchs, output_data)) {
+    LOG(ERROR) << "fail to get fetchs";
+    return false;
+  }
+  VLOG(3) << "predict cost: " << timer.toc() << "ms";
+  return true;
+}
+
+std::unique_ptr<PaddlePredictor> NativePaddlePredictor::Clone() {
+  VLOG(3) << "Predictor::clone";
+  std::unique_ptr<PaddlePredictor> cls(new NativePaddlePredictor(config_));
+
+  if (!dynamic_cast<NativePaddlePredictor *>(cls.get())->Init(scope_)) {
+    LOG(ERROR) << "fail to call Init";
+    return nullptr;
+  }
+  // fix manylinux compile error.
+  return std::move(cls);
+}
+
+bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
+                                    std::vector<framework::LoDTensor> *feeds) {
+  VLOG(3) << "Predictor::set_feed";
+  if (inputs.size() != feed_target_names_.size()) {
+    LOG(ERROR) << "wrong feed input size.";
+    return false;
+  }
+  for (size_t i = 0; i < feed_target_names_.size(); ++i) {
+    framework::LoDTensor input;
+    framework::DDim ddim = framework::make_ddim(inputs[i].shape);
+    void *input_ptr;
+    if (inputs[i].dtype == PaddleDType::INT64) {
+      input_ptr = input.mutable_data<int64_t>(ddim, platform::CPUPlace());
+    } else if (inputs[i].dtype == PaddleDType::FLOAT32) {
+      input_ptr = input.mutable_data<float>(ddim, platform::CPUPlace());
+    } else {
+      LOG(ERROR) << "unsupported feed type " << inputs[i].dtype;
+      return false;
+    }
+
+    // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
+    std::memcpy(static_cast<void *>(input_ptr),
+                inputs[i].data.data,
+                inputs[i].data.length);
+    feeds->push_back(input);
+  }
+  return true;
+}
+
+bool NativePaddlePredictor::GetFetch(
+    const std::vector<framework::LoDTensor> &fetchs,
+    std::vector<PaddleTensor> *outputs) {
+  VLOG(3) << "Predictor::get_fetch";
+  outputs->resize(fetchs.size());
+  for (size_t i = 0; i < fetchs.size(); ++i) {
+    // TODO(panyx0718): Support fetch of other types.
+    if (fetchs[i].type() != typeid(float)) {
+      LOG(ERROR) << "only support fetching float now.";
+      return false;
+    }
+    std::vector<int> shape;
+    auto dims_i = fetchs[i].dims();
+    auto lod = fetchs[i].lod();
+    const float *output_ptr = fetchs[i].data<float>();
+    // const int64_t* output_ptr = fetchs[i].data<int64_t>();
+    auto num = fetchs[i].numel();
+    std::vector<float> data;
+    if (0 == lod.size()) {
+      std::copy(output_ptr, output_ptr + num, std::back_inserter(data));
+      for (int j = 0; j < dims_i.size(); ++j) {
+        shape.push_back(dims_i[j]);
+      }
+    } else {
+      // for batch detection
+      // image[0] -> output[0] shape {145, 6}
+      // image[1] -> output[1] shape {176, 6}
+      // then,
+      // the batch output shape {321, 6}
+      // the lod {{0, 145, 321}}
+      // so we should append output[0] to {176, 6}
+      size_t max_dim = 0;
+      for (size_t j = 1; j < lod[0].size(); j++) {
+        max_dim = std::max(max_dim, lod[0][j] - lod[0][j - 1]);
+      }
+      size_t common_dim = lod[0].back() == 0 ? 0 : num / lod[0].back();
+      if (max_dim > 0) {
+        data.resize((lod[0].size() - 1) * max_dim * common_dim, 0);
+      }
+      for (size_t j = 1; j < lod[0].size(); j++) {
+        size_t start = lod[0][j - 1] * common_dim;
+        size_t end = lod[0][j] * common_dim;
+        if (end > start) {
+          std::copy(output_ptr + start,
+                    output_ptr + end,
+                    data.begin() + (j - 1) * max_dim * common_dim);
+        }
+      }
+      shape.push_back(lod[0].size() - 1);
+      shape.push_back(max_dim);
+      for (int j = 1; j < dims_i.size(); ++j) {
+        shape.push_back(dims_i[j]);
+      }
+    }
+
+    outputs->at(i).shape = shape;
+    outputs->at(i).data.length = sizeof(float) * data.size();
+    outputs->at(i).data.data = malloc(outputs->at(i).data.length);
+    std::memcpy(
+        outputs->at(i).data.data, data.data(), outputs->at(i).data.length);
+    outputs->at(i).dtype = PaddleDType::FLOAT32;
+    // TODO(panyx0718): support other types? fill tensor name? avoid a copy.
+  }
+  return true;
+}
+
+template <>
+std::unique_ptr<PaddlePredictor>
+CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(
+    const NativeConfig &config) {
+  VLOG(3) << "create NativePaddlePredictor";
+  if (config.use_gpu) {
+    // 1. GPU memeroy
+    PADDLE_ENFORCE_GT(
+        config.fraction_of_gpu_memory,
+        0.f,
+        "fraction_of_gpu_memory in the config should be set to range (0., 1.]");
+    PADDLE_ENFORCE_GE(config.device, 0, "Invalid device id %d", config.device);
+    std::vector<std::string> flags;
+    if (config.fraction_of_gpu_memory >= 0.0f ||
+        config.fraction_of_gpu_memory <= 0.95f) {
+      flags.push_back("dummpy");
+      std::string flag = "--fraction_of_gpu_memory_to_use=" +
+                         num2str<float>(config.fraction_of_gpu_memory);
+      flags.push_back(flag);
+      VLOG(3) << "set flag: " << flag;
+      framework::InitGflags(flags);
+    }
+  }
+
+  std::unique_ptr<PaddlePredictor> predictor(new NativePaddlePredictor(config));
+  if (!dynamic_cast<NativePaddlePredictor *>(predictor.get())->Init(nullptr)) {
+    return nullptr;
+  }
+  return std::move(predictor);
+}
+
+}  // namespace paddle
diff --git a/paddle/contrib/inference/paddle_inference_api_impl.h b/paddle/contrib/inference/paddle_inference_api_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..86d1db7bcc7567e104cd20c9f767ed4513f611f5
--- /dev/null
+++ b/paddle/contrib/inference/paddle_inference_api_impl.h
@@ -0,0 +1,65 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+
+#include <glog/logging.h>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "paddle/contrib/inference/paddle_inference_api.h"
+
+#include "paddle/fluid/framework/ddim.h"
+#include "paddle/fluid/framework/init.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/inference/io.h"
+#include "paddle/fluid/platform/profiler.h"
+
+namespace paddle {
+
+class NativePaddlePredictor : public PaddlePredictor {
+ public:
+  explicit NativePaddlePredictor(const NativeConfig &config)
+      : config_(config) {}
+
+  // will only create sub scope if have global scope
+  bool Init(std::shared_ptr<framework::Scope> parent_scope);
+
+  bool Run(const std::vector<PaddleTensor> &inputs,
+           std::vector<PaddleTensor> *output_data) override;
+
+  std::unique_ptr<PaddlePredictor> Clone() override;
+
+  ~NativePaddlePredictor() override;
+
+ private:
+  bool SetFeed(const std::vector<PaddleTensor> &input_datas,
+               std::vector<framework::LoDTensor> *feeds);
+  bool GetFetch(const std::vector<framework::LoDTensor> &fetchs,
+                std::vector<PaddleTensor> *output_data);
+
+  NativeConfig config_;
+  platform::Place place_;
+  std::unique_ptr<framework::Executor> executor_;
+  std::shared_ptr<framework::Scope> scope_;
+  std::unique_ptr<framework::ExecutorPrepareContext> ctx_;
+  std::unique_ptr<framework::ProgramDesc> inference_program_;
+  std::vector<std::string> feed_target_names_;
+  std::vector<std::string> fetch_target_names_;
+  // Do not use unique_ptr, use parent scope to delete
+  framework::Scope *sub_scope_{nullptr};
+};
+
+}  // namespace paddle
diff --git a/paddle/contrib/inference/test_paddle_inference_api.cc b/paddle/contrib/inference/test_paddle_inference_api.cc
new file mode 100644
index 0000000000000000000000000000000000000000..bc7faab6e208a66d7a56e41a56bd743c7644eea2
--- /dev/null
+++ b/paddle/contrib/inference/test_paddle_inference_api.cc
@@ -0,0 +1,64 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/contrib/inference/paddle_inference_api.h"
+
+#include <glog/logging.h>
+#include <gtest/gtest.h>
+
+namespace paddle {
+
+/*
+ * Do not use this, just a demo indicating how to customize a config for a
+ * specific predictor.
+ */
+struct DemoConfig : public PaddlePredictor::Config {
+  float other_config;
+};
+
+/*
+ * Do not use this, just a demo indicating how to customize a Predictor.
+ */
+class DemoPredictor : public PaddlePredictor {
+ public:
+  explicit DemoPredictor(const DemoConfig &config) {
+    LOG(INFO) << "I get other_config " << config.other_config;
+  }
+  bool Run(const std::vector<PaddleTensor> &inputs,
+           std::vector<PaddleTensor> *output_data) override {
+    LOG(INFO) << "Run";
+    return false;
+  }
+
+  std::unique_ptr<PaddlePredictor> Clone() override { return nullptr; }
+
+  ~DemoPredictor() override {}
+};
+
+template <>
+std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<DemoConfig>(
+    const DemoConfig &config) {
+  std::unique_ptr<PaddlePredictor> x(new DemoPredictor(config));
+  return x;
+}
+
+TEST(paddle_inference_api, demo) {
+  DemoConfig config;
+  config.other_config = 1.7;
+  auto predictor = CreatePaddlePredictor(config);
+  std::vector<PaddleTensor> outputs;
+  predictor->Run({}, &outputs);
+}
+
+}  // namespace paddle
diff --git a/paddle/contrib/inference/test_paddle_inference_api_impl.cc b/paddle/contrib/inference/test_paddle_inference_api_impl.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4b6cb7b051d1ad2c63e895017c7faf1245c22612
--- /dev/null
+++ b/paddle/contrib/inference/test_paddle_inference_api_impl.cc
@@ -0,0 +1,299 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <glog/logging.h>
+#include <gtest/gtest.h>
+
+#include <thread>
+
+#include "gflags/gflags.h"
+#include "paddle/contrib/inference/paddle_inference_api_impl.h"
+#include "paddle/fluid/inference/tests/test_helper.h"
+
+DEFINE_string(dirname, "", "Directory of the inference model.");
+
+namespace paddle {
+
+PaddleTensor LodTensorToPaddleTensor(framework::LoDTensor* t) {
+  PaddleTensor pt;
+  pt.data.data = t->data<void>();
+
+  if (t->type() == typeid(int64_t)) {
+    pt.data.length = t->numel() * sizeof(int64_t);
+    pt.dtype = PaddleDType::INT64;
+  } else if (t->type() == typeid(float)) {
+    pt.data.length = t->numel() * sizeof(float);
+    pt.dtype = PaddleDType::FLOAT32;
+  } else {
+    LOG(FATAL) << "unsupported type.";
+  }
+  pt.shape = framework::vectorize2int(t->dims());
+  return pt;
+}
+
+NativeConfig GetConfig() {
+  NativeConfig config;
+  config.model_dir = FLAGS_dirname + "word2vec.inference.model";
+  LOG(INFO) << "dirname  " << config.model_dir;
+  config.fraction_of_gpu_memory = 0.15;
+#ifdef PADDLE_WITH_CUDA
+  config.use_gpu = true;
+#else
+  config.use_gpu = false;
+#endif
+  config.device = 0;
+  return config;
+}
+
+void MainWord2Vec(bool use_gpu) {
+  NativeConfig config = GetConfig();
+  auto predictor = CreatePaddlePredictor<NativeConfig>(config);
+  config.use_gpu = use_gpu;
+
+  framework::LoDTensor first_word, second_word, third_word, fourth_word;
+  framework::LoD lod{{0, 1}};
+  int64_t dict_size = 2073;  // The size of dictionary
+
+  SetupLoDTensor(&first_word, lod, static_cast<int64_t>(0), dict_size - 1);
+  SetupLoDTensor(&second_word, lod, static_cast<int64_t>(0), dict_size - 1);
+  SetupLoDTensor(&third_word, lod, static_cast<int64_t>(0), dict_size - 1);
+  SetupLoDTensor(&fourth_word, lod, static_cast<int64_t>(0), dict_size - 1);
+
+  std::vector<PaddleTensor> paddle_tensor_feeds;
+  paddle_tensor_feeds.push_back(LodTensorToPaddleTensor(&first_word));
+  paddle_tensor_feeds.push_back(LodTensorToPaddleTensor(&second_word));
+  paddle_tensor_feeds.push_back(LodTensorToPaddleTensor(&third_word));
+  paddle_tensor_feeds.push_back(LodTensorToPaddleTensor(&fourth_word));
+
+  std::vector<PaddleTensor> outputs;
+  ASSERT_TRUE(predictor->Run(paddle_tensor_feeds, &outputs));
+  ASSERT_EQ(outputs.size(), 1UL);
+  size_t len = outputs[0].data.length;
+  float* data = static_cast<float*>(outputs[0].data.data);
+  for (size_t j = 0; j < len / sizeof(float); ++j) {
+    ASSERT_LT(data[j], 1.0);
+    ASSERT_GT(data[j], -1.0);
+  }
+
+  std::vector<paddle::framework::LoDTensor*> cpu_feeds;
+  cpu_feeds.push_back(&first_word);
+  cpu_feeds.push_back(&second_word);
+  cpu_feeds.push_back(&third_word);
+  cpu_feeds.push_back(&fourth_word);
+
+  framework::LoDTensor output1;
+  std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
+  cpu_fetchs1.push_back(&output1);
+
+  TestInference<platform::CPUPlace>(config.model_dir, cpu_feeds, cpu_fetchs1);
+
+  float* lod_data = output1.data<float>();
+  for (int i = 0; i < output1.numel(); ++i) {
+    EXPECT_LT(lod_data[i] - data[i], 1e-3);
+    EXPECT_GT(lod_data[i] - data[i], -1e-3);
+  }
+
+  free(outputs[0].data.data);
+}
+
+void MainImageClassification(bool use_gpu) {
+  int batch_size = 2;
+  bool use_mkldnn = false;
+  bool repeat = false;
+  NativeConfig config = GetConfig();
+  config.use_gpu = use_gpu;
+  config.model_dir =
+      FLAGS_dirname + "image_classification_resnet.inference.model";
+
+  const bool is_combined = false;
+  std::vector<std::vector<int64_t>> feed_target_shapes =
+      GetFeedTargetShapes(config.model_dir, is_combined);
+
+  framework::LoDTensor input;
+  // Use normilized image pixels as input data,
+  // which should be in the range [0.0, 1.0].
+  feed_target_shapes[0][0] = batch_size;
+  framework::DDim input_dims = framework::make_ddim(feed_target_shapes[0]);
+  SetupTensor<float>(
+      &input, input_dims, static_cast<float>(0), static_cast<float>(1));
+  std::vector<framework::LoDTensor*> cpu_feeds;
+  cpu_feeds.push_back(&input);
+
+  framework::LoDTensor output1;
+  std::vector<framework::LoDTensor*> cpu_fetchs1;
+  cpu_fetchs1.push_back(&output1);
+
+  TestInference<platform::CPUPlace, false, true>(config.model_dir,
+                                                 cpu_feeds,
+                                                 cpu_fetchs1,
+                                                 repeat,
+                                                 is_combined,
+                                                 use_mkldnn);
+
+  auto predictor = CreatePaddlePredictor(config);
+  std::vector<PaddleTensor> paddle_tensor_feeds;
+  paddle_tensor_feeds.push_back(LodTensorToPaddleTensor(&input));
+
+  std::vector<PaddleTensor> outputs;
+  ASSERT_TRUE(predictor->Run(paddle_tensor_feeds, &outputs));
+  ASSERT_EQ(outputs.size(), 1UL);
+  size_t len = outputs[0].data.length;
+  float* data = static_cast<float*>(outputs[0].data.data);
+  float* lod_data = output1.data<float>();
+  for (size_t j = 0; j < len / sizeof(float); ++j) {
+    EXPECT_NEAR(lod_data[j], data[j], 1e-3);
+  }
+  free(data);
+}
+
+void MainThreadsWord2Vec(bool use_gpu) {
+  NativeConfig config = GetConfig();
+  config.use_gpu = use_gpu;
+  auto main_predictor = CreatePaddlePredictor<NativeConfig>(config);
+
+  // prepare inputs data and reference results
+  constexpr int num_jobs = 3;
+  std::vector<std::vector<framework::LoDTensor>> jobs(num_jobs);
+  std::vector<std::vector<PaddleTensor>> paddle_tensor_feeds(num_jobs);
+  std::vector<framework::LoDTensor> refs(num_jobs);
+  for (size_t i = 0; i < jobs.size(); ++i) {
+    // each job has 4 words
+    jobs[i].resize(4);
+    for (size_t j = 0; j < 4; ++j) {
+      framework::LoD lod{{0, 1}};
+      int64_t dict_size = 2073;  // The size of dictionary
+      SetupLoDTensor(&jobs[i][j], lod, static_cast<int64_t>(0), dict_size - 1);
+      paddle_tensor_feeds[i].push_back(LodTensorToPaddleTensor(&jobs[i][j]));
+    }
+
+    // get reference result of each job
+    std::vector<paddle::framework::LoDTensor*> ref_feeds;
+    std::vector<paddle::framework::LoDTensor*> ref_fetches(1, &refs[i]);
+    for (auto& word : jobs[i]) {
+      ref_feeds.push_back(&word);
+    }
+    TestInference<platform::CPUPlace>(config.model_dir, ref_feeds, ref_fetches);
+  }
+
+  // create threads and each thread run 1 job
+  std::vector<std::thread> threads;
+  for (int tid = 0; tid < num_jobs; ++tid) {
+    threads.emplace_back([&, tid]() {
+      auto predictor = main_predictor->Clone();
+      auto& local_inputs = paddle_tensor_feeds[tid];
+      std::vector<PaddleTensor> local_outputs;
+      ASSERT_TRUE(predictor->Run(local_inputs, &local_outputs));
+
+      // check outputs range
+      ASSERT_EQ(local_outputs.size(), 1UL);
+      const size_t len = local_outputs[0].data.length;
+      float* data = static_cast<float*>(local_outputs[0].data.data);
+      for (size_t j = 0; j < len / sizeof(float); ++j) {
+        ASSERT_LT(data[j], 1.0);
+        ASSERT_GT(data[j], -1.0);
+      }
+
+      // check outputs correctness
+      float* ref_data = refs[tid].data<float>();
+      EXPECT_EQ(refs[tid].numel(), static_cast<int64_t>(len / sizeof(float)));
+      for (int i = 0; i < refs[tid].numel(); ++i) {
+        EXPECT_NEAR(ref_data[i], data[i], 1e-3);
+      }
+      free(data);
+    });
+  }
+  for (int i = 0; i < num_jobs; ++i) {
+    threads[i].join();
+  }
+}
+
+void MainThreadsImageClassification(bool use_gpu) {
+  constexpr int num_jobs = 4;  // each job run 1 batch
+  constexpr int batch_size = 1;
+  NativeConfig config = GetConfig();
+  config.use_gpu = use_gpu;
+  config.model_dir =
+      FLAGS_dirname + "image_classification_resnet.inference.model";
+
+  auto main_predictor = CreatePaddlePredictor<NativeConfig>(config);
+  std::vector<framework::LoDTensor> jobs(num_jobs);
+  std::vector<std::vector<PaddleTensor>> paddle_tensor_feeds(num_jobs);
+  std::vector<framework::LoDTensor> refs(num_jobs);
+  for (size_t i = 0; i < jobs.size(); ++i) {
+    // prepare inputs
+    std::vector<std::vector<int64_t>> feed_target_shapes =
+        GetFeedTargetShapes(config.model_dir, /*is_combined*/ false);
+    feed_target_shapes[0][0] = batch_size;
+    framework::DDim input_dims = framework::make_ddim(feed_target_shapes[0]);
+    SetupTensor<float>(&jobs[i], input_dims, 0.f, 1.f);
+    paddle_tensor_feeds[i].push_back(LodTensorToPaddleTensor(&jobs[i]));
+
+    // get reference result of each job
+    std::vector<framework::LoDTensor*> ref_feeds(1, &jobs[i]);
+    std::vector<framework::LoDTensor*> ref_fetches(1, &refs[i]);
+    TestInference<platform::CPUPlace>(config.model_dir, ref_feeds, ref_fetches);
+  }
+
+  // create threads and each thread run 1 job
+  std::vector<std::thread> threads;
+  for (int tid = 0; tid < num_jobs; ++tid) {
+    threads.emplace_back([&, tid]() {
+      auto predictor = main_predictor->Clone();
+      auto& local_inputs = paddle_tensor_feeds[tid];
+      std::vector<PaddleTensor> local_outputs;
+      ASSERT_TRUE(predictor->Run(local_inputs, &local_outputs));
+
+      // check outputs correctness
+      ASSERT_EQ(local_outputs.size(), 1UL);
+      const size_t len = local_outputs[0].data.length;
+      float* data = static_cast<float*>(local_outputs[0].data.data);
+      float* ref_data = refs[tid].data<float>();
+      EXPECT_EQ(refs[tid].numel(), len / sizeof(float));
+      for (int i = 0; i < refs[tid].numel(); ++i) {
+        EXPECT_NEAR(ref_data[i], data[i], 1e-3);
+      }
+      free(data);
+    });
+  }
+  for (int i = 0; i < num_jobs; ++i) {
+    threads[i].join();
+  }
+}
+
+TEST(inference_api_native, word2vec_cpu) { MainWord2Vec(false /*use_gpu*/); }
+TEST(inference_api_native, word2vec_cpu_threads) {
+  MainThreadsWord2Vec(false /*use_gpu*/);
+}
+TEST(inference_api_native, image_classification_cpu) {
+  MainThreadsImageClassification(false /*use_gpu*/);
+}
+TEST(inference_api_native, image_classification_cpu_threads) {
+  MainThreadsImageClassification(false /*use_gpu*/);
+}
+
+#ifdef PADDLE_WITH_CUDA
+TEST(inference_api_native, word2vec_gpu) { MainWord2Vec(true /*use_gpu*/); }
+TEST(inference_api_native, word2vec_gpu_threads) {
+  MainThreadsWord2Vec(true /*use_gpu*/);
+}
+TEST(inference_api_native, image_classification_gpu) {
+  MainThreadsImageClassification(true /*use_gpu*/);
+}
+TEST(inference_api_native, image_classification_gpu_threads) {
+  MainThreadsImageClassification(true /*use_gpu*/);
+}
+
+#endif
+
+}  // namespace paddle
diff --git a/paddle/cuda/CMakeLists.txt b/paddle/cuda/CMakeLists.txt
index efd1b7a73e1655f95eb83a5e2f59e82cbf7eba16..9bbb8de78e09829d24faf42c360811084981578f 100755
--- a/paddle/cuda/CMakeLists.txt
+++ b/paddle/cuda/CMakeLists.txt
@@ -87,8 +87,3 @@ else()
 endif()
 
 add_dependencies(paddle_cuda paddle_proto ${external_project_dependencies})
-
-add_style_check_target(paddle_cuda
-                       ${CUDA_SOURCES}
-                       ${CUDA_HEADERS}
-                       ${CUDA_CXX_SOURCES})
diff --git a/paddle/cuda/include/hl_activation_functions.h b/paddle/cuda/include/hl_activation_functions.h
index 29ec248420058db08bd1932f702d26074d49f38c..66a69db545b541409f895820ad621a2a9a684e20 100644
--- a/paddle/cuda/include/hl_activation_functions.h
+++ b/paddle/cuda/include/hl_activation_functions.h
@@ -31,7 +31,7 @@ namespace hppl {
  */
 template <class T>
 class Active {
-public:
+ public:
   typedef T (*forward)(T);
   typedef T (*backward)(T, T);
 };
diff --git a/paddle/cuda/include/hl_tensor_ops.h b/paddle/cuda/include/hl_tensor_ops.h
index 85a022ff5e26daab97be52b7ea9814c6b8078561..bc5e5da53d5c6ac2bae3b0067f46e39accd1b9d8 100644
--- a/paddle/cuda/include/hl_tensor_ops.h
+++ b/paddle/cuda/include/hl_tensor_ops.h
@@ -23,128 +23,128 @@ namespace unary {
 
 template <class T>
 class add_scale {
-private:
+ private:
   const T p;
 
-public:
+ public:
   INLINE add_scale(const T s) : p(s) {}
   INLINE T operator()(const T a) const { return a + p; }
 };
 
 template <class T>
 class sub_scale {
-private:
+ private:
   const T p;
 
-public:
+ public:
   INLINE sub_scale(const T s) : p(s) {}
   INLINE T operator()(const T a) const { return a - p; }
 };
 
 template <class T>
 class mul_scale {
-private:
+ private:
   const T p;
 
-public:
+ public:
   INLINE mul_scale(const T s) : p(s) {}
   INLINE T operator()(const T a) const { return a * p; }
 };
 
 template <class T>
 class div_scale {
-private:
+ private:
   const T p;
 
-public:
+ public:
   INLINE div_scale(const T s) : p(s) {}
   INLINE T operator()(const T a) const { return a / p; }
 };
 
 template <class T>
 class neg {
-public:
+ public:
   INLINE T operator()(const T a) const { return -a; }
 };
 
 template <class T>
 class exp_op {
-public:
+ public:
   INLINE T operator()(const T a) const { return std::exp(a); }
 };
 
 template <class T>
 class log_op {
-public:
+ public:
   INLINE T operator()(const T a) const { return std::log(a); }
 };
 
 template <class T>
 class sqrt_op {
-public:
+ public:
   INLINE T operator()(const T a) const { return std::sqrt(a); }
 };
 
 template <class T>
 class square {
-public:
+ public:
   INLINE T operator()(const T a) const { return a * a; }
 };
 
 template <class T>
 class reciprocal {
-public:
+ public:
   INLINE T operator()(const T a) const { return T(1) / a; }
 };
 
 template <class T>
 class abs {
-public:
+ public:
   INLINE T operator()(const T a) const { return a > 0 ? a : -a; }
 };
 
 template <class T>
 class sign {
-public:
+ public:
   INLINE T operator()(const T a) const { return (a > 0) - (a < 0); }
 };
 
 template <class T>
 class min {
-private:
+ private:
   const T p;
 
-public:
+ public:
   INLINE min(const T s) : p(s) {}
   INLINE T operator()(const T a) const { return a > p ? p : a; }
 };
 
 template <class T>
 class max {
-private:
+ private:
   const T p;
 
-public:
+ public:
   INLINE max(const T s) : p(s) {}
   INLINE T operator()(const T a) const { return a < p ? p : a; }
 };
 
 template <class T>
 class pow_op {
-private:
+ private:
   const T p;
 
-public:
+ public:
   INLINE pow_op(const T s) : p(s) {}
   INLINE T operator()(const T a) const { return std::pow(a, p); }
 };
 
 template <class T>
 class constant {
-private:
+ private:
   const T p;
 
-public:
+ public:
   INLINE constant(const T s) : p(s) {}
   INLINE T operator()(int i) const { return p; }
   INLINE T operator()(int i, int j) const { return p; }
@@ -152,80 +152,80 @@ public:
 
 template <class T>
 class cmp_eq {
-private:
+ private:
   const T p;
 
-public:
+ public:
   INLINE cmp_eq(const T s) : p(s) {}
   INLINE bool operator()(const T a) const { return a == p; }
 };
 
 template <class T>
 class cmp_ne {
-private:
+ private:
   const T p;
 
-public:
+ public:
   INLINE cmp_ne(const T s) : p(s) {}
   INLINE bool operator()(const T a) const { return a != p; }
 };
 
 template <class T>
 class cmp_le {
-private:
+ private:
   const T p;
 
-public:
+ public:
   INLINE cmp_le(const T s) : p(s) {}
   INLINE bool operator()(const T a) const { return a <= p; }
 };
 
 template <class T>
 class cmp_lt {
-private:
+ private:
   const T p;
 
-public:
+ public:
   INLINE cmp_lt(const T s) : p(s) {}
   INLINE bool operator()(const T a) const { return a < p; }
 };
 
 template <class T>
 class cmp_ge {
-private:
+ private:
   const T p;
 
-public:
+ public:
   INLINE cmp_ge(const T s) : p(s) {}
   INLINE bool operator()(const T a) const { return a >= p; }
 };
 
 template <class T>
 class cmp_gt {
-private:
+ private:
   const T p;
 
-public:
+ public:
   INLINE cmp_gt(const T s) : p(s) {}
   INLINE bool operator()(const T a) const { return a > p; }
 };
 
 template <class T>
 class and_op {
-private:
+ private:
   const T p;
 
-public:
+ public:
   INLINE and_op(const T s) : p(s) {}
   INLINE bool operator()(const T a) const { return a && p; }
 };
 
 template <class T>
 class or_op {
-private:
+ private:
   const T p;
 
-public:
+ public:
   INLINE or_op(const T s) : p(s) {}
   INLINE bool operator()(const T a) const { return a || p; }
 };
@@ -235,96 +235,96 @@ public:
 namespace binary {
 template <class T>
 class add {
-public:
+ public:
   INLINE T operator()(const T a, const T b) const { return a + b; }
 };
 
 template <class T>
 class add_scale {
-private:
+ private:
   const T p1;
   const T p2;
 
-public:
+ public:
   INLINE add_scale(const T s1, const T s2) : p1(s1), p2(s2) {}
   INLINE T operator()(const T a, const T b) const { return p1 * a + p2 * b; }
 };
 
 template <class T>
 class sub {
-public:
+ public:
   INLINE T operator()(const T a, const T b) const { return a - b; }
 };
 
 template <class T>
 class mul {
-public:
+ public:
   INLINE T operator()(const T a, const T b) const { return a * b; }
 };
 
 template <class T>
 class div {
-public:
+ public:
   INLINE T operator()(const T a, const T b) const { return a / b; }
 };
 
 template <class T>
 class cmp_eq {
-public:
+ public:
   INLINE bool operator()(const T a, const T b) const { return a == b; }
 };
 
 template <class T>
 class cmp_ne {
-public:
+ public:
   INLINE bool operator()(const T a, const T b) const { return a != b; }
 };
 
 template <class T>
 class cmp_le {
-public:
+ public:
   INLINE bool operator()(const T a, const T b) const { return a <= b; }
 };
 
 template <class T>
 class cmp_lt {
-public:
+ public:
   INLINE bool operator()(const T a, const T b) const { return a < b; }
 };
 
 template <class T>
 class cmp_ge {
-public:
+ public:
   INLINE bool operator()(const T a, const T b) const { return a >= b; }
 };
 
 template <class T>
 class cmp_gt {
-public:
+ public:
   INLINE bool operator()(const T a, const T b) const { return a > b; }
 };
 
 template <class T>
 class and_op {
-public:
+ public:
   INLINE bool operator()(const T a, const T b) const { return a && b; }
 };
 
 template <class T>
 class or_op {
-public:
+ public:
   INLINE bool operator()(const T a, const T b) const { return a || b; }
 };
 
 template <class T>
 class min {
-public:
+ public:
   INLINE T operator()(const T a, const T b) const { return a > b ? b : a; }
 };
 
 template <class T>
 class max {
-public:
+ public:
   INLINE T operator()(const T a, const T b) const { return a < b ? b : a; }
 };
 
@@ -332,7 +332,7 @@ public:
 #ifndef PADDLE_TYPE_DOUBLE
 template <>
 class add<__m128> {
-public:
+ public:
   INLINE __m128 operator()(const __m128 a, const __m128 b) const {
     return _mm_add_ps(a, b);
   }
@@ -340,11 +340,11 @@ public:
 
 template <>
 class add_scale<__m128> {
-private:
+ private:
   const __m128 p1;
   const __m128 p2;
 
-public:
+ public:
   INLINE add_scale(const __m128 s1, const __m128 s2) : p1(s1), p2(s2) {}
   INLINE __m128 operator()(const __m128 a, const __m128 b) const {
     return _mm_add_ps(_mm_mul_ps(p1, a), _mm_mul_ps(p2, b));
@@ -353,7 +353,7 @@ public:
 
 template <>
 class sub<__m128> {
-public:
+ public:
   INLINE __m128 operator()(const __m128 a, const __m128 b) const {
     return _mm_sub_ps(a, b);
   }
@@ -361,7 +361,7 @@ public:
 
 template <>
 class mul<__m128> {
-public:
+ public:
   INLINE __m128 operator()(const __m128 a, const __m128 b) const {
     return _mm_mul_ps(a, b);
   }
@@ -369,7 +369,7 @@ public:
 
 template <>
 class div<__m128> {
-public:
+ public:
   INLINE __m128 operator()(const __m128 a, const __m128 b) const {
     return _mm_div_ps(a, b);
   }
@@ -377,7 +377,7 @@ public:
 
 template <>
 class min<__m128> {
-public:
+ public:
   INLINE __m128 operator()(const __m128 a, const __m128 b) const {
     return _mm_min_ps(a, b);
   }
@@ -385,7 +385,7 @@ public:
 
 template <>
 class max<__m128> {
-public:
+ public:
   INLINE __m128 operator()(const __m128 a, const __m128 b) const {
     return _mm_max_ps(a, b);
   }
@@ -393,7 +393,7 @@ public:
 #else
 template <>
 class add<__m128d> {
-public:
+ public:
   INLINE __m128d operator()(const __m128d a, const __m128d b) const {
     return _mm_add_pd(a, b);
   }
@@ -401,11 +401,11 @@ public:
 
 template <>
 class add_scale<__m128d> {
-private:
+ private:
   const __m128d p1;
   const __m128d p2;
 
-public:
+ public:
   INLINE add_scale(const __m128d s1, const __m128d s2) : p1(s1), p2(s2) {}
   INLINE __m128d operator()(const __m128d a, const __m128d b) const {
     return _mm_add_pd(_mm_mul_pd(p1, a), _mm_mul_pd(p2, b));
@@ -414,7 +414,7 @@ public:
 
 template <>
 class sub<__m128d> {
-public:
+ public:
   INLINE __m128d operator()(const __m128d a, const __m128d b) const {
     return _mm_sub_pd(a, b);
   }
@@ -422,7 +422,7 @@ public:
 
 template <>
 class mul<__m128d> {
-public:
+ public:
   INLINE __m128d operator()(const __m128d a, const __m128d b) const {
     return _mm_mul_pd(a, b);
   }
@@ -430,7 +430,7 @@ public:
 
 template <>
 class div<__m128d> {
-public:
+ public:
   INLINE __m128d operator()(const __m128d a, const __m128d b) const {
     return _mm_div_pd(a, b);
   }
@@ -438,7 +438,7 @@ public:
 
 template <>
 class min<__m128d> {
-public:
+ public:
   INLINE __m128d operator()(const __m128d a, const __m128d b) const {
     return _mm_min_pd(a, b);
   }
@@ -446,7 +446,7 @@ public:
 
 template <>
 class max<__m128d> {
-public:
+ public:
   INLINE __m128d operator()(const __m128d a, const __m128d b) const {
     return _mm_max_pd(a, b);
   }
@@ -458,7 +458,7 @@ public:
 #ifndef PADDLE_TYPE_DOUBLE
 template <>
 class add<float32x4_t> {
-public:
+ public:
   INLINE float32x4_t operator()(const float32x4_t a,
                                 const float32x4_t b) const {
     return vaddq_f32(a, b);
@@ -467,11 +467,11 @@ public:
 
 template <>
 class add_scale<float32x4_t> {
-private:
+ private:
   const float32x4_t p1;
   const float32x4_t p2;
 
-public:
+ public:
   INLINE add_scale(const float32x4_t s1, const float32x4_t s2)
       : p1(s1), p2(s2) {}
   INLINE float32x4_t operator()(const float32x4_t a,
@@ -482,7 +482,7 @@ public:
 
 template <>
 class sub<float32x4_t> {
-public:
+ public:
   INLINE float32x4_t operator()(const float32x4_t a,
                                 const float32x4_t b) const {
     return vsubq_f32(a, b);
@@ -491,7 +491,7 @@ public:
 
 template <>
 class mul<float32x4_t> {
-public:
+ public:
   INLINE float32x4_t operator()(const float32x4_t a,
                                 const float32x4_t b) const {
     return vmulq_f32(a, b);
@@ -500,7 +500,7 @@ public:
 
 template <>
 class div<float32x4_t> {
-public:
+ public:
   INLINE float32x4_t operator()(const float32x4_t a,
                                 const float32x4_t b) const {
     float32x4_t tmp = vrecpeq_f32(b);
@@ -510,7 +510,7 @@ public:
 
 template <>
 class min<float32x4_t> {
-public:
+ public:
   INLINE float32x4_t operator()(const float32x4_t a,
                                 const float32x4_t b) const {
     return vminq_f32(a, b);
@@ -519,7 +519,7 @@ public:
 
 template <>
 class max<float32x4_t> {
-public:
+ public:
   INLINE float32x4_t operator()(const float32x4_t a,
                                 const float32x4_t b) const {
     return vmaxq_f32(a, b);
diff --git a/paddle/cuda/src/hl_cuda_lstm.cu b/paddle/cuda/src/hl_cuda_lstm.cu
index e30fcddffdf99417a4b9b811a0b0cb0a12e79b99..b8c4e433a118fb1c5af753751f91c34543b1114c 100644
--- a/paddle/cuda/src/hl_cuda_lstm.cu
+++ b/paddle/cuda/src/hl_cuda_lstm.cu
@@ -30,7 +30,7 @@ bool hl_lstm_sequence_parallel(int frameSize) {
 }
 
 class frameValue {
-public:
+ public:
   real *value_;
   __device__ frameValue(real *value) : value_(value) {}
   template <int reversed, int frameSize>
diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index 340b891e41671df7e61a4a66ec538d4603bb9842..627370cd2df7317b4d32aa967565aaf9cf0c7a08 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -5,11 +5,11 @@ proto_library(framework_proto SRCS framework.proto)
 cc_library(ddim SRCS ddim.cc DEPS eigen3 boost)
 cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
 nv_test(dim_test SRCS dim_test.cu DEPS ddim)
-
+cc_library(data_type SRCS data_type.cc DEPS framework_proto ddim device_context)
 if(WITH_GPU)
-  nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS ddim place memory device_context framework_proto)
+  nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type)
 else()
-  cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS ddim place memory device_context framework_proto)
+  cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type)
 endif()
 
 cc_test(tensor_test SRCS tensor_test.cc DEPS tensor)
@@ -57,7 +57,7 @@ cc_library(data_transform SRCS data_transform.cc DEPS math_function tensor
 cc_library(attribute SRCS attribute.cc DEPS framework_proto boost)
 cc_test(program_desc_test SRCS program_desc_test.cc DEPS proto_desc
 device_context)
-cc_library(op_proto_maker SRCS op_proto_maker.cc DEPS framework_proto attribute)
+cc_library(op_proto_maker SRCS op_proto_maker.cc DEPS framework_proto attribute glog)
 cc_test(op_proto_maker_test SRCS op_proto_maker_test.cc DEPS op_proto_maker)
 cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto)
 cc_library(shape_inference SRCS shape_inference.cc DEPS ddim attribute device_context)
@@ -87,7 +87,7 @@ cc_library(executor SRCS executor.cc DEPS op_registry device_context scope
 framework_proto glog lod_rank_table feed_fetch_method)
 
 
-cc_library(parallel_executor SRCS parallel_executor.cc DEPS multi_devices_graph_builder threaded_ssa_graph_executor)
+cc_library(parallel_executor SRCS parallel_executor.cc DEPS graph_builder_factory threaded_ssa_graph_executor scope_buffered_ssa_graph_executor)
 
 cc_library(prune SRCS prune.cc DEPS framework_proto)
 cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context)
diff --git a/paddle/fluid/framework/block_desc.cc b/paddle/fluid/framework/block_desc.cc
index 1b6f656a006489485a55b5c13b5e2de93c3da0ed..f537e4b9e569dd4c513ac0efde7240833bcf04b6 100644
--- a/paddle/fluid/framework/block_desc.cc
+++ b/paddle/fluid/framework/block_desc.cc
@@ -134,6 +134,11 @@ OpDesc *BlockDesc::PrependOp() {
   return ops_.front().get();
 }
 
+void BlockDesc::PrependAllocatedOp(std::unique_ptr<OpDesc> &&op_desc) {
+  need_update_ = true;
+  ops_.emplace_front(std::move(op_desc));
+}
+
 OpDesc *BlockDesc::InsertOp(size_t index) {
   need_update_ = true;
   auto it = ops_.begin() + index;
@@ -164,17 +169,13 @@ void BlockDesc::Flush() {
   }
 
   if (need_update_) {
-    auto &op_field = *this->desc_->mutable_ops();
-    this->ClearPBOps();
-    op_field.Reserve(static_cast<int>(ops_.size()));
+    this->desc_->mutable_ops()->Clear();
     for (auto &op_desc : ops_) {
-      op_field.AddAllocated(op_desc->Proto());
+      this->desc_->mutable_ops()->Add()->CopyFrom(*op_desc->Proto());
     }
-    auto &var_field = *this->desc_->mutable_vars();
-    this->ClearPBVars();
-    var_field.Reserve(static_cast<int>(vars_.size()));
+    this->desc_->mutable_vars()->Clear();
     for (auto &var_desc : vars_) {
-      var_field.AddAllocated(var_desc.second->Proto());
+      this->desc_->mutable_vars()->Add()->CopyFrom(*var_desc.second->Proto());
     }
     need_update_ = false;
   }
@@ -195,7 +196,7 @@ BlockDesc::BlockDesc(ProgramDesc *prog, proto::BlockDesc *desc)
     vars_[var_desc.name()].reset(new VarDesc(var_desc));
   }
   for (const proto::OpDesc &op_desc : desc_->ops()) {
-    ops_.emplace_back(new OpDesc(op_desc, prog, this));
+    ops_.emplace_back(new OpDesc(op_desc, this));
   }
 }
 
@@ -204,7 +205,7 @@ BlockDesc::BlockDesc(const BlockDesc &other, proto::BlockDesc *desc,
     : prog_(prog), desc_(desc) {
   need_update_ = true;
   for (auto &op : other.ops_) {
-    ops_.emplace_back(new OpDesc(*op->Proto(), prog, this));
+    ops_.emplace_back(new OpDesc(*op, this));
   }
   for (auto &it : other.vars_) {
     auto *var = new VarDesc(*it.second);
@@ -212,22 +213,6 @@ BlockDesc::BlockDesc(const BlockDesc &other, proto::BlockDesc *desc,
   }
 }
 
-void BlockDesc::ClearPBOps() {
-  auto ops = this->desc_->mutable_ops();
-  while (!ops->empty()) {
-    // we do not own the OpDesc, so release the ownership.
-    ops->ReleaseLast();
-  }
-}
-
-void BlockDesc::ClearPBVars() {
-  auto vars = this->desc_->mutable_vars();
-  while (!vars->empty()) {
-    // we do not own the VarDesc, so release the ownership.
-    vars->ReleaseLast();
-  }
-}
-
 void BlockDesc::SetForwardBlockID(int32_t forward_block_id) {
   PADDLE_ENFORCE(!desc_->has_forward_block_idx(),
                  "Parent block ID has been set to %d. Cannot set to %d",
diff --git a/paddle/fluid/framework/block_desc.h b/paddle/fluid/framework/block_desc.h
index eef19c4f09c60b9df18f154c85c421f5bff9413f..ce48548418478cc5c9f9ca1244df9e66dca884e6 100644
--- a/paddle/fluid/framework/block_desc.h
+++ b/paddle/fluid/framework/block_desc.h
@@ -41,11 +41,6 @@ class BlockDesc {
 
   BlockDesc(const BlockDesc &other, proto::BlockDesc *desc, ProgramDesc *prog);
 
-  ~BlockDesc() {
-    this->ClearPBVars();
-    this->ClearPBOps();
-  }
-
   int32_t ID() const { return desc_->idx(); }
 
   int32_t Parent() const { return desc_->parent_idx(); }
@@ -88,6 +83,8 @@ class BlockDesc {
 
   OpDesc *PrependOp();
 
+  void PrependAllocatedOp(std::unique_ptr<OpDesc> &&op_desc);
+
   OpDesc *InsertOp(size_t index);
 
   /*
@@ -103,7 +100,7 @@ class BlockDesc {
 
   size_t OpSize() const { return ops_.size(); }
 
-  OpDesc *Op(int idx) { return ops_.at(idx).get(); }
+  OpDesc *Op(int idx) const { return ops_.at(idx).get(); }
 
   void Flush();
 
@@ -111,10 +108,6 @@ class BlockDesc {
 
   ProgramDesc *Program() const { return this->prog_; }
 
- private:
-  void ClearPBOps();
-  void ClearPBVars();
-
  private:
   ProgramDesc *prog_;       // not_own
   proto::BlockDesc *desc_;  // not_own
diff --git a/paddle/fluid/framework/data_device_transform.cc b/paddle/fluid/framework/data_device_transform.cc
index 85dbb39e6fba735471446b5e5e71a612282c498a..6bcfc6cd55f02f0d4f0f6e3170e7cc19ce666a28 100644
--- a/paddle/fluid/framework/data_device_transform.cc
+++ b/paddle/fluid/framework/data_device_transform.cc
@@ -16,29 +16,25 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
-static const platform::DeviceContext* GetDeviceContext(
-    const platform::Place& src_place, const platform::Place& dst_place) {
-  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-
-  if (platform::is_gpu_place(src_place) && platform::is_cpu_place(dst_place)) {
-    return pool.Get(src_place);
-  } else if (platform::is_cpu_place(src_place) &&
-             platform::is_gpu_place(dst_place)) {
-    return pool.Get(dst_place);
-  } else {
-    PADDLE_THROW(
-        "Currently, model parallelism is only supported between CPU and CUDA");
-  }
-}
-
-void TransDataDevice(const Tensor& in, const platform::Place& dst_place,
-                     Tensor* out) {
+void TransDataDevice(const Tensor &in, const platform::Place &dst_place,
+                     Tensor *out) {
   VLOG(3) << "DeviceTransform in, src_place " << in.place()
           << " dst_place: " << dst_place;
-  auto* dev_ctx = GetDeviceContext(in.place(), dst_place);
-  dev_ctx->Wait();
-  TensorCopy(in, dst_place, *dev_ctx, out);
-  dev_ctx->Wait();
+
+  PADDLE_ENFORCE_NE(
+      in.place().which(), dst_place.which(),
+      "Currently, model parallelism is only supported between CPU and CUDA");
+
+  // FIXME(zcd): TransDataDevice is used to transform data from GPU to CPU and
+  // the enforced checkings have been done in GetDeviceContext, so the
+  // `dev_ctx->Wait()` is necessary. But `dev_ctx->Wait()` will make the program
+  // slow, especially when the number of elements is little, for example,
+  // the elements of learning rate are one and it's CPU side.
+  // One solution is to use a CUDA kernel to complete the copy operation when
+  // the transforming is from CPU to GPU and the number of elements is little.
+  // But the embarrassment is that this solution this solution makes training
+  // slower.
+  TensorCopySync(in, dst_place, out);
 }
 
 }  // namespace framework
diff --git a/paddle/fluid/framework/data_device_transform_test.cu b/paddle/fluid/framework/data_device_transform_test.cu
index df4caa45eba2470f7528d2fbd99cca39cae0b596..a91fe5c99d397ef1bf04f6d22e988b6d3f33e500 100644
--- a/paddle/fluid/framework/data_device_transform_test.cu
+++ b/paddle/fluid/framework/data_device_transform_test.cu
@@ -32,8 +32,7 @@ struct AddFunctor {
 
 class OpKernelTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
  public:
-  OpKernelTestProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() {
     AddInput("input", "input1 of test op");
     AddOutput("output", "output of test op");
     AddAttr<bool>("use_gpu", "force to use gpu kernel").SetDefault(false);
diff --git a/paddle/fluid/framework/data_layout.h b/paddle/fluid/framework/data_layout.h
index 9c5e2cf7ccdcea2822da42210ff1fdb915a9a4ec..b611bb77b4e1ec05b8bd029ac37cefba346c6eb0 100644
--- a/paddle/fluid/framework/data_layout.h
+++ b/paddle/fluid/framework/data_layout.h
@@ -27,6 +27,7 @@ enum class DataLayout {
   kNHWC = 0,
   kNCHW = 1,
   kAnyLayout = 2,
+  kMKLDNN = 3,  // all layouts supported by MKLDNN internally
 };
 
 inline DataLayout StringToDataLayout(const std::string& str) {
@@ -41,6 +42,8 @@ inline DataLayout StringToDataLayout(const std::string& str) {
     return DataLayout::kNCHW;
   } else if (s == "ANYLAYOUT") {
     return DataLayout::kAnyLayout;
+  } else if (s == "MKLDNNLAYOUT") {
+    return DataLayout::kMKLDNN;
   } else {
     PADDLE_THROW("Unknown storage order string: %s", s);
   }
@@ -54,8 +57,10 @@ inline std::string DataLayoutToString(const DataLayout& data_layout) {
       return "NCHW";
     case DataLayout::kAnyLayout:
       return "ANY_LAYOUT";
+    case DataLayout::kMKLDNN:
+      return "MKLDNNLAYOUT";
     default:
-      PADDLE_THROW("unknown DataLayou %d", data_layout);
+      PADDLE_THROW("unknown DataLayout %d", data_layout);
   }
 }
 
diff --git a/paddle/fluid/framework/data_layout_transform.cc b/paddle/fluid/framework/data_layout_transform.cc
index 60ec60a427ba9046ce690eb75c27cd322fdd726d..5b8dfc57ba020cea259041f55a66472ea26b4eec 100644
--- a/paddle/fluid/framework/data_layout_transform.cc
+++ b/paddle/fluid/framework/data_layout_transform.cc
@@ -16,6 +16,9 @@
 #include <vector>
 
 #include "paddle/fluid/operators/math/math_function.h"
+#ifdef PADDLE_WITH_MKLDNN
+#include "paddle/fluid/platform/mkldnn_helper.h"
+#endif
 
 namespace paddle {
 namespace framework {
@@ -88,5 +91,85 @@ void TransDataLayout(const OpKernelType& kernel_type_for_var,
   out->set_layout(expected_kernel_type.data_layout_);
 }
 
+#ifdef PADDLE_WITH_MKLDNN
+using mkldnn::memory;
+using mkldnn::primitive;
+using mkldnn::reorder;
+
+void* GetDataFromTensor(const Tensor& tensor, mkldnn::memory::data_type type) {
+  switch (type) {
+    case mkldnn::memory::data_type::f32:
+      return platform::to_void_cast(tensor.data<float>());
+    case mkldnn::memory::data_type::s8:
+      return platform::to_void_cast(tensor.data<char>());
+    case mkldnn::memory::data_type::u8:
+      return platform::to_void_cast(tensor.data<unsigned char>());
+    case mkldnn::memory::data_type::s16:
+      return platform::to_void_cast(tensor.data<int16_t>());
+    case mkldnn::memory::data_type::s32:
+      return platform::to_void_cast(tensor.data<int32_t>());
+    default:
+      PADDLE_THROW("wrong mkldnn type provided");
+  }
+}
+#endif
+
+void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var,
+                               const OpKernelType& expected_kernel_type,
+                               const Tensor& in, Tensor* out) {
+  auto in_layout = kernel_type_for_var.data_layout_;
+  auto out_layout = expected_kernel_type.data_layout_;
+
+  PADDLE_ENFORCE(
+      in_layout == DataLayout::kMKLDNN && out_layout != DataLayout::kMKLDNN,
+      "TransDataLayoutFromMKLDNN only supports transform from MKLDNN to "
+      "non-MKLDNN");
+
+#ifdef PADDLE_WITH_MKLDNN
+  PADDLE_ENFORCE(in.format() != memory::format::format_undef &&
+                     in.format() != memory::format::any,
+                 "Input tensor should have specified memory format");
+
+  // Set default as NCHW in case not specified
+  out_layout =
+      out_layout == DataLayout::kAnyLayout ? DataLayout::kNCHW : out_layout;
+
+  auto& pool = platform::DeviceContextPool::Instance();
+  auto* dev_ctx = dynamic_cast<platform::MKLDNNDeviceContext*>(
+      pool.Get(expected_kernel_type.place_));
+  auto& cpu_engine = dev_ctx->GetEngine();
+
+  std::vector<int> in_tz = paddle::framework::vectorize2int(in.dims());
+  std::vector<int> out_tz = in_tz;
+
+  memory::data_type in_type = ToMKLDNNDataType(in.type());
+  PADDLE_ENFORCE(in_type != memory::data_type::data_undef,
+                 "Input tensor type is not supported: ", in.type().name());
+  memory::data_type out_type = in_type;
+
+  memory::format in_format =
+      in_tz.size() == 2 ? memory::format::nc : in.format();
+  memory::format out_format =
+      out_tz.size() == 2 ? memory::format::nc : ToMKLDNNFormat(out_layout);
+
+  void* in_data = GetDataFromTensor(in, in_type);
+
+  // output tensor has the same dims as input. Reorder don't change dims
+  out->Resize(in.dims());
+
+  auto out_data = out->mutable_data(expected_kernel_type.place_, in.type());
+
+  auto in_memory = memory({{{in_tz}, in_type, in_format}, cpu_engine}, in_data);
+  auto out_memory =
+      memory({{{out_tz}, out_type, out_format}, cpu_engine}, out_data);
+
+  platform::Reorder(in_memory, out_memory);
+
+  out->set_layout(out_layout);
+  // reset format since the out tensor will be feed to non-MKLDNN OPkernel
+  out->set_format(memory::format::format_undef);
+#endif
+}
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/data_layout_transform.h b/paddle/fluid/framework/data_layout_transform.h
index 06b638663dd334837a3bcb7737e507fcbc871c7a..2ba84ce57fd8aa3d9aa651bdaa2930e459c74e88 100644
--- a/paddle/fluid/framework/data_layout_transform.h
+++ b/paddle/fluid/framework/data_layout_transform.h
@@ -14,6 +14,7 @@
 
 #pragma once
 
+#include <map>
 #include <vector>
 #include "paddle/fluid/framework/op_kernel_type.h"
 #include "paddle/fluid/framework/tensor.h"
@@ -22,6 +23,50 @@
 namespace paddle {
 namespace framework {
 
+#ifdef PADDLE_WITH_MKLDNN
+using MKLDNNFormat = mkldnn::memory::format;
+using MKLDNNDataType = mkldnn::memory::data_type;
+
+inline MKLDNNFormat ToMKLDNNFormat(const DataLayout& layout) {
+  switch (layout) {
+    case DataLayout::kNHWC:
+      return MKLDNNFormat::nhwc;
+    case DataLayout::kNCHW:
+      return MKLDNNFormat::nchw;
+    default:
+      PADDLE_THROW("Fail to convert layout %s to MKLDNN format",
+                   DataLayoutToString(layout));
+  }
+}
+
+inline DataLayout ToPaddleLayout(const MKLDNNFormat& format) {
+  switch (format) {
+    case MKLDNNFormat::nhwc:
+      return DataLayout::kNHWC;
+    case MKLDNNFormat::nchw:
+      return DataLayout::kNCHW;
+    default:
+      PADDLE_THROW("Fail to convert MKLDNN format to paddle layout");
+  }
+}
+
+inline MKLDNNDataType ToMKLDNNDataType(const std::type_index type) {
+  static const std::map<std::type_index, MKLDNNDataType> dict{
+      {std::type_index(typeid(float)), MKLDNNDataType::f32},  // NOLINT
+      {std::type_index(typeid(char)), MKLDNNDataType::s8},    // NOLINT
+      {std::type_index(typeid(unsigned char)), MKLDNNDataType::u8},
+      {std::type_index(typeid(int16_t)), MKLDNNDataType::s16},
+      {std::type_index(typeid(int32_t)), MKLDNNDataType::s32}};
+  auto iter = dict.find(type);
+  if (iter != dict.end()) return iter->second;
+  return MKLDNNDataType::data_undef;
+}
+#endif
+
+void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var,
+                               const OpKernelType& expected_kernel_type,
+                               const Tensor& in, Tensor* out);
+
 std::vector<int> GetAxis(const DataLayout& from, const DataLayout& to);
 
 void TransDataLayout(const OpKernelType& kernel_type_for_var,
diff --git a/paddle/fluid/framework/data_transform.cc b/paddle/fluid/framework/data_transform.cc
index 9c277a27da5af34fc9fb18ca073e369c05ecdf22..b8fcc92697ca1bf1d971f8fef020f31d405605a9 100644
--- a/paddle/fluid/framework/data_transform.cc
+++ b/paddle/fluid/framework/data_transform.cc
@@ -33,11 +33,34 @@ void DataTransform(const OpKernelType& expected_kernel_type,
   Tensor in;
   in.ShareDataWith(input_tensor);
   Tensor out;
+  DataLayout lin = kernel_type_for_var.data_layout_;
+  DataLayout lout = expected_kernel_type.data_layout_;
 
   // do layout transform
-  if (NeedTransformLayout(expected_kernel_type.data_layout_,
-                          kernel_type_for_var.data_layout_)) {
-    TransDataLayout(kernel_type_for_var, expected_kernel_type, in, &out);
+  if (NeedTransformLayout(lout, lin)) {
+    if (lin == DataLayout::kMKLDNN || lout == DataLayout::kMKLDNN) {
+      PADDLE_ENFORCE(
+          !(lin == DataLayout::kMKLDNN && lout == DataLayout::kMKLDNN),
+          "No layout transform needed between two MKLDNN OPKernels");
+
+      if (lin != DataLayout::kMKLDNN && lout == DataLayout::kMKLDNN) {
+#ifdef PADDLE_WITH_MKLDNN
+        // Case1 - transform from Non-MKLDNN OPKernel to MKLDNN OPKernel
+        // Just set layout/format. No real transform occur
+        out.ShareDataWith(input_tensor);
+        out.set_layout(DataLayout::kMKLDNN);
+        out.set_format(ToMKLDNNFormat(lin));
+#endif
+      } else {
+        // Case2 - transfrom from MKLDNN OPKernel to Non-MKLDNN OPKernel
+        // Do transform via MKLDNN lib
+        TransDataLayoutFromMKLDNN(kernel_type_for_var, expected_kernel_type, in,
+                                  &out);
+      }
+    } else {
+      // Case3 - transfrom between Non-MKLDNN OPKernels
+      TransDataLayout(kernel_type_for_var, expected_kernel_type, in, &out);
+    }
     transformed = true;
     PassTensorData(&out, &in);
   }
diff --git a/paddle/fluid/framework/data_type.cc b/paddle/fluid/framework/data_type.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b6b93cf422a60c1d8e9cb8b477efd562f9fe4758
--- /dev/null
+++ b/paddle/fluid/framework/data_type.cc
@@ -0,0 +1,102 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/data_type.h"
+#include <stdint.h>
+#include <string>
+#include <unordered_map>
+
+namespace paddle {
+namespace framework {
+
+struct DataTypeMap {
+  std::unordered_map<std::type_index, proto::VarType::Type> cpp_to_proto_;
+  std::unordered_map<int, std::type_index> proto_to_cpp_;
+  std::unordered_map<int, std::string> proto_to_str_;
+  std::unordered_map<std::type_index, size_t> cpp_to_size_;
+};
+
+static DataTypeMap* InitDataTypeMap();
+static DataTypeMap& gDataTypeMap() {
+  static DataTypeMap* g_data_type_map_ = InitDataTypeMap();
+  return *g_data_type_map_;
+}
+
+template <typename T>
+static inline void RegisterType(DataTypeMap* map,
+                                proto::VarType::Type proto_type,
+                                const std::string& name) {
+  map->proto_to_cpp_.emplace(static_cast<int>(proto_type), typeid(T));
+  map->cpp_to_proto_.emplace(typeid(T), proto_type);
+  map->proto_to_str_.emplace(static_cast<int>(proto_type), name);
+  map->cpp_to_size_.emplace(typeid(T), sizeof(T));
+}
+
+static DataTypeMap* InitDataTypeMap() {
+  auto retv = new DataTypeMap();
+
+#define RegType(cc_type, proto_type) \
+  RegisterType<cc_type>(retv, proto_type, #cc_type)
+
+  // NOTE: Add your customize type here.
+  RegType(platform::float16, proto::VarType::FP16);
+  RegType(float, proto::VarType::FP32);
+  RegType(double, proto::VarType::FP64);
+  RegType(int, proto::VarType::INT32);
+  RegType(int64_t, proto::VarType::INT64);
+  RegType(bool, proto::VarType::BOOL);
+  RegType(size_t, proto::VarType::SIZE_T);
+  RegType(int16_t, proto::VarType::INT16);
+  RegType(uint8_t, proto::VarType::UINT8);
+
+#undef RegType
+  return retv;
+}
+
+proto::VarType::Type ToDataType(std::type_index type) {
+  auto it = gDataTypeMap().cpp_to_proto_.find(type);
+  if (it != gDataTypeMap().cpp_to_proto_.end()) {
+    return it->second;
+  }
+  PADDLE_THROW("Not support %s as tensor type", type.name());
+}
+
+std::type_index ToTypeIndex(proto::VarType::Type type) {
+  auto it = gDataTypeMap().proto_to_cpp_.find(static_cast<int>(type));
+  if (it != gDataTypeMap().proto_to_cpp_.end()) {
+    return it->second;
+  }
+  PADDLE_THROW("Not support proto::VarType::Type(%d) as tensor type",
+               static_cast<int>(type));
+}
+
+std::string DataTypeToString(const proto::VarType::Type type) {
+  auto it = gDataTypeMap().proto_to_str_.find(static_cast<int>(type));
+  if (it != gDataTypeMap().proto_to_str_.end()) {
+    return it->second;
+  }
+  PADDLE_THROW("Not support proto::VarType::Type(%d) as tensor type",
+               static_cast<int>(type));
+}
+
+size_t SizeOfType(std::type_index type) {
+  auto it = gDataTypeMap().cpp_to_size_.find(type);
+  if (it != gDataTypeMap().cpp_to_size_.end()) {
+    return it->second;
+  }
+  PADDLE_THROW("Not support %s as tensor type", type.name());
+}
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/data_type.h b/paddle/fluid/framework/data_type.h
index 2a528eb3aa562568c92059250f2c9bc5a75ec103..491413db8c8d66fd907801131e89d9303bdef9f2 100644
--- a/paddle/fluid/framework/data_type.h
+++ b/paddle/fluid/framework/data_type.h
@@ -17,51 +17,14 @@ limitations under the License. */
 #include <typeindex>
 #include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/platform/enforce.h"
+
 #include "paddle/fluid/platform/float16.h"
 
 namespace paddle {
 namespace framework {
 
-inline proto::VarType::Type ToDataType(std::type_index type) {
-  if (typeid(platform::float16).hash_code() == type.hash_code()) {
-    return proto::VarType::FP16;
-  } else if (typeid(const float).hash_code() == type.hash_code()) {
-    // CPPLint complains Using C-style cast.  Use static_cast<float>() instead
-    // One fix to this is to replace float with const float because
-    // typeid(T) == typeid(const T)
-    // http://en.cppreference.com/w/cpp/language/typeid
-    return proto::VarType::FP32;
-  } else if (typeid(const double).hash_code() == type.hash_code()) {
-    return proto::VarType::FP64;
-  } else if (typeid(const int).hash_code() == type.hash_code()) {
-    return proto::VarType::INT32;
-  } else if (typeid(const int64_t).hash_code() == type.hash_code()) {
-    return proto::VarType::INT64;
-  } else if (typeid(const bool).hash_code() == type.hash_code()) {
-    return proto::VarType::BOOL;
-  } else {
-    PADDLE_THROW("Not supported");
-  }
-}
-
-inline std::type_index ToTypeIndex(proto::VarType::Type type) {
-  switch (type) {
-    case proto::VarType::FP16:
-      return typeid(platform::float16);
-    case proto::VarType::FP32:
-      return typeid(float);
-    case proto::VarType::FP64:
-      return typeid(double);
-    case proto::VarType::INT32:
-      return typeid(int);
-    case proto::VarType::INT64:
-      return typeid(int64_t);
-    case proto::VarType::BOOL:
-      return typeid(bool);
-    default:
-      PADDLE_THROW("Not support type %d", type);
-  }
-}
+extern proto::VarType::Type ToDataType(std::type_index type);
+extern std::type_index ToTypeIndex(proto::VarType::Type type);
 
 template <typename Visitor>
 inline void VisitDataType(proto::VarType::Type type, Visitor visitor) {
@@ -84,37 +47,23 @@ inline void VisitDataType(proto::VarType::Type type, Visitor visitor) {
     case proto::VarType::BOOL:
       visitor.template operator()<bool>();
       break;
-    default:
-      PADDLE_THROW("Not supported");
-  }
-}
-
-inline std::string DataTypeToString(const proto::VarType::Type type) {
-  switch (type) {
-    case proto::VarType::FP16:
-      return "float16";
-    case proto::VarType::FP32:
-      return "float32";
-    case proto::VarType::FP64:
-      return "float64";
+    case proto::VarType::UINT8:
+      visitor.template operator()<uint8_t>();
+      break;
     case proto::VarType::INT16:
-      return "int16";
-    case proto::VarType::INT32:
-      return "int32";
-    case proto::VarType::INT64:
-      return "int64";
-    case proto::VarType::BOOL:
-      return "bool";
+      visitor.template operator()<int16_t>();
+      break;
     default:
-      PADDLE_THROW("Not support type %d", type);
+      PADDLE_THROW("Not supported %d", type);
   }
 }
 
+extern std::string DataTypeToString(const proto::VarType::Type type);
+extern size_t SizeOfType(std::type_index type);
 inline std::ostream& operator<<(std::ostream& out,
                                 const proto::VarType::Type& type) {
   out << DataTypeToString(type);
   return out;
 }
-
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/data_type_transform.cc b/paddle/fluid/framework/data_type_transform.cc
index c0523f3c795b103c0c27081ec5dc717f6a0f11e0..5a57ec20585c26dbcd4251464718fc819148a7a5 100644
--- a/paddle/fluid/framework/data_type_transform.cc
+++ b/paddle/fluid/framework/data_type_transform.cc
@@ -91,6 +91,12 @@ void TransDataType(const OpKernelType& kernel_type_for_var,
     case proto::VarType::BOOL:
       framework::VisitDataType(dst_type, CastDataType<bool>(in, out, ctx));
       break;
+    case proto::VarType::INT16:
+      framework::VisitDataType(dst_type, CastDataType<bool>(in, out, ctx));
+      break;
+    case proto::VarType::UINT8:
+      framework::VisitDataType(dst_type, CastDataType<bool>(in, out, ctx));
+      break;
     default:
       PADDLE_THROW("Not support type %d", src_type);
   }
diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt
index 9de44beafbb69b3510b97afcc43d4b489a029c35..c43826b64cc5140c539df17fdd13d9bee7fefdcd 100644
--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -3,16 +3,17 @@ cc_library(op_handle_base SRCS op_handle_base.cc DEPS var_handle device_context
 cc_library(scale_loss_grad_op_handle SRCS scale_loss_grad_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
 cc_library(fetch_op_handle SRCS fetch_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
 cc_library(computation_op_handle SRCS computation_op_handle.cc DEPS framework_proto scope place operator op_registry)
-cc_library(send_op_handle SRCS send_op_handle.cc DEPS framework_proto scope place operator op_registry)
+cc_library(rpc_op_handle SRCS rpc_op_handle.cc DEPS framework_proto scope place operator op_registry)
 
 cc_library(ssa_graph SRCS ssa_graph.cc DEPS var_handle op_handle_base)
 cc_library(ssa_graph_builder SRCS ssa_graph_builder.cc DEPS ssa_graph)
+cc_library(ssa_graph_printer SRCS ssa_graph_printer.cc DEPS ssa_graph_builder)
 
 cc_library(variable_visitor SRCS variable_visitor.cc DEPS lod_tensor selected_rows)
 
 if(WITH_GPU)
     nv_library(nccl_all_reduce_op_handle SRCS nccl_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
-            dynload_cuda)
+            dynload_cuda variable_visitor)
     set(multi_devices_graph_builder_deps nccl_all_reduce_op_handle)
     nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope ddim dynload_cuda)
     nv_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor dynload_cuda)
@@ -24,9 +25,13 @@ else()
 endif()
 
 cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor)
+cc_library(fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base scope)
 
 cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS ssa_graph_builder computation_op_handle
-        scale_loss_grad_op_handle send_op_handle ${multi_devices_graph_builder_deps} reduce_op_handle broadcast_op_handle)
+        scale_loss_grad_op_handle rpc_op_handle ${multi_devices_graph_builder_deps} reduce_op_handle broadcast_op_handle)
+
+
+cc_library(graph_builder_factory SRCS graph_builder_factory.cc DEPS multi_devices_graph_builder ssa_graph_printer)
 
 cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ssa_graph framework_proto)
 cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope
@@ -36,5 +41,6 @@ cc_test(broadcast_op_test SRCS broadcast_op_handle_test.cc DEPS var_handle op_ha
         device_context broadcast_op_handle)
 cc_test(gather_op_test SRCS gather_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
         device_context gather_op_handle)
-cc_test(reduce_op_handle_test SRCS reduce_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
-        device_context reduce_op_handle )
+cc_library(scope_buffered_ssa_graph_executor SRCS scope_buffered_ssa_graph_executor.cc DEPS ssa_graph_executor)
+#cc_test(reduce_op_handle_test SRCS reduce_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
+#        device_context reduce_op_handle )
diff --git a/paddle/fluid/framework/details/broadcast_op_handle.cc b/paddle/fluid/framework/details/broadcast_op_handle.cc
index 2afa47c81bead6fb104f49886713bf75dc1b4dc0..d5ca061944f33939cea59a5275e691b1966194fa 100644
--- a/paddle/fluid/framework/details/broadcast_op_handle.cc
+++ b/paddle/fluid/framework/details/broadcast_op_handle.cc
@@ -38,9 +38,7 @@ void BroadcastOpHandle::RunImpl() {
       out_var_handles.size(), places_.size(),
       "The number of output should equal to the number of places.");
 
-  // Wait input done, this Wait is asynchronous operation platform::Place
-  // &in_place;
-  WaitInputVarGenerated(*in_var_handle);
+  WaitInputVarGenerated();
 
   std::vector<const Scope *> var_scopes;
   for (auto *s : local_scopes_) {
@@ -50,29 +48,9 @@ void BroadcastOpHandle::RunImpl() {
   auto *in_var =
       var_scopes.at(in_var_handle->scope_idx_)->FindVar(in_var_handle->name_);
   PADDLE_ENFORCE_NOT_NULL(in_var);
-
   Tensor &in_tensor = VariableVisitor::GetMutableTensor(in_var);
 
-  // NOTE: The tensors' Place of input and output must be all on GPU or all on
-  // CPU.
-  for (auto *out_var_handle : out_var_handles) {
-    if (out_var_handle->IsTheSameVar(*in_var_handle)) {
-      continue;
-    }
-    auto t_out_p = out_var_handle->place_;
-    auto *out_var = var_scopes.at(out_var_handle->scope_idx_)
-                        ->FindVar(out_var_handle->name_);
-    PADDLE_ENFORCE_NOT_NULL(out_var);
-    if (platform::is_gpu_place(in_tensor.place())) {
-      PADDLE_ENFORCE(platform::is_gpu_place(t_out_p),
-                     "Places of input and output must be all on GPU.");
-    } else {
-      t_out_p = platform::CPUPlace();
-    }
-    VariableVisitor::ShareDimsAndLoD(*in_var, out_var);
-    VariableVisitor::GetMutableTensor(out_var).mutable_data(t_out_p,
-                                                            in_tensor.type());
-  }
+  InitOutputValue(*in_var_handle, out_var_handles);
 
   if (platform::is_cpu_place(in_tensor.place())) {
     for (auto *out_var_handle : out_var_handles) {
@@ -147,11 +125,37 @@ void BroadcastOpHandle::RunImpl() {
   }
 }
 
-void BroadcastOpHandle::WaitInputVarGenerated(const VarHandle &in_var) {
-  if (in_var.generated_op_) {
-    for (auto &pair : dev_ctxes_) {
-      in_var.generated_op_->Wait(pair.second);
+void BroadcastOpHandle::InitOutputValue(
+    const VarHandle &in_var_handle,
+    const std::vector<VarHandle *> &out_var_handles) const {
+  std::vector<const Scope *> var_scopes;
+  for (auto *s : local_scopes_) {
+    var_scopes.emplace_back(s->FindVar(kLocalExecScopeName)->Get<Scope *>());
+  }
+  auto *in_var =
+      var_scopes.at(in_var_handle.scope_idx_)->FindVar(in_var_handle.name_);
+
+  Tensor &in_tensor = VariableVisitor::GetMutableTensor(in_var);
+
+  // NOTE: The tensors' Place of input and output must be all on GPU or all on
+  // CPU.
+  for (auto *out_var_handle : out_var_handles) {
+    if (out_var_handle->IsTheSameVar(in_var_handle)) {
+      continue;
     }
+    auto t_out_p = out_var_handle->place_;
+    auto *out_var = var_scopes.at(out_var_handle->scope_idx_)
+                        ->FindVar(out_var_handle->name_);
+    PADDLE_ENFORCE_NOT_NULL(out_var);
+    if (is_gpu_place(in_tensor.place())) {
+      PADDLE_ENFORCE(platform::is_gpu_place(t_out_p),
+                     "Places of input and output must be all on GPU.");
+    } else {
+      t_out_p = platform::CPUPlace();
+    }
+    VariableVisitor::ShareDimsAndLoD(*in_var, out_var);
+    VariableVisitor::GetMutableTensor(out_var).mutable_data(t_out_p,
+                                                            in_tensor.type());
   }
 }
 
diff --git a/paddle/fluid/framework/details/broadcast_op_handle.h b/paddle/fluid/framework/details/broadcast_op_handle.h
index 984a95008c0393eff01c2d419cc98949aed14980..8036f756b6d6506684c109ab881d546f38176a10 100644
--- a/paddle/fluid/framework/details/broadcast_op_handle.h
+++ b/paddle/fluid/framework/details/broadcast_op_handle.h
@@ -57,14 +57,16 @@ struct BroadcastOpHandle : public OpHandleBase {
 
  protected:
   void RunImpl() override;
-  void WaitInputVarGenerated(const VarHandle &in_var);
 
  private:
-  const std::vector<Scope *> &local_scopes_;
-  const std::vector<platform::Place> &places_;
+  std::vector<Scope *> local_scopes_;
+  std::vector<platform::Place> places_;
 #ifdef PADDLE_WITH_CUDA
   const platform::NCCLContextMap *nccl_ctxs_;
 #endif
+
+  void InitOutputValue(const VarHandle &in_var_handle,
+                       const std::vector<VarHandle *> &out_var_handles) const;
 };
 }  // namespace details
 }  // namespace framework
diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h
new file mode 100644
index 0000000000000000000000000000000000000000..64e83acb4dc1995800c4ca3caf81668b24a7c9fe
--- /dev/null
+++ b/paddle/fluid/framework/details/build_strategy.h
@@ -0,0 +1,40 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+struct BuildStrategy {
+  enum class ReduceStrategy { kAllReduce = 0, kReduce = 1 };
+
+  enum class GradientScaleStrategy {
+    kCoeffNumDevice = 0,
+    kOne = 1,
+    kCustomized = 2,
+  };
+
+  ReduceStrategy reduce_{ReduceStrategy::kAllReduce};
+  GradientScaleStrategy gradient_scale_{GradientScaleStrategy::kCoeffNumDevice};
+
+  std::string debug_graphviz_path_{""};
+};
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/computation_op_handle.cc b/paddle/fluid/framework/details/computation_op_handle.cc
index 7ff0efe09387b7e5d7cfe0dfe5e129ca9914d90b..df05bb06333d6b964f2f5434c3d43214e5d2cb7a 100644
--- a/paddle/fluid/framework/details/computation_op_handle.cc
+++ b/paddle/fluid/framework/details/computation_op_handle.cc
@@ -26,20 +26,20 @@ ComputationOpHandle::ComputationOpHandle(const OpDesc &op_desc, Scope *scope,
       place_(place) {}
 
 void ComputationOpHandle::RunImpl() {
-  auto *cur_ctx = dev_ctxes_[place_];
-  for (auto *in : inputs_) {
-    bool need_wait = in->generated_op_ &&
-                     in->generated_op_->DeviceContext(place_) != cur_ctx;
-    if (need_wait) {
-      in->generated_op_->Wait(cur_ctx);
-    }
-  }
+  WaitInputVarGenerated(place_);
 
   this->RunAndRecordEvent([this] {
     op_->Run(*scope_->FindVar(kLocalExecScopeName)->Get<Scope *>(), place_);
   });
 }
 
+bool ComputationOpHandle::NeedWait(VarHandleBase *in_var) {
+  bool need_wait =
+      in_var && in_var->generated_op_ &&
+      in_var->generated_op_->DeviceContext(place_) != dev_ctxes_[place_];
+  return need_wait;
+}
+
 std::string ComputationOpHandle::Name() const { return op_->Type(); }
 }  // namespace details
 }  // namespace framework
diff --git a/paddle/fluid/framework/details/computation_op_handle.h b/paddle/fluid/framework/details/computation_op_handle.h
index c363b973d9abbae6bea76c2458fbe82a37a342ca..f048f973fdeb6cf7d1485cda8cea7d530d9ba465 100644
--- a/paddle/fluid/framework/details/computation_op_handle.h
+++ b/paddle/fluid/framework/details/computation_op_handle.h
@@ -36,6 +36,8 @@ struct ComputationOpHandle : public OpHandleBase {
  protected:
   void RunImpl() override;
 
+  bool NeedWait(VarHandleBase *in_var) override;
+
  private:
   std::unique_ptr<OperatorBase> op_;
   Scope *scope_;
diff --git a/paddle/fluid/framework/details/execution_strategy.h b/paddle/fluid/framework/details/execution_strategy.h
new file mode 100644
index 0000000000000000000000000000000000000000..e7aa74742f827efabff1189d3213edd748d9082d
--- /dev/null
+++ b/paddle/fluid/framework/details/execution_strategy.h
@@ -0,0 +1,30 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+struct ExecutionStrategy {
+  size_t num_threads_{0};
+  bool use_event_{true};
+  bool allow_op_delay_{false};
+  size_t num_iteration_per_drop_scope_{100};
+};
+
+}  //  namespace details
+}  //  namespace framework
+}  //  namespace paddle
diff --git a/paddle/fluid/framework/details/fetch_op_handle.cc b/paddle/fluid/framework/details/fetch_op_handle.cc
index 1e8ca20b51d43554cf1898b41b31c27b90e6c642..224e8e1f6efd7a894591ac51c929517cae7539ce 100644
--- a/paddle/fluid/framework/details/fetch_op_handle.cc
+++ b/paddle/fluid/framework/details/fetch_op_handle.cc
@@ -31,7 +31,7 @@ FetchOpHandle::~FetchOpHandle() {
   }
 }
 
-void FetchOpHandle::Wait(platform::DeviceContext *waited_dev) {
+void FetchOpHandle::RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) {
   PADDLE_THROW("Nobody should wait FetchOp. Unexpceted Error");
 }
 
@@ -45,24 +45,21 @@ void FetchOpHandle::WaitAndMergeCPUTensors() const {
 }
 
 void FetchOpHandle::RunImpl() {
-  auto cpu_ctx =
-      platform::DeviceContextPool::Instance().Get(platform::CPUPlace());
-  for (auto *input : inputs_) {
-    auto *var = static_cast<VarHandle *>(input);
-    var->generated_op_->Wait(cpu_ctx);
-  }
+  WaitInputVarGenerated(platform::CPUPlace());
+
   tensors_.resize(inputs_.size());
-  auto *var_handle = static_cast<VarHandle *>(inputs_[0]);
-  auto &var_name = var_handle->name_;
   platform::CPUPlace cpu;
   auto &scopes = *local_scopes_;
 
-  for (size_t i = 0; i < scopes.size(); ++i) {
-    auto &scope = scopes[i];
-    auto *var =
-        scope->FindVar(kLocalExecScopeName)->Get<Scope *>()->FindVar(var_name);
+  for (size_t i = 0; i < inputs_.size(); ++i) {
+    auto *var_handle = static_cast<VarHandle *>(inputs_[i]);
+    auto &scope = scopes.at(var_handle->scope_idx_);
+    auto *var = scope->FindVar(kLocalExecScopeName)
+                    ->Get<Scope *>()
+                    ->FindVar(var_handle->name_);
     PADDLE_ENFORCE_NOT_NULL(var, "Cannot find variable %s in execution scope",
-                            var_name);
+                            var_handle->name_);
+
     auto &t = var->Get<framework::LoDTensor>();
     if (platform::is_gpu_place(t.place())) {
 #ifdef PADDLE_WITH_CUDA
@@ -77,6 +74,15 @@ void FetchOpHandle::RunImpl() {
   this->WaitAndMergeCPUTensors();
 }
 
+void FetchOpHandle::WaitInputVarGenerated(const platform::Place &place) {
+  auto cpu_ctx = platform::DeviceContextPool::Instance().Get(place);
+  for (auto *input : inputs_) {
+    if (input->generated_op_) {
+      input->generated_op_->RecordWaitEventOnCtx(cpu_ctx);
+    }
+  }
+}
+
 std::string FetchOpHandle::Name() const { return "Fetch"; }
 
 }  // namespace details
diff --git a/paddle/fluid/framework/details/fetch_op_handle.h b/paddle/fluid/framework/details/fetch_op_handle.h
index b49f3df338dc11310a4a0c27c8aaae3602373fcc..e09bdd1d3338bb175c1ddae35b53f98197b68e9a 100644
--- a/paddle/fluid/framework/details/fetch_op_handle.h
+++ b/paddle/fluid/framework/details/fetch_op_handle.h
@@ -33,7 +33,7 @@ struct FetchOpHandle : public OpHandleBase {
 
   ~FetchOpHandle();
 
-  void Wait(platform::DeviceContext *waited_dev) override;
+  void RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) override;
 
   void WaitAndMergeCPUTensors() const;
 
@@ -42,6 +42,8 @@ struct FetchOpHandle : public OpHandleBase {
  protected:
   void RunImpl() override;
 
+  void WaitInputVarGenerated(const platform::Place &place) override;
+
  private:
   FeedFetchList *data_;
   size_t offset_;
diff --git a/paddle/fluid/framework/details/fuse_vars_op_handle.cc b/paddle/fluid/framework/details/fuse_vars_op_handle.cc
new file mode 100644
index 0000000000000000000000000000000000000000..32415c192f0be51bf0850fe533c212c635779a30
--- /dev/null
+++ b/paddle/fluid/framework/details/fuse_vars_op_handle.cc
@@ -0,0 +1,51 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/fuse_vars_op_handle.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+void FuseVarsOpHandle::RunImpl() {
+  WaitInputVarGenerated(place_);
+
+  auto in_var_handles = DynamicCast<VarHandle>(this->Inputs());
+  auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
+  PADDLE_ENFORCE_EQ(in_var_handles.size(), 0);
+  PADDLE_ENFORCE_EQ(out_var_handles.size() - 1, inputs_numel_.size(), "");
+
+  auto scope = local_scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();
+
+  auto out_var_handle = out_var_handles[0];
+  auto out_var = scope->Var(out_var_handle->name_);
+
+  auto out_tensor = out_var->GetMutable<LoDTensor>();
+  out_tensor->Resize({total_numel_}).mutable_data(this->place_, type_);
+
+  int64_t s = 0;
+  for (size_t i = 1; i < out_var_handles.size(); ++i) {
+    auto out_name = out_var_handles[i]->name_;
+    auto out_t = scope->Var(out_name)->GetMutable<LoDTensor>();
+    auto numel = this->inputs_numel_.at(out_name);
+    out_t->ShareDataWith(out_tensor->Slice(s, s + numel));
+    s += numel;
+  }
+  this->RunAndRecordEvent([this] {});
+}
+
+std::string FuseVarsOpHandle::Name() const { return "fuse vars"; }
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/fuse_vars_op_handle.h b/paddle/fluid/framework/details/fuse_vars_op_handle.h
new file mode 100644
index 0000000000000000000000000000000000000000..140fb5bb49a33146de974b6d79559b4cf15bdd7b
--- /dev/null
+++ b/paddle/fluid/framework/details/fuse_vars_op_handle.h
@@ -0,0 +1,63 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <map>
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/framework/details/container_cast.h"
+#include "paddle/fluid/framework/details/op_handle_base.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/platform/device_context.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+struct FuseVarsOpHandle : public OpHandleBase {
+ public:
+  FuseVarsOpHandle(Scope *local_scope, const platform::Place &place,
+                   const std::unordered_map<std::string, int64_t> &inputs_numel,
+                   const std::type_index &var_type)
+      : local_scope_(local_scope),
+        place_(place),
+        inputs_numel_(inputs_numel),
+        type_(var_type) {
+    total_numel_ = 0;
+    for (auto in_numel : inputs_numel) {
+      PADDLE_ENFORCE_GT(in_numel.second, 0);
+      total_numel_ += in_numel.second;
+    }
+  }
+
+  std::string Name() const override;
+
+  bool IsMultiDeviceTransfer() override { return false; };
+
+ protected:
+  void RunImpl() override;
+
+ private:
+  Scope *local_scope_;
+  const platform::Place place_;
+  const std::unordered_map<std::string, int64_t> inputs_numel_;
+  const std::type_index type_;
+  int64_t total_numel_;
+};
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/gather_op_handle.cc b/paddle/fluid/framework/details/gather_op_handle.cc
index 3dfc972a44c62bd2adfc1331f29ffb1cca537652..2be02304566cf5dbe348fa01fc4171990eafd158 100644
--- a/paddle/fluid/framework/details/gather_op_handle.cc
+++ b/paddle/fluid/framework/details/gather_op_handle.cc
@@ -55,7 +55,7 @@ void GatherOpHandle::RunImpl() {
                  "Currently, gather_op only can gather SelectedRows.");
 
   // Wait input done, this Wait is asynchronous operation
-  WaitInputVarGenerated(in_var_handles);
+  WaitInputVarGenerated();
 
   auto &pre_in_value = pre_in_var->Get<framework::SelectedRows>();
   std::vector<int64_t> out_rows;
@@ -111,17 +111,6 @@ void GatherOpHandle::RunImpl() {
   });
 }
 
-void GatherOpHandle::WaitInputVarGenerated(
-    const std::vector<VarHandle *> &in_var_handles) {
-  for (auto *in : in_var_handles) {
-    if (in->generated_op_) {
-      for (auto pair : dev_ctxes_) {
-        in->generated_op_->Wait(pair.second);
-      }
-    }
-  }
-}
-
 std::string GatherOpHandle::Name() const { return "gather"; }
 }  // namespace details
 }  // namespace framework
diff --git a/paddle/fluid/framework/details/gather_op_handle.h b/paddle/fluid/framework/details/gather_op_handle.h
index c394dd7a14b07cb956aa1aedfc0df4fa25744dd7..d11ef8556aa8840949ca8dc7aa176413f70b9f22 100644
--- a/paddle/fluid/framework/details/gather_op_handle.h
+++ b/paddle/fluid/framework/details/gather_op_handle.h
@@ -39,7 +39,6 @@ struct GatherOpHandle : public OpHandleBase {
 
  protected:
   void RunImpl() override;
-  void WaitInputVarGenerated(const std::vector<VarHandle *> &in_var_handles);
 
  private:
   const std::vector<Scope *> &local_scopes_;
diff --git a/paddle/fluid/framework/details/graph_builder_factory.cc b/paddle/fluid/framework/details/graph_builder_factory.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a04b9bb63c06b40ff5c30c9792cdfad5d64d404c
--- /dev/null
+++ b/paddle/fluid/framework/details/graph_builder_factory.cc
@@ -0,0 +1,47 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/graph_builder_factory.h"
+#include <fstream>
+#include "paddle/fluid/framework/details/multi_devices_graph_builder.h"
+#include "paddle/fluid/framework/details/ssa_graph_printer.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+std::unique_ptr<SSAGraphBuilder> SSAGraphBuilderFactory::Create() {
+  std::unique_ptr<SSAGraphBuilder> res(
+#ifdef PADDLE_WITH_CUDA
+      new MultiDevSSAGraphBuilder(places_, loss_var_name_, param_names_,
+                                  local_scopes_, nccl_ctxs_, strategy_)
+#else
+      new MultiDevSSAGraphBuilder(places_, loss_var_name_, param_names_,
+                                  local_scopes_, strategy_)
+#endif
+          );  // NOLINT
+
+  if (!strategy_.debug_graphviz_path_.empty()) {
+    std::unique_ptr<std::ostream> fout(
+        new std::ofstream(strategy_.debug_graphviz_path_));
+    PADDLE_ENFORCE(fout->good());
+    std::unique_ptr<GraphvizSSAGraphPrinter> graphviz_printer(
+        new GraphvizSSAGraphPrinter());
+    res.reset(new SSAGraghBuilderWithPrinter(
+        std::move(fout), std::move(graphviz_printer), std::move(res)));
+  }
+  return res;
+}
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/graph_builder_factory.h b/paddle/fluid/framework/details/graph_builder_factory.h
new file mode 100644
index 0000000000000000000000000000000000000000..857ab12d684e19788597e144fc0c46571d06aafc
--- /dev/null
+++ b/paddle/fluid/framework/details/graph_builder_factory.h
@@ -0,0 +1,67 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <memory>
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/details/build_strategy.h"
+#include "paddle/fluid/framework/details/ssa_graph_builder.h"
+#include "paddle/fluid/platform/place.h"
+
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/platform/nccl_helper.h"
+#endif
+
+namespace paddle {
+namespace framework {
+class Scope;
+namespace details {
+
+class SSAGraphBuilderFactory {
+ public:
+  SSAGraphBuilderFactory(const std::vector<platform::Place>& places,
+                         const std::string& loss_var_name,
+                         const std::unordered_set<std::string>& param_names,
+                         const std::vector<Scope*>& local_scopes,
+                         const BuildStrategy& strategy)
+      : places_(places),
+        loss_var_name_(loss_var_name),
+        param_names_(param_names),
+        local_scopes_(local_scopes),
+        strategy_(strategy) {}
+
+#ifdef PADDLE_WITH_CUDA
+  void SetNCCLContextMap(platform::NCCLContextMap* nccl_ctxs) {
+    nccl_ctxs_ = nccl_ctxs;
+  }
+#endif
+
+  std::unique_ptr<SSAGraphBuilder> Create();
+
+ private:
+  std::vector<platform::Place> places_;
+  std::string loss_var_name_;
+  std::unordered_set<std::string> param_names_;
+  std::vector<Scope*> local_scopes_;
+  BuildStrategy strategy_;
+
+#ifdef PADDLE_WITH_CUDA
+  platform::NCCLContextMap* nccl_ctxs_;
+#endif
+};
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
index 21197d587b772aa046d6b3ce4394d3057ed6bf35..97242ebf2af304e1498e2ef37cd87d1ef07fb6df 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
@@ -11,22 +11,25 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#include "paddle/fluid/framework/details/multi_devices_graph_builder.h"
+#include <algorithm>
+#include <fstream>
+#include <string>
 #include <utility>
+#include <vector>
+
 #include "paddle/fluid/framework/details/broadcast_op_handle.h"
 #include "paddle/fluid/framework/details/computation_op_handle.h"
+#include "paddle/fluid/framework/details/multi_devices_graph_builder.h"
 #include "paddle/fluid/framework/details/reduce_op_handle.h"
+#include "paddle/fluid/framework/details/rpc_op_handle.h"
 #include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h"
-#include "paddle/fluid/framework/details/send_op_handle.h"
+#include "paddle/fluid/framework/op_info.h"
 #include "paddle/fluid/framework/scope.h"
 
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/framework/details/nccl_all_reduce_op_handle.h"
 #endif
 
-#include <string>
-#include <vector>
-
 namespace paddle {
 namespace framework {
 namespace details {
@@ -37,25 +40,26 @@ MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder(
     const std::string &loss_var_name,
     const std::unordered_set<std::string> &params,
     const std::vector<Scope *> &local_scopes,
-    platform::NCCLContextMap *nccl_ctxs, bool use_default_grad_scale)
+    platform::NCCLContextMap *nccl_ctxs, const BuildStrategy &strategy)
     : loss_var_name_(loss_var_name),
       places_(places),
       local_scopes_(local_scopes),
-      nccl_ctxs_(nccl_ctxs) {
+      nccl_ctxs_(nccl_ctxs),
+      strategy_(strategy) {
 #else
 MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder(
     const std::vector<platform::Place> &places,
     const std::string &loss_var_name,
     const std::unordered_set<std::string> &params,
-    const std::vector<Scope *> &local_scopes, bool use_default_grad_scale)
+    const std::vector<Scope *> &local_scopes, const BuildStrategy &strategy)
     : loss_var_name_(loss_var_name),
       places_(places),
-      local_scopes_(local_scopes) {
+      local_scopes_(local_scopes),
+      strategy_(strategy) {
 #endif
   for (auto &p : params) {
     grad_names_.insert(GradVarName(p));
   }
-  use_default_grad_scale_ = use_default_grad_scale;
 }
 
 void MultiDevSSAGraphBuilder::CreateOpHandleIOs(SSAGraph *result,
@@ -77,9 +81,44 @@ void MultiDevSSAGraphBuilder::CreateOpHandleIOs(SSAGraph *result,
   }
 }
 
-bool MultiDevSSAGraphBuilder::IsDistTrainOp(const OpDesc &op,
-                                            OpDesc *send_op) const {
-  if (send_op == nullptr) {
+std::vector<std::string> MultiDevSSAGraphBuilder::FindDistTrainSendVars(
+    const ProgramDesc &program) const {
+  std::vector<std::string> send_vars;
+  // since parameters are all in block 0,
+  // it's enough to only scan send ops in block 0
+  for (auto *op : program.Block(0).AllOps()) {
+    // TODO(Yancey1989): use a graceful method to find send op,
+    // instead of the the hard code string
+    if (op->Type() == "send_vars") {
+      auto op_vars = op->InputArgumentNames();
+      send_vars.reserve(send_vars.size() +
+                        std::distance(op_vars.begin(), op_vars.end()));
+      send_vars.insert(send_vars.end(), op_vars.begin(), op_vars.end());
+    }
+  }
+  return send_vars;
+}
+
+std::vector<std::string> MultiDevSSAGraphBuilder::FindDistTrainRecvVars(
+    const ProgramDesc &program) const {
+  std::vector<std::string> recv_vars;
+  for (auto *op : program.Block(0).AllOps()) {
+    // TODO(Yancey1989): use a graceful method to find recv op,
+    // instead of the hard code string
+    if (op->Type() == "recv") {
+      auto op_vars = op->OutputArgumentNames();
+      recv_vars.reserve(recv_vars.size() +
+                        std::distance(op_vars.begin(), op_vars.end()));
+      recv_vars.insert(recv_vars.end(), op_vars.begin(), op_vars.end());
+    }
+  }
+  return recv_vars;
+}
+
+bool MultiDevSSAGraphBuilder::IsDistTrainOp(
+    const OpDesc &op, const std::vector<std::string> &send_vars,
+    const std::vector<std::string> &recv_vars) const {
+  if (send_vars.size() == 0 || recv_vars.size() == 0) {
     return false;
   }
 
@@ -87,29 +126,28 @@ bool MultiDevSSAGraphBuilder::IsDistTrainOp(const OpDesc &op,
    * Check any of opvars contains `.block` and in sendvars
    */
   auto checker = [](const std::vector<std::string> &opvars,
-                    const std::vector<std::string> &sendvars) -> bool {
+                    const std::vector<std::string> &rpc_vars) -> bool {
     for (auto &var : opvars) {
+      // a variable name with the suffix `.block` means it's a splited
+      // variable by (DistributeTranspiler)
+      // [python/paddle/fluid/transpiler/distribute_transpiler.py]
       if (var.find(".block") != std::string::npos &&
-          std::find(sendvars.begin(), sendvars.end(), var) != sendvars.end()) {
+          std::find(rpc_vars.begin(), rpc_vars.end(), var) != rpc_vars.end()) {
         return true;
       }
     }
     return false;
   };
 
-  if (op.Type() == "split") {
-    return checker(op.OutputArgumentNames(), send_op->InputArgumentNames());
-  } else if (op.Type() == "concat") {
-    return checker(op.InputArgumentNames(), send_op->OutputArgumentNames());
-  }
-  return false;
+  return checker(op.OutputArgumentNames(), send_vars) ||
+         checker(op.InputArgumentNames(), recv_vars);
 }
 
 std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
     const ProgramDesc &program) const {
-  std::unordered_map<std::string, proto::VarType::Type> var_types;
+  std::unordered_map<std::string, VarDesc *> all_vars;
   for (auto *var : program.Block(0).AllVars()) {
-    var_types[var->Name()] = var->GetType();
+    all_vars[var->Name()] = var;
   }
 
   auto graph = new SSAGraph();
@@ -121,42 +159,109 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
       std::unordered_map<std::string, std::vector<std::unique_ptr<VarHandle>>>>(
       places_.size());
 
-  // Find "send" op first for split is in front of send.
-  OpDesc *send_op = GetSendOpDesc(program);
+  // find send/recv vars so that we can place the distributed training
+  // realted op in the place 0
+  auto send_vars = FindDistTrainSendVars(program);
+  auto recv_vars = FindDistTrainRecvVars(program);
+
+  std::vector<std::unordered_set<std::string>> var_name_on_devices;
+  std::vector<std::unordered_set<std::string>> bcast_var_name_set;
+  var_name_on_devices.resize(places_.size());
+  bcast_var_name_set.resize(places_.size());
+
+  size_t cur_device_id = 0;
+  std::vector<int64_t> balance_grads(places_.size(), 0);
+
+  auto get_appropriate_dev = [&](std::string &g_name) -> size_t {
+    auto var_desc = all_vars.at(g_name);
+    PADDLE_ENFORCE_NOT_NULL(var_desc);
+    auto dim = framework::make_ddim(var_desc->GetShape());
+    int64_t numel = framework::product(dim);
+    PADDLE_ENFORCE_GE(numel, 0);
+    auto smallest =
+        std::min_element(std::begin(balance_grads), std::end(balance_grads));
+    size_t dev_id =
+        static_cast<size_t>(std::distance(std::begin(balance_grads), smallest));
+    balance_grads[dev_id] += numel;
+    return dev_id;
+  };
 
   bool is_forwarding = true;
   for (auto *op : program.Block(0).AllOps()) {
-    if (op->Type() == "send") {
-      // append send op if program is distributed trainer main program.
+    if (boost::get<int>(
+            op->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) ==
+        static_cast<int>(OpRole::kRPC)) {
+      // append rpc op if program is distributed trainer main program.
       // always use the first device
-      CreateSendOp(&result, *op);
-    } else if (IsDistTrainOp(*op, send_op)) {
-      CreateComputationalOps(&result, *op, 1);
+      CreateRPCOp(&result, *op);
+    } else if (IsDistTrainOp(*op, send_vars, recv_vars)) {
+      CreateDistTrainOp(&result, *op);
     } else if (IsScaleLossOp(*op)) {
       // user can customize loss@grad if not use_default_grad_scale_
-      if (use_default_grad_scale_) {
+      if (strategy_.gradient_scale_ !=
+          BuildStrategy::GradientScaleStrategy::kCustomized) {
         CreateScaleLossGradOp(&result);
       }
       is_forwarding = false;
     } else {
-      CreateComputationalOps(&result, *op, places_.size());
+      int op_dev_id = GetOpDeviceID(var_name_on_devices, *op);
+      if (op_dev_id == -1) {  // var on all device
+        CreateComputationalOps(&result, *op, places_.size());
+      } else {
+        CreateComputationalOp(&result, *op, op_dev_id);
+        for (auto &var_name : op->OutputArgumentNames()) {
+          var_name_on_devices[op_dev_id].emplace(var_name);
+        }
+      }
       if (!is_forwarding && places_.size() > 1) {
         // Currently, we assume that once gradient is generated, it can be
         // broadcast, and each gradient is only broadcast once.
-        for (auto &og : op->OutputArgumentNames()) {
-          if (IsParameterGradientOnce(og, &og_has_been_broadcast)) {
-            if (IsSparseGradient(var_types, og)) {
-              CreateReduceOp(&result, og, 0);
-              CreateBroadcastOp(&result, og, 0);
-            } else {
-              InsertNCCLAllReduceOp(&result, og);
+        if (static_cast<bool>(boost::get<int>(op->GetAttr(
+                                  OpProtoAndCheckerMaker::OpRoleAttrName())) &
+                              static_cast<int>(OpRole::kBackward))) {
+          try {
+            auto backward_vars =
+                boost::get<std::vector<std::string>>(op->GetNullableAttr(
+                    OpProtoAndCheckerMaker::OpRoleVarAttrName()));
+
+            PADDLE_ENFORCE_EQ(backward_vars.size() % 2, 0);
+
+            for (size_t i = 0; i < backward_vars.size(); i += 2) {
+              auto &p_name = backward_vars[i];
+              auto &g_name = backward_vars[i + 1];
+              VLOG(10) << "Bcast " << g_name << " for parameter " << p_name;
+
+              switch (strategy_.reduce_) {
+                case BuildStrategy::ReduceStrategy::kReduce:
+                  cur_device_id = get_appropriate_dev(g_name);
+                  CreateReduceOp(&result, g_name, cur_device_id);
+                  var_name_on_devices[cur_device_id].emplace(g_name);
+                  bcast_var_name_set[cur_device_id].emplace(p_name);
+                  break;
+                case BuildStrategy::ReduceStrategy::kAllReduce:
+                  if (IsSparseGradient(all_vars, g_name)) {
+                    CreateReduceOp(&result, g_name, 0);
+                    CreateBroadcastOp(&result, g_name, 0);
+                  } else {
+                    InsertNCCLAllReduceOp(&result, g_name);
+                  }
+                  break;
+              }
             }
+          } catch (boost::bad_get e) {
           }
         }
       }
     }
   }
 
+  // Insert BCast Ops
+  for (size_t dev_id = 0; dev_id < bcast_var_name_set.size(); ++dev_id) {
+    auto &to_bcast_set = bcast_var_name_set[dev_id];
+    for (auto &bcast_name : to_bcast_set) {
+      CreateBroadcastOp(&result, bcast_name, dev_id);
+    }
+  }
   /*
     Dependency graph has been constructed. However, there are still data
     harzaeds need to be handled.
@@ -168,20 +273,14 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
    */
   AddOutputToLeafOps(&result);
 
-  if (VLOG_IS_ON(10)) {
-    std::ostringstream sout;
-    PrintGraphviz(*graph, sout);
-    VLOG(10) << sout.str();
-  }
-
   return std::unique_ptr<SSAGraph>(graph);
 }
 
 bool MultiDevSSAGraphBuilder::IsSparseGradient(
-    const std::unordered_map<std::string, proto::VarType::Type> &var_types,
+    const std::unordered_map<std::string, VarDesc *> &all_vars,
     const std::string &og) const {
-  PADDLE_ENFORCE(var_types.count(og) != 0);
-  if (var_types.at(og) == proto::VarType::SELECTED_ROWS) {
+  PADDLE_ENFORCE(all_vars.count(og) != 0);
+  if (all_vars.at(og)->GetType() == proto::VarType::SELECTED_ROWS) {
     return true;
   }
   return false;
@@ -221,15 +320,6 @@ void MultiDevSSAGraphBuilder::CreateComputationalOp(SSAGraph *result,
   CreateOpHandleIOs(result, op, dev_id);
 }
 
-OpDesc *MultiDevSSAGraphBuilder::GetSendOpDesc(
-    const ProgramDesc &program) const {
-  for (auto *op : program.Block(0).AllOps()) {
-    if (op->Type() == "send") {
-      return op;
-    }
-  }
-  return nullptr;
-}
 void MultiDevSSAGraphBuilder::InsertNCCLAllReduceOp(
     SSAGraph *result, const std::string &og) const {
 #ifdef PADDLE_WITH_CUDA
@@ -265,6 +355,26 @@ bool MultiDevSSAGraphBuilder::IsParameterGradientOnce(
   return is_pg_once;
 }
 
+int MultiDevSSAGraphBuilder::GetOpDeviceID(
+    const std::vector<std::unordered_set<std::string>> &var_name_on_devices,
+    const OpDesc &op) const {
+  if (strategy_.reduce_ != BuildStrategy::ReduceStrategy::kReduce) {
+    return -1;
+  }
+
+  int var_dev_id = -1;
+  for (auto &var_name : op.InputArgumentNames()) {
+    if (var_dev_id != -1) break;
+    for (size_t i = 0; i < var_name_on_devices.size(); ++i) {
+      if (var_name_on_devices[i].count(var_name)) {
+        var_dev_id = static_cast<int>(i);
+        break;
+      }
+    }
+  }
+  return var_dev_id;
+}
+
 void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(SSAGraph *result) const {
   for (size_t i = 0; i < places_.size(); ++i) {
 // Insert ScaleCost OpHandle
@@ -332,23 +442,57 @@ VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(SSAGraph *result,
   return var;
 }
 
-void MultiDevSSAGraphBuilder::CreateSendOp(SSAGraph *result,
-                                           const OpDesc &op) const {
-  auto &p = places_[0];
-  auto *s = local_scopes_[0];
-  // FIXME(wuyi): send op always copy from GPU 0
-  result->ops_.emplace_back(new SendOpHandle(op, s, p));
-  // Create inputs for output on original place and no ssa output
-  // is created for send op.
+void MultiDevSSAGraphBuilder::ConnectOp(SSAGraph *result, OpHandleBase *op,
+                                        const std::string &prev_op_name) const {
+  for (auto &prev_op : result->ops_) {
+    if (prev_op->Name() == prev_op_name) {
+      auto *dep_var = new DummyVarHandle();
+      prev_op->AddOutput(dep_var);
+      result->dep_vars_.emplace(dep_var);
+      op->AddInput(dep_var);
+    }
+  }
+}
+
+void MultiDevSSAGraphBuilder::CreateDistTrainOp(SSAGraph *result,
+                                                const OpDesc &op) const {
+  CreateComputationalOp(result, op, 0);
+  if (op.Type() == "concat") {
+    ConnectOp(result, result->ops_.back().get(), "fetch_barrier");
+  }
+}
+
+void MultiDevSSAGraphBuilder::CreateRPCOp(SSAGraph *result,
+                                          const OpDesc &op) const {
+  result->ops_.emplace_back(
+      new RPCOpHandle(op, local_scopes_[0], op.Type(), places_[0]));
+
+  if (op.Type() == "send_barrier") {
+    ConnectOp(result, result->ops_.back().get(), "send_vars");
+  } else if (op.Type() == "recv") {
+    ConnectOp(result, result->ops_.back().get(), "send_barrier");
+  } else if (op.Type() == "fetch_barrier") {
+    ConnectOp(result, result->ops_.back().get(), "recv");
+  } else if (op.Type() == "send_vars") {
+    // do nothing
+  } else {
+    PADDLE_THROW(
+        "rpc op should be in ["
+        "send_vars, send_barrier. recv, fetch_barrier]");
+  }
+
+  // TODO(Yancey1989): schedule rpc op on different place may
+  // increate throughput
   CreateOpHandleIOs(result, op, 0);
 }
 
 bool MultiDevSSAGraphBuilder::IsScaleLossOp(const OpDesc &op) const {
-  // FIXME(yy): Do not hard code like this
-  return op.OutputArgumentNames().size() == 1 &&
-         op.OutputArgumentNames()[0] == GradVarName(loss_var_name_);
+  return boost::get<int>(
+             op.GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) ==
+             (static_cast<int>(OpRole::kBackward) |
+              static_cast<int>(OpRole::kLoss)) &&
+         !loss_var_name_.empty();  // If loss_var is empty. This is test mode
 }
-
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.h b/paddle/fluid/framework/details/multi_devices_graph_builder.h
index 674e2779a112c26d05b84cd54df2c826e9a63373..544cbe585c7423b5f3eb98ee698ca5668376f1ca 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_builder.h
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.h
@@ -17,6 +17,7 @@
 #include <utility>
 #include <vector>
 
+#include "paddle/fluid/framework/details/build_strategy.h"
 #include "paddle/fluid/framework/details/ssa_graph_builder.h"
 
 namespace paddle {
@@ -36,13 +37,13 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
                           const std::unordered_set<std::string> &params,
                           const std::vector<Scope *> &local_scopes,
                           platform::NCCLContextMap *nccl_ctxs,
-                          bool use_default_grad_scale);
+                          const BuildStrategy &strategy);
 #else
   MultiDevSSAGraphBuilder(const std::vector<platform::Place> &places,
                           const std::string &loss_var_name,
                           const std::unordered_set<std::string> &params,
                           const std::vector<Scope *> &local_scopes,
-                          bool use_default_grad_scale);
+                          const BuildStrategy &strategy);
 #endif
 
   std::unique_ptr<SSAGraph> Build(const ProgramDesc &program) const override;
@@ -60,16 +61,27 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
 #ifdef PADDLE_WITH_CUDA
   platform::NCCLContextMap *nccl_ctxs_;
 #endif
-  bool use_default_grad_scale_;
 
   bool IsScaleLossOp(const OpDesc &op) const;
 
-  void CreateSendOp(SSAGraph *result, const OpDesc &op) const;
+  void CreateRPCOp(SSAGraph *result, const OpDesc &op) const;
+  void CreateDistTrainOp(SSAGraph *result, const OpDesc &op) const;
 
   /**
    * Is this operator as the end-point operator before/after send operator.
    */
-  bool IsDistTrainOp(const OpDesc &op, OpDesc *send_op) const;
+  bool IsDistTrainOp(const OpDesc &op,
+                     const std::vector<std::string> &send_vars,
+                     const std::vector<std::string> &recv_vars) const;
+
+  std::vector<std::string> FindDistTrainSendVars(
+      const ProgramDesc &program) const;
+
+  std::vector<std::string> FindDistTrainRecvVars(
+      const ProgramDesc &program) const;
+
+  void ConnectOp(SSAGraph *result, OpHandleBase *op,
+                 const std::string &prev_op_name) const;
 
   void CreateComputationalOps(SSAGraph *result, const OpDesc &op,
                               size_t num_places) const;
@@ -84,20 +96,21 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
       const std::string &og,
       std::unordered_set<std::string> *og_has_been_broadcast) const;
 
+  int GetOpDeviceID(
+      const std::vector<std::unordered_set<std::string>> &var_name_on_devices,
+      const OpDesc &op) const;
+
   void InsertNCCLAllReduceOp(SSAGraph *result, const std::string &og) const;
 
   void CreateBroadcastOp(SSAGraph *result, const std::string &p_name,
                          size_t src_dev_id) const;
 
-  /**
-   * Get send op in the global block of program.
-   * nullptr if not found.
-   */
-  OpDesc *GetSendOpDesc(const ProgramDesc &program) const;
-
   bool IsSparseGradient(
-      const std::unordered_map<std::string, proto::VarType::Type> &var_types,
+      const std::unordered_map<std::string, VarDesc *> &all_vars,
       const std::string &og) const;
+
+ private:
+  BuildStrategy strategy_;
 };
 }  // namespace details
 }  // namespace framework
diff --git a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc
index b055bb48f608c9fd9cc671d175cb463d25dc489b..5bba089ade801a06e0364835efe5249105dcfcac 100644
--- a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc
@@ -11,10 +11,12 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
+#include <algorithm>
 
+#include "paddle/fluid/framework/details/container_cast.h"
 #include "paddle/fluid/framework/details/nccl_all_reduce_op_handle.h"
-#include <algorithm>
 #include "paddle/fluid/framework/details/reduce_and_gather.h"
+#include "paddle/fluid/framework/details/variable_visitor.h"
 
 namespace paddle {
 namespace framework {
@@ -30,30 +32,34 @@ NCCLAllReduceOpHandle::NCCLAllReduceOpHandle(
 }
 
 void NCCLAllReduceOpHandle::RunImpl() {
-  if (inputs_.size() == 1) {
+  if (NoDummyInputSize() == 1) {
     return;  // No need to all reduce when GPU count = 1;
   } else {
     // Wait input done
-    for (auto *in : inputs_) {
-      auto &p = static_cast<VarHandle *>(in)->place_;
-      in->generated_op_->Wait(dev_ctxes_[p]);
-    }
-
-    auto &var_name = static_cast<VarHandle *>(this->inputs_[0])->name_;
-    int dtype = -1;
-    size_t numel = 0;
+    WaitInputVarGenerated();
+    auto in_var_handles = DynamicCast<VarHandle>(this->Inputs());
+    auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
+    PADDLE_ENFORCE_EQ(
+        in_var_handles.size(), places_.size(),
+        "The NoDummyInputSize should be equal to the number of places.");
+    PADDLE_ENFORCE_EQ(
+        in_var_handles.size(), out_var_handles.size(),
+        "The NoDummyInputSize and NoDummyOutputSize should be equal.");
 
     std::vector<const LoDTensor *> lod_tensors;
-
     for (size_t i = 0; i < local_scopes_.size(); ++i) {
       auto *s = local_scopes_[i];
       auto &local_scope = *s->FindVar(kLocalExecScopeName)->Get<Scope *>();
-
-      auto &lod_tensor = local_scope.FindVar(var_name)->Get<LoDTensor>();
+      auto &lod_tensor =
+          local_scope.FindVar(in_var_handles[i]->name_)->Get<LoDTensor>();
       lod_tensors.emplace_back(&lod_tensor);
+      PADDLE_ENFORCE_EQ(in_var_handles[i]->name_, out_var_handles[i]->name_,
+                        "The name of input and output should be equal.");
     }
 
     if (platform::is_gpu_place(lod_tensors[0]->place())) {
+      int dtype = -1;
+      size_t numel = 0;
       std::vector<std::function<void()>> all_reduce_calls;
       for (size_t i = 0; i < local_scopes_.size(); ++i) {
         auto &p = places_[i];
@@ -99,7 +105,7 @@ void NCCLAllReduceOpHandle::RunImpl() {
         auto &scope =
             *local_scopes_[i]->FindVar(kLocalExecScopeName)->Get<Scope *>();
         auto &p = places_[i];
-        auto *var = scope.FindVar(var_name);
+        auto *var = scope.FindVar(in_var_handles[i]->name_);
         auto *dev_ctx = dev_ctxes_[p];
 
         RunAndRecordEvent(p, [&trg, var, dev_ctx, p] {
diff --git a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h
index a0c321843e3fc5abcbd1ef2ce2e153250269aa7d..8e98d894b828b4162059b30f5c6a74cfc06f402e 100644
--- a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h
+++ b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h
@@ -41,8 +41,8 @@ struct NCCLAllReduceOpHandle : public OpHandleBase {
   void RunImpl() override;
 
  private:
-  const std::vector<Scope *> &local_scopes_;
-  const std::vector<platform::Place> &places_;
+  std::vector<Scope *> local_scopes_;
+  std::vector<platform::Place> places_;
   const platform::NCCLContextMap &nccl_ctxs_;
 };
 
diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc
index 534d77860f87be08c8834efd373d90eb199ed6a2..3849cca59a3347137b769f97261cfbf97da8d6ff 100644
--- a/paddle/fluid/framework/details/op_handle_base.cc
+++ b/paddle/fluid/framework/details/op_handle_base.cc
@@ -56,15 +56,15 @@ void OpHandleBase::Run(bool use_event) {
   RunImpl();
 }
 
-void OpHandleBase::Wait(platform::DeviceContext *waited_dev) {
+void OpHandleBase::RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) {
 #ifdef PADDLE_WITH_CUDA
-  if (platform::is_cpu_place(waited_dev->GetPlace()) || events_.empty()) {
+  if (platform::is_cpu_place(waited_ctx->GetPlace()) || events_.empty()) {
     for (auto &dev_ctx : dev_ctxes_) {
       dev_ctx.second->Wait();
     }
   } else {
     auto stream =
-        static_cast<platform::CUDADeviceContext *>(waited_dev)->stream();
+        static_cast<platform::CUDADeviceContext *>(waited_ctx)->stream();
     for (auto &ev : events_) {
       PADDLE_ENFORCE(cudaStreamWaitEvent(stream, ev.second, 0));
     }
@@ -86,6 +86,38 @@ void OpHandleBase::AddOutput(VarHandleBase *out) {
   out->generated_op_ = this;
 }
 
+void OpHandleBase::WaitInputVarGenerated() {
+  for (auto in_var : inputs_) {
+    if (NeedWait(in_var)) {
+      for (auto &pair : dev_ctxes_) {
+        in_var->generated_op_->RecordWaitEventOnCtx(pair.second);
+      }
+    }
+  }
+}
+
+void OpHandleBase::WaitInputVarGenerated(const platform::Place &place) {
+  for (auto *in : inputs_) {
+    if (NeedWait(in)) {
+      in->generated_op_->RecordWaitEventOnCtx(dev_ctxes_[place]);
+    }
+  }
+}
+
+size_t OpHandleBase::NoDummyInputSize() const {
+  size_t cnt = 0;
+  for (auto *in : inputs_) {
+    if (dynamic_cast<DummyVarHandle *>(in) == nullptr) {
+      ++cnt;
+    }
+  }
+  return cnt;
+}
+
+bool OpHandleBase::NeedWait(VarHandleBase *in_var) {
+  return in_var && in_var->generated_op_;
+}
+
 void OpHandleBase::RunAndRecordEvent(const std::function<void()> &callback) {
 #ifdef PADDLE_WITH_CUDA
   if (!events_.empty()) {  // Use event
diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h
index 00f213f3ed294adcce7c540e3ff346de8e2be7fb..dc92b0fe9f760d95d4869fdd56c0400b6710437f 100644
--- a/paddle/fluid/framework/details/op_handle_base.h
+++ b/paddle/fluid/framework/details/op_handle_base.h
@@ -38,12 +38,24 @@ class OpHandleBase {
 
   void Run(bool use_event);
 
-  virtual void Wait(platform::DeviceContext *waited_dev);
+  virtual void RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx);
 
   void AddInput(VarHandleBase *in);
 
   void AddOutput(VarHandleBase *out);
 
+  // This method adds the wait events of all the input on all the device
+  // context.
+  // NODE: This Wait is asynchronous operation.
+  virtual void WaitInputVarGenerated();
+
+  // This method adds the wait events of all the input on the specified device
+  // context.
+  // NODE: This Wait is asynchronous operation.
+  virtual void WaitInputVarGenerated(const platform::Place &place);
+
+  virtual bool NeedWait(VarHandleBase *in_var);
+
   // If the Op involves data transfer of multiple devices that
   // will likely block other computations.
   virtual bool IsMultiDeviceTransfer() { return false; }
@@ -58,8 +70,18 @@ class OpHandleBase {
 
   const std::vector<VarHandleBase *> &Inputs() const { return inputs_; }
 
+  size_t NoDupInputSize() const {
+    std::unordered_set<VarHandleBase *> res;
+    for (auto *var : inputs_) {
+      res.emplace(var);
+    }
+    return res.size();
+  }
+
   const std::vector<VarHandleBase *> &Outputs() const { return outputs_; }
 
+  size_t NoDummyInputSize() const;
+
  protected:
   void RunAndRecordEvent(const std::function<void()> &callback);
 
diff --git a/paddle/fluid/framework/details/op_registry.h b/paddle/fluid/framework/details/op_registry.h
index 06603db31e0092382c0cc05482a038473d647ef1..eea7e712f8f6e187cdceedce77cc76d1d4ca2101 100644
--- a/paddle/fluid/framework/details/op_registry.h
+++ b/paddle/fluid/framework/details/op_registry.h
@@ -95,8 +95,8 @@ struct OpInfoFiller<T, kOpProtoAndCheckerMaker> {
   void operator()(const char* op_type, OpInfo* info) const {
     info->proto_ = new proto::OpProto;
     info->checker_ = new OpAttrChecker();
-    auto maker = T(info->proto_, info->checker_);
-    maker.Validate();
+    T maker;
+    maker(info->proto_, info->checker_);
     info->proto_->set_type(op_type);
     PADDLE_ENFORCE(
         info->proto_->IsInitialized(),
diff --git a/paddle/fluid/framework/details/reduce_op_handle.cc b/paddle/fluid/framework/details/reduce_op_handle.cc
index 1bb04c1dfca107f4b7ce4c599e9aa132de3e5985..7160e346dad0615e2fd32b70c096880af0359e1a 100644
--- a/paddle/fluid/framework/details/reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/reduce_op_handle.cc
@@ -51,7 +51,7 @@ void ReduceOpHandle::RunImpl() {
   PADDLE_ENFORCE_NOT_NULL(pre_in_var);
 
   // Wait input done, this Wait is asynchronous operation
-  WaitInputVarGenerated(in_var_handles);
+  WaitInputVarGenerated();
 
   // NOTE: The Places of all input tensor must be all on CPU or all on GPU.
   std::vector<platform::Place> in_places;  // used to get dev_ctx
@@ -80,19 +80,21 @@ void ReduceOpHandle::RunImpl() {
   }
 
   if (pre_in_var->IsType<framework::SelectedRows>()) {
-    std::vector<const SelectedRows *> in_selected_rows =
-        GetInputValues<SelectedRows>(in_var_handles, var_scopes);
-
-    GatherSelectedRows(in_selected_rows, in_places, dev_ctxes_, t_out_p,
-                       out_var->GetMutable<framework::SelectedRows>());
+    this->RunAndRecordEvent([&] {
+      std::vector<const SelectedRows *> in_selected_rows =
+          GetInputValues<SelectedRows>(in_var_handles, var_scopes);
+      GatherSelectedRows(in_selected_rows, in_places, dev_ctxes_, t_out_p,
+                         out_var->GetMutable<framework::SelectedRows>());
+    });
   } else {
     std::vector<const LoDTensor *> lod_tensors =
         GetInputValues<LoDTensor>(in_var_handles, var_scopes);
-
     if (paddle::platform::is_cpu_place(lod_tensors[0]->place())) {
-      ReduceLoDTensor func(lod_tensors,
-                           out_var->GetMutable<framework::LoDTensor>());
-      VisitDataType(ToDataType(lod_tensors[0]->type()), func);
+      this->RunAndRecordEvent([&] {
+        ReduceLoDTensor func(lod_tensors,
+                             out_var->GetMutable<framework::LoDTensor>());
+        VisitDataType(ToDataType(lod_tensors[0]->type()), func);
+      });
     } else if (paddle::platform::is_gpu_place(lod_tensors[0]->place())) {
 #ifdef PADDLE_WITH_CUDA
       auto pre_in = pre_in_var->Get<framework::LoDTensor>();
@@ -157,17 +159,6 @@ std::vector<const T *> ReduceOpHandle::GetInputValues(
   return in_selected_rows;
 }
 
-void ReduceOpHandle::WaitInputVarGenerated(
-    const std::vector<VarHandle *> &in_var_handles) {
-  for (auto *in : in_var_handles) {
-    if (in->generated_op_) {
-      for (auto pair : dev_ctxes_) {
-        in->generated_op_->Wait(pair.second);
-      }
-    }
-  }
-}
-
 std::string ReduceOpHandle::Name() const { return "reduce"; }
 }  // namespace details
 }  // namespace framework
diff --git a/paddle/fluid/framework/details/reduce_op_handle.h b/paddle/fluid/framework/details/reduce_op_handle.h
index 59731d348d17755fbd8bf3b6fa29b32bdefaf71e..4d14334cdfe06e2e805c2577458d6689e6324cc7 100644
--- a/paddle/fluid/framework/details/reduce_op_handle.h
+++ b/paddle/fluid/framework/details/reduce_op_handle.h
@@ -32,8 +32,8 @@ namespace framework {
 namespace details {
 
 struct ReduceOpHandle : public OpHandleBase {
-  const std::vector<Scope *> &local_scopes_;
-  const std::vector<platform::Place> &places_;
+  std::vector<Scope *> local_scopes_;
+  std::vector<platform::Place> places_;
 
 #ifdef PADDLE_WITH_CUDA
   const platform::NCCLContextMap *nccl_ctxs_;
@@ -60,8 +60,6 @@ struct ReduceOpHandle : public OpHandleBase {
  protected:
   void RunImpl() override;
 
-  void WaitInputVarGenerated(const std::vector<VarHandle *> &in_var_handles);
-
   template <typename T>
   std::vector<const T *> GetInputValues(
       const std::vector<VarHandle *> &in_var_handles,
diff --git a/paddle/fluid/framework/details/send_op_handle.cc b/paddle/fluid/framework/details/rpc_op_handle.cc
similarity index 67%
rename from paddle/fluid/framework/details/send_op_handle.cc
rename to paddle/fluid/framework/details/rpc_op_handle.cc
index 0763f92171e7813ec0ee8ca4f3aa42b76205130a..586465f99fd94117c821be2952bffda385fbcf75 100644
--- a/paddle/fluid/framework/details/send_op_handle.cc
+++ b/paddle/fluid/framework/details/rpc_op_handle.cc
@@ -12,27 +12,32 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/framework/details/send_op_handle.h"
+#include "paddle/fluid/framework/details/rpc_op_handle.h"
 
 namespace paddle {
 namespace framework {
 namespace details {
 
-SendOpHandle::SendOpHandle(const framework::OpDesc &op_desc,
-                           const Scope *local_scope,
-                           const platform::Place &place)
+RPCOpHandle::RPCOpHandle(const framework::OpDesc &op_desc,
+                         const Scope *local_scope, const std::string &name,
+                         const platform::Place &place)
     : op_(framework::OpRegistry::CreateOp(op_desc)),
       local_scope_(local_scope),
+      name_(name),
       place_(place) {}
 
-void SendOpHandle::RunImpl() {
+void RPCOpHandle::RunImpl() {
+  // TODO(wuyi): need further analysis whether wait VarDummyHandle.
   // Wait input done
   for (auto *in : inputs_) {
     auto &p = static_cast<VarHandle *>(in)->place_;
+    // FIXME(Yancey1989): need a better solution instead of use DebugString()
     if (in->DebugString() == "dummy") {  // HACK
       continue;
     }
-    in->generated_op_->Wait(dev_ctxes_[p]);
+    if (in->generated_op_) {
+      in->generated_op_->RecordWaitEventOnCtx(dev_ctxes_[p]);
+    }
   }
   auto &tmp_scope = local_scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();
   // FIXME(wuyi): can not use RunAndRecordEvent here, for it will cause dead
@@ -40,7 +45,7 @@ void SendOpHandle::RunImpl() {
   op_->Run(*tmp_scope, place_);
 }
 
-std::string SendOpHandle::Name() const { return "send"; }
+std::string RPCOpHandle::Name() const { return name_; }
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/details/send_op_handle.h b/paddle/fluid/framework/details/rpc_op_handle.h
similarity index 85%
rename from paddle/fluid/framework/details/send_op_handle.h
rename to paddle/fluid/framework/details/rpc_op_handle.h
index 2f78811fad50642b5e45776c41910df6f4cc48f6..ae38c7fe19e102a330455d89a1068414a7835fab 100644
--- a/paddle/fluid/framework/details/send_op_handle.h
+++ b/paddle/fluid/framework/details/rpc_op_handle.h
@@ -27,9 +27,9 @@ namespace paddle {
 namespace framework {
 namespace details {
 
-struct SendOpHandle : public OpHandleBase {
-  SendOpHandle(const framework::OpDesc& op_desc, const Scope* local_scope,
-               const platform::Place& place);
+struct RPCOpHandle : public OpHandleBase {
+  RPCOpHandle(const framework::OpDesc& op_desc, const Scope* local_scope,
+              const std::string& name, const platform::Place& place);
 
   std::string Name() const override;
 
@@ -43,7 +43,8 @@ struct SendOpHandle : public OpHandleBase {
  private:
   std::unique_ptr<OperatorBase> op_;
   const Scope* local_scope_;
-  const platform::Place& place_;
+  const std::string name_;
+  platform::Place place_;
 };
 
 }  // namespace details
diff --git a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
index 1cd3113030086104e7fc5c4ba3364a5ff027632b..d9c387e79dc71288e7330597fed57171d447f31b 100644
--- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
+++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
@@ -29,6 +29,7 @@ ScaleLossGradOpHandle::ScaleLossGradOpHandle(size_t num_dev, Scope *scope,
 ScaleLossGradOpHandle::~ScaleLossGradOpHandle() {}
 
 void ScaleLossGradOpHandle::RunImpl() {
+  // Doesn't wait any event
   std::string var_name = static_cast<VarHandle *>(this->outputs_[0])->name_;
   auto &local_scope = *scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();
 
diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
new file mode 100644
index 0000000000000000000000000000000000000000..eb4e7ec52f907f9403e21ec2734d61824f51a58b
--- /dev/null
+++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
@@ -0,0 +1,76 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h"
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/executor.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+ScopeBufferedSSAGraphExecutor::ScopeBufferedSSAGraphExecutor(
+    ExecutionStrategy strategy, std::vector<Scope *> local_scopes,
+    std::vector<VariableInfo> var_infos, std::vector<platform::Place> places,
+    std::unique_ptr<SSAGraphExecutor> &&underlying_executor)
+    : strategy_(std::move(strategy)),
+      underlying_executor_(std::move(underlying_executor)),
+      local_scopes_(std::move(local_scopes)),
+      var_infos_(std::move(var_infos)),
+      places_(std::move(places)) {}
+
+FeedFetchList ScopeBufferedSSAGraphExecutor::Run(
+    const std::vector<std::string> &fetch_tensors) {
+  if (drop_scope_counter_ == 0) {
+    // Create local scopes.
+    for (auto it = local_scopes_.rbegin(); it != local_scopes_.rend(); ++it) {
+      auto &scope = *it;
+      Scope &local_scope = scope->NewScope();
+      *scope->Var(details::kLocalExecScopeName)->GetMutable<Scope *>() =
+          &local_scope;
+
+      for (auto &info : var_infos_) {
+        if (scope->FindVar(info.name_) != nullptr) {
+          continue;
+        }
+
+        if (info.persistable_) {  // Persistable
+          InitializeVariable(scope->Var(info.name_), info.type_);
+        } else {
+          InitializeVariable(local_scope.Var(info.name_), info.type_);
+        }
+      }
+    }
+  }
+
+  auto fetch_data = underlying_executor_->Run(fetch_tensors);
+  drop_scope_counter_ += 1;
+  if (!fetch_tensors.empty() ||
+      drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) {
+    drop_scope_counter_ = 0;
+    // Wait All computational streams
+    for (auto p : places_) {
+      platform::DeviceContextPool::Instance().Get(p)->Wait();
+    }
+    for (auto &scope : local_scopes_) {
+      auto &local_scope =
+          *scope->Var(details::kLocalExecScopeName)->GetMutable<Scope *>();
+      scope->DeleteScope(local_scope);
+    }
+  }
+  return fetch_data;
+}
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h
new file mode 100644
index 0000000000000000000000000000000000000000..20df7a4722d589ffd168f842e927cff8411096bb
--- /dev/null
+++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h
@@ -0,0 +1,53 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/details/execution_strategy.h"
+#include "paddle/fluid/framework/details/ssa_graph_executor.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/platform/place.h"
+namespace paddle {
+namespace framework {
+namespace details {
+
+struct VariableInfo {
+  std::string name_;
+  proto::VarType::Type type_;
+  bool persistable_;
+};
+
+class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor {
+ public:
+  ScopeBufferedSSAGraphExecutor(
+      ExecutionStrategy strategy, std::vector<Scope*> local_scopes,
+      std::vector<VariableInfo> var_infos, std::vector<platform::Place> places,
+      std::unique_ptr<SSAGraphExecutor>&& underlying_executor);
+  FeedFetchList Run(const std::vector<std::string>& fetch_tensors) override;
+
+ private:
+  size_t drop_scope_counter_{0};
+
+  ExecutionStrategy strategy_;
+  std::unique_ptr<SSAGraphExecutor> underlying_executor_;
+  std::vector<Scope*> local_scopes_;
+  std::vector<VariableInfo> var_infos_;
+  std::vector<platform::Place> places_;
+};
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/ssa_graph_builder.cc b/paddle/fluid/framework/details/ssa_graph_builder.cc
index 6a567527550883add08031e50aa8de2b204cf13d..211113c7979ee95d896c0a57879f7b3ad13b36ef 100644
--- a/paddle/fluid/framework/details/ssa_graph_builder.cc
+++ b/paddle/fluid/framework/details/ssa_graph_builder.cc
@@ -73,64 +73,6 @@ void SSAGraphBuilder::CreateOpOutput(SSAGraph *graph, OpHandleBase *op_handle,
   op_handle->AddOutput(var);
 }
 
-template <typename Callback>
-void IterAllVar(const SSAGraph &graph, Callback callback) {
-  for (auto &each : graph.vars_) {
-    for (auto &pair1 : each) {
-      for (auto &pair2 : pair1.second) {
-        callback(*pair2);
-      }
-    }
-  }
-
-  for (auto &var : graph.dep_vars_) {
-    callback(*var);
-  }
-}
-
-void SSAGraphBuilder::PrintGraphviz(const SSAGraph &graph, std::ostream &sout) {
-  size_t var_id = 0;
-  std::unordered_map<const VarHandleBase *, size_t> vars;
-
-  sout << "digraph G {\n";
-
-  IterAllVar(graph, [&](const VarHandleBase &var) {
-    auto *var_ptr = &var;
-    auto *var_handle_ptr = dynamic_cast<const VarHandle *>(var_ptr);
-    auto *dummy_ptr = dynamic_cast<const DummyVarHandle *>(var_ptr);
-
-    size_t cur_var_id = var_id++;
-    vars[var_ptr] = cur_var_id;
-
-    if (var_handle_ptr) {
-      sout << "var_" << cur_var_id << " [label=\"" << var_handle_ptr->name_
-           << "\\n"
-           << var_handle_ptr->place_ << "\\n"
-           << var_handle_ptr->version_ << "\"]" << std::endl;
-    } else if (dummy_ptr) {
-      sout << "var_" << cur_var_id << " [label=\"dummy\"]" << std::endl;
-    }
-  });
-
-  size_t op_id = 0;
-  for (auto &op : graph.ops_) {
-    std::string op_name = "op_" + std::to_string(op_id++);
-    sout << op_name << " [label=\"" << op->Name() << "\", shape=rect]"
-         << std::endl;
-    for (auto in : op->Inputs()) {
-      std::string var_name = "var_" + std::to_string(vars[in]);
-      sout << var_name << " -> " << op_name << std::endl;
-    }
-
-    for (auto out : op->Outputs()) {
-      std::string var_name = "var_" + std::to_string(vars[out]);
-      sout << op_name << " -> " << var_name << std::endl;
-    }
-  }
-
-  sout << "}\n";
-}
-
 void SSAGraphBuilder::AddOutputToLeafOps(SSAGraph *graph) {
   for (auto &op : graph->ops_) {
     if (!op->Outputs().empty()) {
diff --git a/paddle/fluid/framework/details/ssa_graph_builder.h b/paddle/fluid/framework/details/ssa_graph_builder.h
index 64e5d93081eb76c56898bbeb530e37364619fdbb..5fc12a44b51fae26e5a8f5fdba952d3879e82d0f 100644
--- a/paddle/fluid/framework/details/ssa_graph_builder.h
+++ b/paddle/fluid/framework/details/ssa_graph_builder.h
@@ -55,8 +55,6 @@ class SSAGraphBuilder {
                              const platform::Place &place, size_t place_offset);
 
   static void AddOutputToLeafOps(SSAGraph *graph);
-
-  static void PrintGraphviz(const SSAGraph &graph, std::ostream &sout);
 };
 }  // namespace details
 }  // namespace framework
diff --git a/paddle/fluid/framework/details/ssa_graph_executor.cc b/paddle/fluid/framework/details/ssa_graph_executor.cc
index 8da6ca889b89999e0f6f974503cea476c9de97f3..09b97bd0d98dc4ad1124dcbc495cff921bf03efc 100644
--- a/paddle/fluid/framework/details/ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/ssa_graph_executor.cc
@@ -17,10 +17,6 @@
 namespace paddle {
 namespace framework {
 namespace details {
-
-SSAGraphExecutor::SSAGraphExecutor(std::unique_ptr<SSAGraph> &&graph)
-    : graph_(std::move(graph)) {}
-
 SSAGraphExecutor::~SSAGraphExecutor() {}
 
 }  // namespace details
diff --git a/paddle/fluid/framework/details/ssa_graph_executor.h b/paddle/fluid/framework/details/ssa_graph_executor.h
index a8833b7388ab907020a260d356f1484ffd227658..958086033607a4ed8fb840f5b14fe5779625bd82 100644
--- a/paddle/fluid/framework/details/ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/ssa_graph_executor.h
@@ -28,15 +28,11 @@ class SSAGraphExecutor {
   DISABLE_COPY_AND_ASSIGN(SSAGraphExecutor);
 
  public:
-  // Steal graph inside
-  explicit SSAGraphExecutor(std::unique_ptr<SSAGraph> &&graph);
+  SSAGraphExecutor() {}
 
   virtual ~SSAGraphExecutor();
 
   virtual FeedFetchList Run(const std::vector<std::string> &fetch_tensors) = 0;
-
- protected:
-  std::unique_ptr<SSAGraph> graph_;
 };
 }  // namespace details
 }  // namespace framework
diff --git a/paddle/fluid/framework/details/ssa_graph_printer.cc b/paddle/fluid/framework/details/ssa_graph_printer.cc
new file mode 100644
index 0000000000000000000000000000000000000000..22a40ca4b25cdd8ed9856b6c71bffc79561edcac
--- /dev/null
+++ b/paddle/fluid/framework/details/ssa_graph_printer.cc
@@ -0,0 +1,83 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/ssa_graph_printer.h"
+#include <string>
+#include "paddle/fluid/framework/details/ssa_graph.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+template <typename Callback>
+static inline void IterAllVar(const SSAGraph &graph, Callback callback) {
+  for (auto &each : graph.vars_) {
+    for (auto &pair1 : each) {
+      for (auto &pair2 : pair1.second) {
+        callback(*pair2);
+      }
+    }
+  }
+
+  for (auto &var : graph.dep_vars_) {
+    callback(*var);
+  }
+}
+
+void GraphvizSSAGraphPrinter::Print(const SSAGraph &graph,
+                                    std::ostream &sout) const {
+  size_t var_id = 0;
+  std::unordered_map<const VarHandleBase *, size_t> vars;
+
+  sout << "digraph G {\n";
+
+  IterAllVar(graph, [&](const VarHandleBase &var) {
+    auto *var_ptr = &var;
+    auto *var_handle_ptr = dynamic_cast<const VarHandle *>(var_ptr);
+    auto *dummy_ptr = dynamic_cast<const DummyVarHandle *>(var_ptr);
+
+    size_t cur_var_id = var_id++;
+    vars[var_ptr] = cur_var_id;
+
+    if (var_handle_ptr) {
+      sout << "var_" << cur_var_id << " [label=\"" << var_handle_ptr->name_
+           << "\\n"
+           << var_handle_ptr->place_ << "\\n"
+           << var_handle_ptr->version_ << "\"]" << std::endl;
+    } else if (dummy_ptr) {
+      sout << "var_" << cur_var_id << " [label=\"dummy\"]" << std::endl;
+    }
+  });
+
+  size_t op_id = 0;
+  for (auto &op : graph.ops_) {
+    std::string op_name = "op_" + std::to_string(op_id++);
+    sout << op_name << " [label=\"" << op->Name() << "\", shape=rect]"
+         << std::endl;
+    for (auto in : op->Inputs()) {
+      std::string var_name = "var_" + std::to_string(vars[in]);
+      sout << var_name << " -> " << op_name << std::endl;
+    }
+
+    for (auto out : op->Outputs()) {
+      std::string var_name = "var_" + std::to_string(vars[out]);
+      sout << op_name << " -> " << var_name << std::endl;
+    }
+  }
+
+  sout << "}\n";
+}
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/ssa_graph_printer.h b/paddle/fluid/framework/details/ssa_graph_printer.h
new file mode 100644
index 0000000000000000000000000000000000000000..b4c90013789759d17646d95efdc81fc6a0a4f3e7
--- /dev/null
+++ b/paddle/fluid/framework/details/ssa_graph_printer.h
@@ -0,0 +1,67 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <iosfwd>
+#include "paddle/fluid/framework/details/ssa_graph_builder.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+struct SSAGraph;
+class SSAGraphPrinter {
+ public:
+  virtual ~SSAGraphPrinter() {}
+  virtual void Print(const SSAGraph& graph, std::ostream& sout) const = 0;
+};
+
+class GraphvizSSAGraphPrinter : public SSAGraphPrinter {
+ public:
+  void Print(const SSAGraph& graph, std::ostream& sout) const override;
+};
+
+class SSAGraghBuilderWithPrinter : public SSAGraphBuilder {
+ public:
+  SSAGraghBuilderWithPrinter(std::ostream& sout,
+                             std::unique_ptr<SSAGraphPrinter>&& printer,
+                             std::unique_ptr<SSAGraphBuilder>&& builder)
+      : printer_(std::move(printer)),
+        builder_(std::move(builder)),
+        stream_ref_(sout) {}
+
+  SSAGraghBuilderWithPrinter(std::unique_ptr<std::ostream>&& sout,
+                             std::unique_ptr<SSAGraphPrinter>&& printer,
+                             std::unique_ptr<SSAGraphBuilder>&& builder)
+      : printer_(std::move(printer)),
+        builder_(std::move(builder)),
+        stream_ptr_(std::move(sout)),
+        stream_ref_(*stream_ptr_) {}
+
+  std::unique_ptr<SSAGraph> Build(const ProgramDesc& program) const override {
+    auto graph = builder_->Build(program);
+    printer_->Print(*graph, stream_ref_);
+    return graph;
+  }
+
+ private:
+  std::unique_ptr<SSAGraphPrinter> printer_;
+  std::unique_ptr<SSAGraphBuilder> builder_;
+  std::unique_ptr<std::ostream> stream_ptr_;
+  std::ostream& stream_ref_;
+};
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
index 5e6ed5cb7cdc534332d402380458f39aecd841b8..496fadd04dac982b87b9d9e14f599ed37d9709d0 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
@@ -14,24 +14,21 @@
 
 #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
 
-#include "paddle/fluid/framework/details/fetch_op_handle.h"
-
 namespace paddle {
 namespace framework {
 namespace details {
 ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor(
-    size_t num_threads, bool use_event,
-    const std::vector<Scope *> &local_scopes,
+    const ExecutionStrategy &strategy, const std::vector<Scope *> &local_scopes,
     const std::vector<platform::Place> &places,
-    std::unique_ptr<SSAGraph> &&graph, bool allow_op_delay)
-    : SSAGraphExecutor(std::move(graph)),
-      pool_(num_threads >= 2 ? new ::ThreadPool(num_threads) : nullptr),
+    std::unique_ptr<SSAGraph> &&graph)
+    : graph_(std::move(graph)),
+      pool_(strategy.num_threads_ >= 2 ? new ::ThreadPool(strategy.num_threads_)
+                                       : nullptr),
       local_scopes_(local_scopes),
       places_(places),
       fetch_ctxs_(places),
-      use_event_(use_event),
       running_ops_(0),
-      allow_op_delay_(allow_op_delay) {}
+      strategy_(strategy) {}
 
 FeedFetchList ThreadedSSAGraphExecutor::Run(
     const std::vector<std::string> &fetch_tensors) {
@@ -45,73 +42,33 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
   // Should revisit it if overlapping is available.
   std::unordered_set<OpHandleBase *> delayed_ops;
 
-  auto InsertPendingVar = [&pending_vars, &ready_vars](VarHandleBase &var) {
-    pending_vars.insert(&var);
-    if (var.generated_op_ == nullptr) {
-      ready_vars.Push(&var);
-    }
-  };
-
-  auto InsertPendingOp = [&pending_ops](OpHandleBase &op_instance) {
-    pending_ops.insert({&op_instance, op_instance.Inputs().size()});
-  };
-
   // Transform SSAGraph to pending_ops & pending_vars
   for (auto &var_map : graph_->vars_) {
     for (auto &name_pair : var_map) {
       for (auto &version_pair : name_pair.second) {
-        InsertPendingVar(*version_pair);
+        InsertPendingVar(&pending_vars, &ready_vars, version_pair.get());
       }
     }
   }
   for (auto &var : graph_->dep_vars_) {
-    InsertPendingVar(*var);
+    InsertPendingVar(&pending_vars, &ready_vars, var.get());
   }
 
   for (auto &op : graph_->ops_) {
     if (op->Inputs().empty()) {  // Special case, Op has no input.
       ready_ops.insert(op.get());
     } else {
-      InsertPendingOp(*op);
+      InsertPendingOp(&pending_ops, op.get());
     }
   }
 
   // Step 2. Insert FetchOps
   std::vector<std::unique_ptr<FetchOpHandle>> fetch_ops;
-  FeedFetchList fetch_data(fetch_tensors.size());
-
-  std::unordered_map<std::string, std::vector<VarHandleBase *>> fetched_vars;
-
-  for (auto &fetch_var_name : fetch_tensors) {
-    for (auto &var_map : graph_->vars_) {
-      auto it = var_map.find(fetch_var_name);
-      if (it != var_map.end()) {
-        fetched_vars[fetch_var_name].push_back(it->second.rbegin()->get());
-      }
-    }
-  }
-
   std::unordered_set<std::unique_ptr<VarHandleBase>> fetch_dependencies;
-  for (size_t i = 0; i < fetch_tensors.size(); ++i) {
-    auto &var_name = fetch_tensors[i];
-    auto &vars = fetched_vars.at(var_name);
-    auto *op = new FetchOpHandle(&fetch_data, i, &local_scopes_);
-    fetch_ops.emplace_back(op);
-
-    for (auto &p : places_) {
-      op->SetDeviceContext(p, fetch_ctxs_.Get(p));
-    }
-
-    for (auto *var : vars) {
-      op->AddInput(var);
-    }
+  FeedFetchList fetch_data(fetch_tensors.size());
 
-    auto *fetch_dummy = new DummyVarHandle();
-    op->AddOutput(fetch_dummy);
-    fetch_dependencies.emplace(fetch_dummy);
-    InsertPendingVar(*fetch_dummy);
-    InsertPendingOp(*op);
-  }
+  InsertFetchOps(fetch_tensors, &fetch_ops, &fetch_dependencies, &pending_ops,
+                 &pending_vars, &ready_vars, &fetch_data);
 
   auto run_all_ops = [&](std::unordered_set<OpHandleBase *> &set) {
     for (auto *op : set) {
@@ -128,7 +85,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
     //
     // NOTE: DelayedOps have a lower priority. It will be scheduled after all
     // ready_ops have been performed.
-    if (ready_ops.empty() && allow_op_delay_ && running_ops_ == 0) {
+    if (ready_ops.empty() && strategy_.allow_op_delay_ && running_ops_ == 0) {
       run_all_ops(delayed_ops);
     } else {
       run_all_ops(ready_ops);
@@ -155,7 +112,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
         auto &deps = pending_ops[op];
         --deps;
         if (deps == 0) {
-          if (op->IsMultiDeviceTransfer() && allow_op_delay_) {
+          if (op->IsMultiDeviceTransfer() && strategy_.allow_op_delay_) {
             delayed_ops.insert(op);
           } else {
             ready_ops.insert(op);
@@ -174,12 +131,68 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
   return fetch_data;
 }
 
+void ThreadedSSAGraphExecutor::InsertFetchOps(
+    const std::vector<std::string> &fetch_tensors,
+    std::vector<std::unique_ptr<FetchOpHandle>> *fetch_ops,
+    std::unordered_set<std::unique_ptr<VarHandleBase>> *fetch_dependencies,
+    std::unordered_map<OpHandleBase *, size_t> *pending_ops,
+    std::unordered_set<VarHandleBase *> *pending_vars,
+    BlockingQueue<VarHandleBase *> *ready_vars, FeedFetchList *fetch_data) {
+  std::unordered_map<std::string, std::vector<VarHandleBase *>> fetched_vars;
+
+  for (auto &fetch_var_name : fetch_tensors) {
+    for (auto &var_map : graph_->vars_) {
+      auto it = var_map.find(fetch_var_name);
+      if (it != var_map.end()) {
+        fetched_vars[fetch_var_name].push_back(it->second.rbegin()->get());
+      }
+    }
+  }
+
+  for (size_t i = 0; i < fetch_tensors.size(); ++i) {
+    auto &var_name = fetch_tensors[i];
+    auto &vars = fetched_vars.at(var_name);
+    auto *op = new FetchOpHandle(fetch_data, i, &local_scopes_);
+    fetch_ops->emplace_back(op);
+
+    for (auto &p : places_) {
+      op->SetDeviceContext(p, fetch_ctxs_.Get(p));
+    }
+
+    for (auto *var : vars) {
+      op->AddInput(var);
+    }
+
+    auto *fetch_dummy = new DummyVarHandle();
+    op->AddOutput(fetch_dummy);
+    fetch_dependencies->emplace(fetch_dummy);
+    this->InsertPendingVar(pending_vars, ready_vars, fetch_dummy);
+    this->InsertPendingOp(pending_ops, op);
+  }
+}
+
+void ThreadedSSAGraphExecutor::InsertPendingOp(
+    std::unordered_map<OpHandleBase *, size_t> *pending_ops,
+    OpHandleBase *op_instance) const {
+  pending_ops->insert({op_instance, op_instance->NoDupInputSize()});
+}
+
+void ThreadedSSAGraphExecutor::InsertPendingVar(
+    std::unordered_set<VarHandleBase *> *pending_vars,
+    BlockingQueue<VarHandleBase *> *ready_vars, VarHandleBase *var) const {
+  pending_vars->insert(var);
+  if (var->generated_op_ == nullptr) {
+    ready_vars->Push(var);
+  }
+}
 void ThreadedSSAGraphExecutor::RunOp(
     BlockingQueue<VarHandleBase *> *ready_var_q, details::OpHandleBase *op) {
   auto op_run = [ready_var_q, op, this] {
     try {
-      VLOG(10) << op << " " << op->Name() << " : " << op->DebugString();
-      op->Run(use_event_);
+      if (VLOG_IS_ON(10)) {
+        VLOG(10) << op << " " << op->Name() << " : " << op->DebugString();
+      }
+      op->Run(strategy_.use_event_);
       VLOG(10) << op << " " << op->Name() << " Done ";
       running_ops_--;
       ready_var_q->Extend(op->Outputs());
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
index d089b79d91327e38408439a8019ec5189ff6d189..4a2075f1cccb3211316567197da56c01d26f35ce 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
@@ -23,6 +23,8 @@
 #include <functional>
 #include "ThreadPool.h"  // ThreadPool in thrird party
 #include "paddle/fluid/framework/blocking_queue.h"
+#include "paddle/fluid/framework/details/execution_strategy.h"
+#include "paddle/fluid/framework/details/fetch_op_handle.h"
 #include "paddle/fluid/framework/details/ssa_graph_executor.h"
 
 namespace paddle {
@@ -33,11 +35,10 @@ namespace details {
 
 class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
  public:
-  ThreadedSSAGraphExecutor(size_t num_threads, bool use_event,
+  ThreadedSSAGraphExecutor(const ExecutionStrategy &strategy,
                            const std::vector<Scope *> &local_scopes,
                            const std::vector<platform::Place> &places,
-                           std::unique_ptr<SSAGraph> &&graph,
-                           bool allow_op_delay);
+                           std::unique_ptr<SSAGraph> &&graph);
 
   // Run a SSAGraph by a thread pool
   // Use topological sort algorithm
@@ -50,14 +51,31 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
              details::OpHandleBase *op);
 
  private:
+  std::unique_ptr<SSAGraph> graph_;
   std::unique_ptr<::ThreadPool> pool_;
   std::vector<Scope *> local_scopes_;
   std::vector<platform::Place> places_;
   platform::DeviceContextPool fetch_ctxs_;
-  const bool use_event_;
   std::unique_ptr<platform::EnforceNotMet> exception_;
   std::atomic<int> running_ops_;
-  bool allow_op_delay_;
+
+  void InsertPendingOp(std::unordered_map<OpHandleBase *, size_t> *pending_ops,
+                       OpHandleBase *op_instance) const;
+
+  void InsertPendingVar(std::unordered_set<VarHandleBase *> *pending_vars,
+                        BlockingQueue<VarHandleBase *> *ready_vars,
+                        VarHandleBase *var) const;
+
+  void InsertFetchOps(
+      const std::vector<std::string> &fetch_tensors,
+      std::vector<std::unique_ptr<FetchOpHandle>> *fetch_ops,
+      std::unordered_set<std::unique_ptr<VarHandleBase>> *fetch_dependencies,
+      std::unordered_map<OpHandleBase *, size_t> *pending_ops,
+      std::unordered_set<VarHandleBase *> *pending_vars,
+      BlockingQueue<VarHandleBase *> *ready_vars, FeedFetchList *fetch_data);
+
+ private:
+  ExecutionStrategy strategy_;
 };
 
 }  // namespace details
diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc
index ce91d7a82674364560b8065277b28b51ae1b303a..3d68c5fb870d5b575f97eeb286528544402b8ed9 100644
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -24,9 +24,6 @@ limitations under the License. */
 #include "paddle/fluid/platform/profiler.h"
 
 DECLARE_bool(benchmark);
-DEFINE_bool(check_nan_inf, false,
-            "Checking whether operator produce NAN/INF or not. It will be "
-            "extremely slow so please use this flag wisely.");
 
 namespace paddle {
 namespace framework {
@@ -78,21 +75,6 @@ void InitializeVariable(Variable* var, proto::VarType::Type var_type) {
   }
 }
 
-static void CheckTensorNANOrInf(const std::string& name,
-                                const framework::Tensor& tensor) {
-  if (tensor.memory_size() == 0) {
-    return;
-  }
-  if (tensor.type().hash_code() != typeid(float).hash_code() &&   // NOLINT
-      tensor.type().hash_code() != typeid(double).hash_code()) {  // NOLINT
-    return;
-  }
-  PADDLE_ENFORCE(!framework::TensorContainsInf(tensor),
-                 "Tensor %s contains Inf", name);
-  PADDLE_ENFORCE(!framework::TensorContainsNAN(tensor),
-                 "Tensor %s contains NAN", name);
-}
-
 void Executor::CreateVariables(const ProgramDesc& pdesc, Scope* scope,
                                int block_id) {
   auto& global_block = pdesc.Block(block_id);
@@ -228,7 +210,8 @@ static bool has_fetch_operators(
 void Executor::Run(const ProgramDesc& program, Scope* scope,
                    std::map<std::string, const LoDTensor*>* feed_targets,
                    std::map<std::string, LoDTensor*>* fetch_targets,
-                   bool create_vars, const std::string& feed_holder_name,
+                   bool create_local_scope, bool create_vars,
+                   const std::string& feed_holder_name,
                    const std::string& fetch_holder_name) {
   platform::RecordBlock b(kProgramId);
   bool has_feed_ops =
@@ -237,8 +220,10 @@ void Executor::Run(const ProgramDesc& program, Scope* scope,
       has_fetch_operators(program.Block(0), *fetch_targets, fetch_holder_name);
 
   ProgramDesc* copy_program = const_cast<ProgramDesc*>(&program);
+  std::unique_ptr<ProgramDesc> unique_ptr_of_copy_program;
   if (!has_feed_ops || !has_fetch_ops) {
-    copy_program = std::unique_ptr<ProgramDesc>(new ProgramDesc(program)).get();
+    unique_ptr_of_copy_program.reset(new ProgramDesc(program));
+    copy_program = unique_ptr_of_copy_program.get();
   }
 
   auto* global_block = copy_program->MutableBlock(0);
@@ -290,8 +275,9 @@ void Executor::Run(const ProgramDesc& program, Scope* scope,
   }
 
   auto ctx = Prepare(*copy_program, 0);
-  RunPreparedContext(ctx.get(), scope, feed_targets, fetch_targets, create_vars,
-                     feed_holder_name, fetch_holder_name);
+  RunPreparedContext(ctx.get(), scope, feed_targets, fetch_targets,
+                     create_local_scope, create_vars, feed_holder_name,
+                     fetch_holder_name);
 }
 
 std::unique_ptr<ExecutorPrepareContext> Executor::Prepare(
@@ -338,15 +324,6 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
       VLOG(2) << "Memory used after operator " + op->Type() + " running: "
               << memory::memory_usage(place_);
     }
-    if (FLAGS_check_nan_inf) {
-      for (auto& vname : op->OutputVars(true)) {
-        auto* var = local_scope->FindVar(vname);
-        if (var == nullptr) continue;
-        if (var->IsType<framework::LoDTensor>()) {
-          CheckTensorNANOrInf(vname, var->Get<framework::LoDTensor>());
-        }
-      }
-    }
   }
   platform::DeviceContextPool::Instance().Get(place_)->Wait();
   if (create_vars && create_local_scope) {
@@ -366,8 +343,9 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
 void Executor::RunPreparedContext(
     ExecutorPrepareContext* ctx, Scope* scope,
     std::map<std::string, const LoDTensor*>* feed_targets,
-    std::map<std::string, LoDTensor*>* fetch_targets, bool create_vars,
-    const std::string& feed_holder_name, const std::string& fetch_holder_name) {
+    std::map<std::string, LoDTensor*>* fetch_targets, bool create_local_scope,
+    bool create_vars, const std::string& feed_holder_name,
+    const std::string& fetch_holder_name) {
   auto& global_block = ctx->prog_.Block(ctx->block_id_);
 
   PADDLE_ENFORCE(
@@ -387,7 +365,7 @@ void Executor::RunPreparedContext(
     }
   }
 
-  RunPreparedContext(ctx, scope, create_vars, create_vars);
+  RunPreparedContext(ctx, scope, create_local_scope, create_vars);
 
   // obtain the data of fetch_targets from fetch_holder
   for (auto* op : global_block.AllOps()) {
diff --git a/paddle/fluid/framework/executor.h b/paddle/fluid/framework/executor.h
index 4a3d637e2d79f8cbd83412eea2d73e4b497ef1e7..0c3c23611d95e0da67cabfb8fb2755a4a52c991b 100644
--- a/paddle/fluid/framework/executor.h
+++ b/paddle/fluid/framework/executor.h
@@ -57,7 +57,7 @@ class Executor {
   void Run(const ProgramDesc& program, Scope* scope,
            std::map<std::string, const LoDTensor*>* feed_targets,
            std::map<std::string, LoDTensor*>* fetch_targets,
-           bool create_vars = true,
+           bool create_local_scope = true, bool create_vars = true,
            const std::string& feed_holder_name = "feed",
            const std::string& fetch_holder_name = "fetch");
 
@@ -76,6 +76,7 @@ class Executor {
   void RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
                           std::map<std::string, const LoDTensor*>* feed_targets,
                           std::map<std::string, LoDTensor*>* fetch_targets,
+                          bool create_local_scope = true,
                           bool create_vars = true,
                           const std::string& feed_holder_name = "feed",
                           const std::string& fetch_holder_name = "fetch");
diff --git a/paddle/fluid/framework/framework.proto b/paddle/fluid/framework/framework.proto
index 96f53dc1bc8747e1b8ea84166614f98ff363ae5e..d35125fe8c3c8018c38650dc87b2b1474ded6058 100644
--- a/paddle/fluid/framework/framework.proto
+++ b/paddle/fluid/framework/framework.proto
@@ -101,6 +101,9 @@ message VarType {
     FP16 = 4;
     FP32 = 5;
     FP64 = 6;
+    // Tensor<size_t> is used in C++.
+    SIZE_T = 19;
+    UINT8 = 20;
 
     // Other types that may need additional descriptions
     LOD_TENSOR = 7;
diff --git a/paddle/fluid/framework/lod_tensor_test.cc b/paddle/fluid/framework/lod_tensor_test.cc
index 77e5ec4c7dd14b7ebb6d606b8c401ee714259d40..2ceffc93319359683e87e7fec2d18784c9bf02f3 100644
--- a/paddle/fluid/framework/lod_tensor_test.cc
+++ b/paddle/fluid/framework/lod_tensor_test.cc
@@ -228,11 +228,12 @@ TEST(LoD, CheckAbsLoD) {
   ASSERT_FALSE(CheckAbsLoD(abs_lod0));
 }
 
-TEST(LoDTensor, RecordIO) {
+template <typename T>
+static void TestRecordIO() {
   LoDTensor tensor;
-  int* tmp = tensor.mutable_data<int>(make_ddim({4, 5}), platform::CPUPlace());
+  T* tmp = tensor.mutable_data<T>(make_ddim({4, 5}), platform::CPUPlace());
   for (int i = 0; i < 20; ++i) {
-    tmp[i] = i;
+    tmp[i] = static_cast<T>(i);
   }
 
   std::stringstream* stream = new std::stringstream();
@@ -247,7 +248,7 @@ TEST(LoDTensor, RecordIO) {
 
   auto assert_tensor_ok = [](const LoDTensor& tensor) {
     for (int i = 0; i < 20; ++i) {
-      ASSERT_EQ(tensor.data<int>()[i], i);
+      ASSERT_EQ(tensor.data<T>()[i], static_cast<T>(i));
     }
   };
 
@@ -265,5 +266,13 @@ TEST(LoDTensor, RecordIO) {
   }
 }
 
+TEST(LoDTensor, RecordIO) {
+  TestRecordIO<int>();
+  TestRecordIO<int16_t>();
+  TestRecordIO<uint8_t>();
+  TestRecordIO<float>();
+  TestRecordIO<double>();
+}
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc
index 076c45713015797f86a3611dd333132bae40044d..f92769192c218eb7cdc2350ff6e4721b45005806 100644
--- a/paddle/fluid/framework/op_desc.cc
+++ b/paddle/fluid/framework/op_desc.cc
@@ -20,6 +20,7 @@ limitations under the License. */
 #include <unordered_map>
 #include "glog/logging.h"
 #include "paddle/fluid/framework/block_desc.h"
+#include "paddle/fluid/framework/op_proto_maker.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/shape_inference.h"
@@ -102,7 +103,7 @@ void OpDesc::CopyFrom(const OpDesc &op_desc) {
   need_update_ = true;
 }
 
-OpDesc::OpDesc(const proto::OpDesc &desc, ProgramDesc *prog, BlockDesc *block)
+OpDesc::OpDesc(const proto::OpDesc &desc, BlockDesc *block)
     : desc_(desc), need_update_(false) {
   // restore inputs_
   int input_size = desc_.inputs_size();
@@ -222,6 +223,15 @@ Attribute OpDesc::GetAttr(const std::string &name) const {
   return it->second;
 }
 
+Attribute OpDesc::GetNullableAttr(const std::string &name) const {
+  auto it = attrs_.find(name);
+  if (it != attrs_.end()) {
+    return it->second;
+  } else {
+    return Attribute();
+  }
+}
+
 int OpDesc::GetBlockAttr(const std::string &name) const {
   auto it = attrs_.find(name);
   PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name);
@@ -233,13 +243,8 @@ const std::unordered_map<std::string, Attribute> &OpDesc::GetAttrMap() const {
 }
 
 void OpDesc::Rename(const std::string &old_name, const std::string &new_name) {
-  for (auto &input : inputs_) {
-    std::replace(input.second.begin(), input.second.end(), old_name, new_name);
-  }
-  for (auto &output : outputs_) {
-    std::replace(output.second.begin(), output.second.end(), old_name,
-                 new_name);
-  }
+  RenameInput(old_name, new_name);
+  RenameOutput(old_name, new_name);
   need_update_ = true;
 }
 
@@ -249,6 +254,13 @@ void OpDesc::RenameOutput(const std::string &old_name,
     std::replace(output.second.begin(), output.second.end(), old_name,
                  new_name);
   }
+
+  auto it = attrs_.find(framework::OpProtoAndCheckerMaker::OpRoleVarAttrName());
+  if (it != attrs_.end()) {
+    auto &op_vars = boost::get<std::vector<std::string>>(it->second);
+    std::replace(op_vars.begin(), op_vars.end(), old_name, new_name);
+  }
+
   need_update_ = true;
 }
 
@@ -257,6 +269,13 @@ void OpDesc::RenameInput(const std::string &old_name,
   for (auto &input : inputs_) {
     std::replace(input.second.begin(), input.second.end(), old_name, new_name);
   }
+
+  auto it = attrs_.find(framework::OpProtoAndCheckerMaker::OpRoleVarAttrName());
+  if (it != attrs_.end()) {
+    auto &op_vars = boost::get<std::vector<std::string>>(it->second);
+    std::replace(op_vars.begin(), op_vars.end(), old_name, new_name);
+  }
+
   need_update_ = true;
 }
 
diff --git a/paddle/fluid/framework/op_desc.h b/paddle/fluid/framework/op_desc.h
index 3ee36a47c156da67a9ff70852665fbbd464bea17..a02d3e269129596f65a2fb346e76c1af7fbead95 100644
--- a/paddle/fluid/framework/op_desc.h
+++ b/paddle/fluid/framework/op_desc.h
@@ -33,13 +33,14 @@ class OpDesc {
   OpDesc(const std::string &type, const VariableNameMap &inputs,
          const VariableNameMap &outputs, const AttributeMap &attrs);
 
-  OpDesc(const proto::OpDesc &desc, ProgramDesc *prog, BlockDesc *block);
+  OpDesc(const proto::OpDesc &desc, BlockDesc *block);
 
   explicit OpDesc(BlockDesc *block) : block_(block) {}
 
   OpDesc(const OpDesc &other, BlockDesc *block) {
     *this = other;
     block_ = block;
+    need_update_ = true;
   }
 
   void CopyFrom(const OpDesc &op_desc);
@@ -78,6 +79,8 @@ class OpDesc {
 
   Attribute GetAttr(const std::string &name) const;
 
+  Attribute GetNullableAttr(const std::string &name) const;
+
   int GetBlockAttr(const std::string &name) const;
 
   void Rename(const std::string &old_name, const std::string &new_name);
diff --git a/paddle/fluid/framework/op_kernel_type.h b/paddle/fluid/framework/op_kernel_type.h
index fab20d75f5a45257f243333c1998d7b2549a25f9..f51a184e7bae2283f335fe9462a77b9c5fb831a5 100644
--- a/paddle/fluid/framework/op_kernel_type.h
+++ b/paddle/fluid/framework/op_kernel_type.h
@@ -87,7 +87,14 @@ inline std::string KernelTypeToString(const OpKernelType& kernel_key) {
 }
 
 inline bool NeedTransformLayout(const DataLayout& l, const DataLayout& r) {
-  return l != DataLayout::kAnyLayout && r != DataLayout::kAnyLayout && l != r;
+  bool ret =
+      (l != DataLayout::kAnyLayout && r != DataLayout::kAnyLayout && l != r);
+#ifdef PADDLE_WITH_MKLDNN
+  // Layout transform needed for either non-MKLDNN to MKLDNN or vice versa
+  ret |= (l != DataLayout::kMKLDNN && r == DataLayout::kMKLDNN);
+  ret |= (l == DataLayout::kMKLDNN && r != DataLayout::kMKLDNN);
+#endif
+  return ret;
 }
 
 inline bool TransFromNeeded(const OpKernelType& l, const OpKernelType& r) {
diff --git a/paddle/fluid/framework/op_kernel_type_test.cc b/paddle/fluid/framework/op_kernel_type_test.cc
index d37ce149ce3df63692b41289bb03448d54e392f5..db95861c510b52a5b52229541434e6437d3fb9f4 100644
--- a/paddle/fluid/framework/op_kernel_type_test.cc
+++ b/paddle/fluid/framework/op_kernel_type_test.cc
@@ -27,7 +27,7 @@ TEST(OpKernelType, ToString) {
                               LibraryType::kCUDNN);
 
   ASSERT_EQ(paddle::framework::KernelTypeToString(op_kernel_type),
-            "data_type[float32]:data_layout[NCHW]:place[CPUPlace]:library_type["
+            "data_type[float]:data_layout[NCHW]:place[CPUPlace]:library_type["
             "CUDNN]");
 }
 
diff --git a/paddle/fluid/framework/op_proto_maker.cc b/paddle/fluid/framework/op_proto_maker.cc
index c479d7617cfa34cd381d84d15d5e214d57af52d0..ae9f4efd44acdcdff2806deea6826e4089459a78 100644
--- a/paddle/fluid/framework/op_proto_maker.cc
+++ b/paddle/fluid/framework/op_proto_maker.cc
@@ -13,6 +13,7 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/op_proto_maker.h"
 #include <string>
+#include <vector>
 
 namespace paddle {
 namespace framework {
@@ -55,5 +56,28 @@ void OpProtoAndCheckerMaker::CheckNoDuplicatedInOutAttrs() {
   }
 }
 
+void OpProtoAndCheckerMaker::operator()(proto::OpProto* proto,
+                                        OpAttrChecker* attr_checker) {
+  proto_ = proto;
+  op_checker_ = attr_checker;
+  Make();
+
+  AddAttr<int>(OpRoleAttrName(), "The role of this operator")
+      .InEnum(
+          {static_cast<int>(OpRole::kForward),
+           static_cast<int>(OpRole::kBackward),
+           static_cast<int>(OpRole::kOptimize), static_cast<int>(OpRole::kRPC),
+           static_cast<int>(OpRole::kLoss) | static_cast<int>(OpRole::kForward),
+           static_cast<int>(OpRole::kLoss) |
+               static_cast<int>(OpRole::kBackward),
+           static_cast<int>(OpRole::kNotSpecified)})
+      .SetDefault(static_cast<int>(OpRole::kNotSpecified));
+  AddAttr<std::vector<std::string>>(OpRoleVarAttrName(),
+                                    "Optimized for variable")
+      .SetDefault({});
+
+  Validate();
+}
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/op_proto_maker.h b/paddle/fluid/framework/op_proto_maker.h
index 0beb57ce1609d2e90c05d3255647bd321bc1f6a9..8493b9d8b326c71a33b95bf95e5fc1743c686eb7 100644
--- a/paddle/fluid/framework/op_proto_maker.h
+++ b/paddle/fluid/framework/op_proto_maker.h
@@ -14,56 +14,68 @@ limitations under the License. */
 #pragma once
 
 #include <string>
+#include "glog/logging.h"
 #include "paddle/fluid/framework/attribute.h"
 #include "paddle/fluid/framework/framework.pb.h"
-
 namespace paddle {
 namespace framework {
 
+enum class OpRole {
+  kForward = 0x0000,
+  kBackward = 0x0001,
+  kOptimize = 0x0002,
+  kRPC = 0x0003,
+
+  kLoss = 0x0100,
+  // The default value of op's role. This should be only used for unittests and
+  // CreateOp inside a operator.
+  kNotSpecified = 0x1000,
+};
+
 // this class not only make proto but also init attribute checkers.
 class OpProtoAndCheckerMaker {
  public:
-  using OpProto = proto::OpProto;
-  using OpAttrChecker = framework::OpAttrChecker;
-  OpProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : proto_(proto), op_checker_(op_checker) {}
+  static const char *OpRoleAttrName() { return "op_role"; }
+  static const char *OpRoleVarAttrName() { return "op_role_var"; }
+
+  void operator()(proto::OpProto *proto, OpAttrChecker *attr_checker);
+
+  virtual void Make() = 0;
 
   virtual ~OpProtoAndCheckerMaker() {
-    PADDLE_ENFORCE(validated_, "should call Validate after build");
+    CHECK(validated_) << "should call Validate after build";
   }
 
-  void Validate();
-
  protected:
   struct VariableBuilder {
-    OpProto::Var* var_;
+    proto::OpProto::Var *var_;
 
-    VariableBuilder& AsDuplicable() {
+    VariableBuilder &AsDuplicable() {
       var_->set_duplicable(true);
       return *this;
     }
 
-    VariableBuilder& AsIntermediate() {
+    VariableBuilder &AsIntermediate() {
       var_->set_intermediate(true);
       return *this;
     }
 
-    VariableBuilder& AsDispensable() {
+    VariableBuilder &AsDispensable() {
       var_->set_dispensable(true);
       return *this;
     }
   };
 
-  VariableBuilder AddInput(const std::string& name, const std::string& comment);
+  VariableBuilder AddInput(const std::string &name, const std::string &comment);
 
-  VariableBuilder AddOutput(const std::string& name,
-                            const std::string& comment);
+  VariableBuilder AddOutput(const std::string &name,
+                            const std::string &comment);
 
   template <typename T>
-  TypedAttrChecker<T>& AddAttr(const std::string& name,
-                               const std::string& comment,
+  TypedAttrChecker<T> &AddAttr(const std::string &name,
+                               const std::string &comment,
                                bool generated = false) {
-    auto* attr = proto_->add_attrs();
+    auto *attr = proto_->add_attrs();
     attr->set_name(name);
     attr->set_comment(comment);
     attr->set_generated(generated);
@@ -71,21 +83,15 @@ class OpProtoAndCheckerMaker {
     return op_checker_->AddAttrChecker<T>(name);
   }
 
-  void AddComment(const std::string& comment) { proto_->set_comment(comment); }
+  void AddComment(const std::string &comment) { proto_->set_comment(comment); }
 
  private:
   void CheckNoDuplicatedInOutAttrs();
+  void Validate();
 
-  OpProto* proto_;
-  OpAttrChecker* op_checker_;
+  proto::OpProto *proto_;
+  OpAttrChecker *op_checker_;
   bool validated_{false};
 };
-
-class NOPMaker : public OpProtoAndCheckerMaker {
- public:
-  NOPMaker(OpProto* proto, framework::OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {}
-};
-
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/op_proto_maker_test.cc b/paddle/fluid/framework/op_proto_maker_test.cc
index a8d8c6386af940d4a14016b30de344e1c7877b22..a8030d377fdb4d4aef74b315e21792dad10fac96 100644
--- a/paddle/fluid/framework/op_proto_maker_test.cc
+++ b/paddle/fluid/framework/op_proto_maker_test.cc
@@ -18,9 +18,7 @@ limitations under the License. */
 
 class TestAttrProtoMaker : public paddle::framework::OpProtoAndCheckerMaker {
  public:
-  TestAttrProtoMaker(paddle::framework::proto::OpProto* proto,
-                     paddle::framework::OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() {
     AddAttr<float>("scale", "scale of test op");
     AddAttr<float>("scale", "scale of test op");
   }
@@ -29,15 +27,14 @@ class TestAttrProtoMaker : public paddle::framework::OpProtoAndCheckerMaker {
 TEST(ProtoMaker, DuplicatedAttr) {
   paddle::framework::proto::OpProto op_proto;
   paddle::framework::OpAttrChecker op_checker;
-  auto proto_maker = TestAttrProtoMaker(&op_proto, &op_checker);
-  ASSERT_THROW(proto_maker.Validate(), paddle::platform::EnforceNotMet);
+  TestAttrProtoMaker proto_maker;
+  ASSERT_THROW(proto_maker(&op_proto, &op_checker),
+               paddle::platform::EnforceNotMet);
 }
 
 class TestInOutProtoMaker : public paddle::framework::OpProtoAndCheckerMaker {
  public:
-  TestInOutProtoMaker(paddle::framework::proto::OpProto* proto,
-                      paddle::framework::OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() {
     AddInput("input", "input of test op");
     AddInput("input", "input of test op");
   }
@@ -46,6 +43,7 @@ class TestInOutProtoMaker : public paddle::framework::OpProtoAndCheckerMaker {
 TEST(ProtoMaker, DuplicatedInOut) {
   paddle::framework::proto::OpProto op_proto;
   paddle::framework::OpAttrChecker op_checker;
-  auto proto_maker = TestInOutProtoMaker(&op_proto, &op_checker);
-  ASSERT_THROW(proto_maker.Validate(), paddle::platform::EnforceNotMet);
+  TestAttrProtoMaker proto_maker;
+  ASSERT_THROW(proto_maker(&op_proto, &op_checker),
+               paddle::platform::EnforceNotMet);
 }
diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h
index 748317438b44bc4af84f13b25f8e4f88386388fb..43ab227a9478707445892c14723801992d0041aa 100644
--- a/paddle/fluid/framework/op_registry.h
+++ b/paddle/fluid/framework/op_registry.h
@@ -83,8 +83,14 @@ struct OpKernelRegistrarFunctor<PlaceType, false, I, KernelTypes...> {
 
   void operator()(const char* op_type, const char* library_type) const {
     using T = typename KERNEL_TYPE::ELEMENT_TYPE;
+    std::string library(library_type);
+    std::string data_layout = "ANYLAYOUT";
+    if (library == "MKLDNN") {
+      data_layout = "MKLDNNLAYOUT";
+    }
     OpKernelType key(ToDataType(std::type_index(typeid(T))), PlaceType(),
-                     DataLayout::kAnyLayout, StringToLibraryType(library_type));
+                     StringToDataLayout(data_layout),
+                     StringToLibraryType(library_type));
     OperatorWithKernel::AllOpKernels()[op_type][key].reset(new KERNEL_TYPE);
 
     constexpr auto size = std::tuple_size<std::tuple<KernelTypes...>>::value;
@@ -99,7 +105,8 @@ struct OpKernelRegistrarFunctor<PlaceType, true, I, KernelType...> {
   void operator()(const char* op_type, const char* library_type) const {}
 };
 
-// User can register many kernel in one place. The data type could be different.
+// User can register many kernel in one place. The data type could be
+// different.
 template <typename PlaceType, typename... KernelType>
 class OpKernelRegistrar : public Registrar {
  public:
@@ -149,15 +156,15 @@ class OpKernelRegistrar : public Registrar {
 /**
  * Macro to register OperatorKernel.
  */
-#define REGISTER_OP_KERNEL(op_type, LIBRARY_TYPE, place_class, ...)        \
+#define REGISTER_OP_KERNEL(op_type, library_type, place_class, ...)        \
   STATIC_ASSERT_GLOBAL_NAMESPACE(                                          \
-      __reg_op_kernel_##op_type##_##LIBRARY_TYPE##__,                      \
+      __reg_op_kernel_##op_type##_##library_type##__,                      \
       "REGISTER_OP_KERNEL must be called in global namespace");            \
   static ::paddle::framework::OpKernelRegistrar<place_class, __VA_ARGS__>  \
-      __op_kernel_registrar_##op_type##_##LIBRARY_TYPE##__(#op_type,       \
-                                                           #LIBRARY_TYPE); \
-  int TouchOpKernelRegistrar_##op_type##_##LIBRARY_TYPE() {                \
-    __op_kernel_registrar_##op_type##_##LIBRARY_TYPE##__.Touch();          \
+      __op_kernel_registrar_##op_type##_##library_type##__(#op_type,       \
+                                                           #library_type); \
+  int TouchOpKernelRegistrar_##op_type##_##library_type() {                \
+    __op_kernel_registrar_##op_type##_##library_type##__.Touch();          \
     return 0;                                                              \
   }
 
diff --git a/paddle/fluid/framework/op_registry_test.cc b/paddle/fluid/framework/op_registry_test.cc
index 6dc4cf261bad3c004aa53fba5502fe166e3a47f7..18b1649cc71d5edd5b07740bbad1fe8f81128898 100644
--- a/paddle/fluid/framework/op_registry_test.cc
+++ b/paddle/fluid/framework/op_registry_test.cc
@@ -33,8 +33,7 @@ class CosineOp : public OperatorBase {
 
 class CosineOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
  public:
-  CosineOpProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() {
     AddInput("input", "input of cosine op");
     AddOutput("output", "output of cosine op");
     AddAttr<float>("scale", "scale of cosine op")
@@ -55,8 +54,7 @@ class MyTestOp : public OperatorBase {
 
 class MyTestOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
  public:
-  MyTestOpProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() {
     AddInput("input", "input of cosine op").AsDuplicable();
     AddOutput("output", "output of cosine op").AsIntermediate();
     auto my_checker = [](int i) {
@@ -212,10 +210,7 @@ namespace framework {
 
 class OpKernelTestMaker : public OpProtoAndCheckerMaker {
  public:
-  OpKernelTestMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddComment("NoGradOp, same input output. no Grad");
-  }
+  void Make() { AddComment("NoGradOp, same input output. no Grad"); }
 };
 
 class OpWithKernelTest : public OperatorWithKernel {
@@ -275,9 +270,9 @@ TEST(OperatorRegistrar, CUDA) {
 
 static int op_test_value = 0;
 
-using paddle::platform::DeviceContext;
 using paddle::platform::CPUDeviceContext;
 using paddle::platform::CUDADeviceContext;
+using paddle::platform::DeviceContext;
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index d70f26026c28867e592a9f8e37cc53e6c1d6d85e..c633a2f847683debce08c40b0c2ed6e58c0a7ad1 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -24,6 +24,9 @@ limitations under the License. */
 #include "paddle/fluid/platform/profiler.h"
 
 DECLARE_bool(benchmark);
+DEFINE_bool(check_nan_inf, false,
+            "Checking whether operator produce NAN/INF or not. It will be "
+            "extremely slow so please use this flag wisely.");
 
 namespace paddle {
 namespace framework {
@@ -290,6 +293,38 @@ static Tensor* GetMutableTensorFromVar(Variable* var) {
   }
 }
 
+bool ExecutionContext::HasInput(const std::string& name) const {
+  if (!op_.HasInputs(name)) {
+    return false;
+  }
+  auto& ins = Inputs(name);
+  size_t length = ins.size();
+  if (length == 0) {
+    return false;
+  }
+  PADDLE_ENFORCE_EQ(length, 1UL,
+                    "Input %s should not have more than one inputs", name);
+  auto arg = ins[0];
+  auto* var = arg == kEmptyVarName ? nullptr : scope_.FindVar(arg);
+  return var != nullptr;
+}
+
+bool ExecutionContext::HasOutput(const std::string& name) const {
+  if (!op_.HasOutputs(name)) {
+    return false;
+  }
+  auto& outs = Outputs(name);
+  size_t length = outs.size();
+  if (length == 0) {
+    return false;
+  }
+  PADDLE_ENFORCE_EQ(length, 1UL,
+                    "Output %s should not have more than one inputs", name);
+  auto arg = outs[0];
+  auto* var = arg == kEmptyVarName ? nullptr : scope_.FindVar(arg);
+  return var != nullptr;
+}
+
 template <>
 const Tensor* ExecutionContext::Input<Tensor>(const std::string& name) const {
   auto* var = InputVar(name);
@@ -441,10 +476,25 @@ class RuntimeInferShapeContext : public InferShapeContext {
     auto* out_tensor = out_var->GetMutable<LoDTensor>();
     out_tensor->set_lod(in_tensor.lod());
 
-    // TODO(dzhwinter) : reuse ShareLoD in most operators.
-    // Need to call ShareLayout explicitly in sequence related ops.
-    // Shall we have a better method to shared info between in/out Tensor?
-    out_tensor->set_layout(in_tensor.layout());
+// TODO(dzhwinter) : reuse ShareLoD in most operators.
+// Need to call ShareLayout explicitly in sequence related ops.
+// Shall we have a better method to shared info between in/out Tensor?
+#ifdef PADDLE_WITH_MKLDNN
+    // Fix me: ugly workaround below
+    // Correct solution:
+    //    set_layout() should NOT be called here (i.e. ShareLoD). Instead,
+    //    layout of output tensor should be set "manually" in Compute()
+    //    of each OPKernel. The reason layout should NOT be shared between
+    //    input and output "automatically" (now by InferShape()->ShareLoD())
+    //    is that layout transform may occur after InferShape().
+    // Workaround:
+    //    Skip set_layout() when input layout is kMKLDNN
+    //    This is to avoid kMKLDNN is populated wrongly into a non-MKLDNN
+    //    OPKernel. In all MKLDNN OPkernel, set_layout(kMKLDNN) should be called
+    //    in Compute()
+    if (in_tensor.layout() != DataLayout::kMKLDNN)
+#endif
+      out_tensor->set_layout(in_tensor.layout());
   }
 
   void ShareLayout(const std::string& in, const std::string& out, size_t i = 0,
@@ -466,6 +516,7 @@ class RuntimeInferShapeContext : public InferShapeContext {
  protected:
   DDim GetDim(const std::string& name) const override {
     Variable* var = scope_.FindVar(name);
+    PADDLE_ENFORCE_NOT_NULL(var);
     if (var->IsType<LoDTensor>()) {
       return var->Get<LoDTensor>().dims();
     } else if (var->IsType<SelectedRows>()) {
@@ -513,6 +564,21 @@ class RuntimeInferShapeContext : public InferShapeContext {
   const Scope& scope_;
 };
 
+static void CheckTensorNANOrInf(const std::string& name,
+                                const framework::Tensor& tensor) {
+  if (tensor.memory_size() == 0) {
+    return;
+  }
+  if (tensor.type().hash_code() != typeid(float).hash_code() &&   // NOLINT
+      tensor.type().hash_code() != typeid(double).hash_code()) {  // NOLINT
+    return;
+  }
+  PADDLE_ENFORCE(!framework::TensorContainsInf(tensor),
+                 "Tensor %s contains Inf", name);
+  PADDLE_ENFORCE(!framework::TensorContainsNAN(tensor),
+                 "Tensor %s contains NAN", name);
+}
+
 void OperatorWithKernel::RunImpl(const Scope& scope,
                                  const platform::Place& place) const {
   RuntimeInferShapeContext infer_shape_ctx(*this, scope);
@@ -597,6 +663,16 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
   if (FLAGS_benchmark) {
     new_dev_ctx->Wait();
   }
+
+  if (FLAGS_check_nan_inf) {
+    for (auto& vname : OutputVars(true)) {
+      auto* var = new_scope.FindVar(vname);
+      if (var == nullptr) continue;
+      if (var->IsType<framework::LoDTensor>()) {
+        CheckTensorNANOrInf(vname, var->Get<framework::LoDTensor>());
+      }
+    }
+  }
 }
 
 proto::VarType::Type OperatorWithKernel::IndicateDataType(
@@ -617,8 +693,10 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType(
         }
         if (t != nullptr) {
           int tmp = static_cast<int>(ToDataType(t->type()));
-          PADDLE_ENFORCE(tmp == data_type || data_type == -1,
-                         "DataType of Paddle Op %s must be the same.", Type());
+          PADDLE_ENFORCE(
+              tmp == data_type || data_type == -1,
+              "DataType of Paddle Op %s must be the same. Get %d != %d", Type(),
+              data_type, tmp);
           data_type = tmp;
         }
       }
@@ -636,7 +714,8 @@ OpKernelType OperatorWithKernel::GetExpectedKernelType(
 OpKernelType OperatorWithKernel::GetKernelTypeForVar(
     const std::string& var_name, const Tensor& tensor,
     const OpKernelType& expected_kernel_type) const {
-  return OpKernelType(expected_kernel_type.data_type_, tensor.place());
+  return OpKernelType(expected_kernel_type.data_type_, tensor.place(),
+                      tensor.layout());
 }
 
 }  // namespace framework
diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h
index d373c48b1a75c5f75c7520b56f230bc2c146b174..b1d75d0d0ff3dccc67a1e833ccfe03a4cad8df39 100644
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -33,7 +33,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/variant.h"
-#include "paddle/utils/Error.h"
 
 namespace paddle {
 namespace framework {
@@ -192,6 +191,10 @@ class ExecutionContext {
     return op_.Attr<T>(name);
   }
 
+  bool HasInput(const std::string& name) const;
+
+  bool HasOutput(const std::string& name) const;
+
   size_t InputSize(const std::string& name) const {
     return op_.Inputs(name).size();
   }
diff --git a/paddle/fluid/framework/operator_test.cc b/paddle/fluid/framework/operator_test.cc
index 1bf8c81469bb4afdd00921cfa0acf6089dedbbaa..74043b5d7990178976baf2fad991ae03f9c8dd25 100644
--- a/paddle/fluid/framework/operator_test.cc
+++ b/paddle/fluid/framework/operator_test.cc
@@ -46,8 +46,7 @@ class OpWithoutKernelTest : public OperatorBase {
 
 class OpWithoutKernelCheckerMaker : public OpProtoAndCheckerMaker {
  public:
-  OpWithoutKernelCheckerMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() {
     AddInput("input", "input of test op");
     AddOutput("output", "output of test op");
     AddAttr<float>("scale", "scale of cosine op");
@@ -98,8 +97,7 @@ namespace framework {
 
 class OpKernelTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
  public:
-  OpKernelTestProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() {
     AddInput("x", "input of test op");
     AddOutput("y", "output of test op");
     AddAttr<float>("scale", "scale of cosine op")
@@ -137,9 +135,7 @@ class CPUKernelTest : public OpKernel<float> {
 class OpKernelTestMultiInputsProtoAndCheckerMaker
     : public OpProtoAndCheckerMaker {
  public:
-  OpKernelTestMultiInputsProtoAndCheckerMaker(OpProto* proto,
-                                              OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() {
     AddInput("xs", "inputs of test op").AsDuplicable();
     AddInput("k", "input of test op");
     AddOutput("ys", "outputs of test op").AsDuplicable();
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index 9eea8d1c1861b8a7f6e49621b27c9871b0c1a590..ce56f55e4195a0625cd0754152285b80e4282183 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -22,7 +22,8 @@ limitations under the License. */
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif
 
-#include "paddle/fluid/framework/details/multi_devices_graph_builder.h"
+#include "paddle/fluid/framework/details/graph_builder_factory.h"
+#include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h"
 #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
 #include "paddle/fluid/platform/profiler.h"
 
@@ -42,8 +43,6 @@ class ParallelExecutorPrivate {
 #ifdef PADDLE_WITH_CUDA
   std::unique_ptr<platform::NCCLContextMap> nccl_ctxs_;
 #endif
-
-  std::vector<std::tuple<std::string, proto::VarType::Type, bool>> var_types_;
   bool own_local_scope;
 };
 
@@ -52,13 +51,13 @@ std::vector<Scope *> &ParallelExecutor::GetLocalScopes() {
 }
 
 ParallelExecutor::ParallelExecutor(
-    size_t num_threads, bool use_event,
     const std::vector<platform::Place> &places,
     const std::unordered_set<std::string> &params,
     const std::unordered_set<std::string> &bcast_vars,
     const ProgramDesc &main_program, const std::string &loss_var_name,
-    Scope *scope, const std::vector<Scope *> &local_scopes, bool allow_op_delay,
-    bool use_default_grad_scale)
+    Scope *scope, const std::vector<Scope *> &local_scopes,
+    const ExecutionStrategy &exec_strategy, const BuildStrategy &build_strategy,
+    size_t num_trainers, size_t trainer_id)
     : member_(new ParallelExecutorPrivate(places)) {
   member_->global_scope_ = scope;
 
@@ -80,36 +79,46 @@ ParallelExecutor::ParallelExecutor(
 
 // Bcast Parameters to all GPUs
 #ifdef PADDLE_WITH_CUDA
-  member_->nccl_ctxs_.reset(new platform::NCCLContextMap(member_->places_));
+  auto *nccl_id_var = scope->FindVar(NCCL_ID_VARNAME);
+  ncclUniqueId *nccl_id = nullptr;
+  if (nccl_id_var != nullptr) {
+    nccl_id = nccl_id_var->GetMutable<ncclUniqueId>();
+  }
+  member_->nccl_ctxs_.reset(new platform::NCCLContextMap(
+      member_->places_, nccl_id, num_trainers, trainer_id));
 #endif
   if (platform::is_gpu_place(places[0]) && member_->local_scopes_.size() != 1 &&
       local_scopes.empty()) {  // Is CUDA
     BCastParamsToGPUs(bcast_vars);
   }
-// Startup Program has been run. All local scopes has correct parameters.
+  // Startup Program has been run. All local scopes has correct parameters.
 
-// Step 2. Convert main_program to SSA form and dependency graph. Also, insert
-// ncclOp
-#ifdef PADDLE_WITH_CUDA
-  details::MultiDevSSAGraphBuilder builder(
+  // Step 2. Create vars in each scope;
+  std::vector<details::VariableInfo> var_infos;
+  for (auto *var : main_program.Block(0).AllVars()) {
+    var_infos.emplace_back();
+    var_infos.back().name_ = var->Name();
+    var_infos.back().type_ = var->GetType();
+    var_infos.back().persistable_ = var->Persistable();
+  }
+
+  // Step 3. Convert main_program to SSA form and dependency graph. Also, insert
+  // ncclOp
+
+  details::SSAGraphBuilderFactory builder_factory(
       member_->places_, loss_var_name, params, member_->local_scopes_,
-      member_->nccl_ctxs_.get(), use_default_grad_scale);
-#else
-  details::MultiDevSSAGraphBuilder builder(member_->places_, loss_var_name,
-                                           params, member_->local_scopes_,
-                                           use_default_grad_scale);
+      build_strategy);
+#ifdef PADDLE_WITH_CUDA
+  builder_factory.SetNCCLContextMap(member_->nccl_ctxs_.get());
 #endif
-  auto graph = builder.Build(main_program);
 
   member_->executor_.reset(new details::ThreadedSSAGraphExecutor(
-      num_threads, use_event, member_->local_scopes_, places, std::move(graph),
-      allow_op_delay));
+      exec_strategy, member_->local_scopes_, places,
+      builder_factory.Create()->Build(main_program)));
 
-  // Step 3. Create vars in each scope;
-  for (auto *var : main_program.Block(0).AllVars()) {
-    member_->var_types_.emplace_back(var->Name(), var->GetType(),
-                                     var->Persistable());
-  }
+  member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor(
+      exec_strategy, member_->local_scopes_, std::move(var_infos),
+      member_->places_, std::move(member_->executor_)));
 }
 
 void ParallelExecutor::BCastParamsToGPUs(
@@ -164,42 +173,9 @@ void ParallelExecutor::BCastParamsToGPUs(
 void ParallelExecutor::Run(const std::vector<std::string> &fetch_tensors,
                            const std::string &fetched_var_name) {
   platform::RecordBlock b(0);
-  // Create local scopes.
-  for (auto it = member_->local_scopes_.rbegin();
-       it != member_->local_scopes_.rend(); ++it) {
-    auto &scope = *it;
-    Scope &local_scope = scope->NewScope();
-    *scope->Var(details::kLocalExecScopeName)->GetMutable<Scope *>() =
-        &local_scope;
-
-    for (auto &name_type_pair : member_->var_types_) {
-      if (scope->FindVar(std::get<0>(name_type_pair)) != nullptr) {
-        continue;
-      }
-
-      if (std::get<2>(name_type_pair)) {  // Persistable
-        InitializeVariable(scope->Var(std::get<0>(name_type_pair)),
-                           std::get<1>(name_type_pair));
-      } else {
-        InitializeVariable(local_scope.Var(std::get<0>(name_type_pair)),
-                           std::get<1>(name_type_pair));
-      }
-    }
-  }
-
   auto fetch_data = member_->executor_->Run(fetch_tensors);
   *member_->global_scope_->Var(fetched_var_name)->GetMutable<FeedFetchList>() =
       fetch_data;
-
-  // Wait All computational streams
-  for (auto p : member_->places_) {
-    platform::DeviceContextPool::Instance().Get(p)->Wait();
-  }
-  for (auto &scope : member_->local_scopes_) {
-    auto &local_scope =
-        *scope->Var(details::kLocalExecScopeName)->GetMutable<Scope *>();
-    scope->DeleteScope(local_scope);
-  }
 }
 
 void ParallelExecutor::FeedTensorsIntoLocalScopes(
diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h
index ecd107d81f8f5bf5d8b899d0c07797114a7ab767..5247e790649e76567f4527d54499d6e95dac5c27 100644
--- a/paddle/fluid/framework/parallel_executor.h
+++ b/paddle/fluid/framework/parallel_executor.h
@@ -14,55 +14,60 @@ limitations under the License. */
 
 #pragma once
 
+#include <paddle/fluid/framework/details/build_strategy.h>
 #include <string>
 #include <unordered_set>
 #include <vector>
+#include "paddle/fluid/framework/details/execution_strategy.h"
 #include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/framework/op_info.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/platform/device_context.h"
-
 namespace paddle {
 namespace framework {
 
 class ParallelExecutorPrivate;
 
+using details::BuildStrategy;
+using details::ExecutionStrategy;
+
 class ParallelExecutor {
   DISABLE_COPY_AND_ASSIGN(ParallelExecutor);
 
  public:
-  explicit ParallelExecutor(size_t num_threads, bool use_event,
-                            const std::vector<platform::Place>& places,
-                            const std::unordered_set<std::string>& params,
-                            const std::unordered_set<std::string>& bcast_vars,
-                            const ProgramDesc& main_program,
-                            const std::string& loss_var_name, Scope* scope,
-                            const std::vector<Scope*>& local_scopes,
-                            bool allow_op_delay, bool use_default_grad_scale);
+  explicit ParallelExecutor(const std::vector<platform::Place> &places,
+                            const std::unordered_set<std::string> &params,
+                            const std::unordered_set<std::string> &bcast_vars,
+                            const ProgramDesc &main_program,
+                            const std::string &loss_var_name, Scope *scope,
+                            const std::vector<Scope *> &local_scopes,
+                            const ExecutionStrategy &exec_strategy,
+                            const BuildStrategy &build_strategy,
+                            size_t num_trainers = 1, size_t trainer_id = 0);
 
   ~ParallelExecutor();
 
-  std::vector<Scope*>& GetLocalScopes();
+  std::vector<Scope *> &GetLocalScopes();
 
   /**
    * Feed tensors to local scopes. The size of tensors should be equal to the
    * size of local scopes.
    */
   void FeedTensorsIntoLocalScopes(
-      const std::vector<std::unordered_map<std::string, LoDTensor>>& tensors);
+      const std::vector<std::unordered_map<std::string, LoDTensor>> &tensors);
 
   void FeedAndSplitTensorIntoLocalScopes(
-      const std::unordered_map<std::string, LoDTensor>& tensors);
+      const std::unordered_map<std::string, LoDTensor> &tensors);
 
-  void Run(const std::vector<std::string>& fetch_tensors,
-           const std::string& fetched_var_name);
+  void Run(const std::vector<std::string> &fetch_tensors,
+           const std::string &fetched_var_name);
 
-  void BCastParamsToGPUs(const std::unordered_set<std::string>& vars) const;
+  void BCastParamsToGPUs(const std::unordered_set<std::string> &vars) const;
 
  private:
-  ParallelExecutorPrivate* member_;
+  ParallelExecutorPrivate *member_;
 };
 
 }  // namespace framework
diff --git a/paddle/fluid/framework/program_desc.cc b/paddle/fluid/framework/program_desc.cc
index 64fb028f83a539d17885186d5d8ee6ef26f095e9..1e01a6e900404990e16674755367d2fc6d832725 100644
--- a/paddle/fluid/framework/program_desc.cc
+++ b/paddle/fluid/framework/program_desc.cc
@@ -51,12 +51,15 @@ ProgramDesc::ProgramDesc(const ProgramDesc &o) {
     auto *block = desc_.mutable_blocks(i);
     blocks_.emplace_back(new BlockDesc(*o.blocks_[i], block, this));
   }
-  for (auto &block : blocks_) {
-    for (auto *op : block->AllOps()) {
-      for (const auto &attr : op->Proto()->attrs()) {
-        if (attr.type() == proto::AttrType::BLOCK) {
-          size_t blk_idx = attr.block_idx();
-          op->SetBlockAttr(attr.name(), this->MutableBlock(blk_idx));
+  for (size_t block_id = 0; block_id < blocks_.size(); ++block_id) {
+    auto all_ops = blocks_[block_id]->AllOps();
+    for (size_t op_id = 0; op_id < all_ops.size(); ++op_id) {
+      auto &op = all_ops[op_id];
+      for (const std::string &attr_name : op->AttrNames()) {
+        if (op->GetAttrType(attr_name) == proto::AttrType::BLOCK) {
+          int sub_block_id =
+              o.Block(block_id).Op(op_id)->GetBlockAttr(attr_name);
+          op->SetBlockAttr(attr_name, MutableBlock(sub_block_id));
         }
       }
     }
@@ -86,6 +89,16 @@ ProgramDesc::ProgramDesc(const std::string &binary_str) {
   for (auto &block_desc : *desc_.mutable_blocks()) {
     blocks_.emplace_back(new BlockDesc(this, &block_desc));
   }
+  for (auto &block : blocks_) {
+    for (auto *op : block->AllOps()) {
+      for (const auto &attr : op->Proto()->attrs()) {
+        if (attr.type() == proto::AttrType::BLOCK) {
+          size_t blk_idx = attr.block_idx();
+          op->SetBlockAttr(attr.name(), this->MutableBlock(blk_idx));
+        }
+      }
+    }
+  }
 }
 
 const std::vector<std::string> ProgramDesc::GetFeedTargetNames() {
diff --git a/paddle/fluid/framework/reader.cc b/paddle/fluid/framework/reader.cc
index 76126f3dc64d71770d13f9d66bb30f176c112629..0b36f1116d15004b355e854e101abb9ad3297836 100644
--- a/paddle/fluid/framework/reader.cc
+++ b/paddle/fluid/framework/reader.cc
@@ -25,8 +25,10 @@ void FileReader::ReadNext(std::vector<LoDTensor> *out) {
   if (out->empty()) {
     return;
   }
+
+  PADDLE_ENFORCE_EQ(out->size(), dims_.size());
   for (size_t i = 0; i < dims_.size(); ++i) {
-    auto &actual = out->at(i).dims();
+    auto &actual = (*out)[i].dims();
     auto &expect = dims_[i];
 
     PADDLE_ENFORCE_EQ(actual.size(), expect.size());
diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc
index 9091713158c8071d5386f14250e3c546284e7fd0..bb2d866c824e0fec1b241caea407a38c88a3cb51 100644
--- a/paddle/fluid/framework/scope.cc
+++ b/paddle/fluid/framework/scope.cc
@@ -34,13 +34,7 @@ DEFINE_bool(
 namespace paddle {
 namespace framework {
 
-Scope::~Scope() {
-  DropKids();
-  for (auto& kv : vars_) {
-    VLOG(3) << "Destroy variable " << kv.first;
-    delete kv.second;
-  }
-}
+Scope::~Scope() { DropKids(); }
 
 Scope& Scope::NewScope() const {
   std::unique_lock<std::mutex> lock(mutex_);
@@ -49,10 +43,13 @@ Scope& Scope::NewScope() const {
 }
 
 Variable* Scope::Var(const std::string& name) {
+  // acquire the lock when new var under this scope
+  std::unique_lock<std::mutex> lock(mutex_);
   auto* v = FindVarLocally(name);
   if (v != nullptr) return v;
+
   v = new Variable();
-  vars_[name] = v;
+  vars_[name].reset(v);
   VLOG(3) << "Create variable " << name;
   v->name_ = &(vars_.find(name)->first);
   return v;
@@ -67,22 +64,29 @@ Variable* Scope::Var(std::string* name) {
 }
 
 Variable* Scope::FindVar(const std::string& name) const {
+  // acquire the lock when find var
+  std::unique_lock<std::mutex> lock(mutex_);
+  return FindVarInternal(name);
+}
+
+Variable* Scope::FindVarInternal(const std::string& name) const {
   auto var = FindVarLocally(name);
   if (var != nullptr) {
     return var;
   }
-  return (parent_ == nullptr) ? nullptr : parent_->FindVar(name);
+  return (parent_ == nullptr) ? nullptr : parent_->FindVarInternal(name);
 }
 
 const Scope* Scope::FindScope(const Variable* var) const {
   for (auto& kv : vars_) {
-    if (kv.second == var) {
+    if (kv.second.get() == var) {
       return this;
     }
   }
   return (parent_ == nullptr) ? nullptr : parent_->FindScope(var);
 }
 void Scope::DropKids() {
+  std::unique_lock<std::mutex> lock(mutex_);
   for (Scope* s : kids_) delete s;
   kids_.clear();
 }
@@ -110,10 +114,10 @@ void Scope::DeleteScope(Scope* scope) const {
 }
 
 void Scope::EraseVars(const std::vector<std::string>& var_names) {
+  std::unique_lock<std::mutex> lock(mutex_);
   std::set<std::string> var_set(var_names.begin(), var_names.end());
   for (auto it = vars_.begin(); it != vars_.end();) {
     if (var_set.find(it->first) != var_set.end()) {
-      delete it->second;
       it = vars_.erase(it);
     } else {
       ++it;
@@ -129,7 +133,7 @@ void Scope::Rename(const std::string& origin_name,
   auto new_it = vars_.find(new_name);
   PADDLE_ENFORCE(new_it == vars_.end(),
                  "The variable with name %s is already in the scope", new_name);
-  vars_[new_name] = origin_it->second;
+  vars_[new_name].reset(origin_it->second.release());
   vars_.erase(origin_it);
 }
 
@@ -141,7 +145,7 @@ std::string Scope::Rename(const std::string& origin_name) const {
 
 Variable* Scope::FindVarLocally(const std::string& name) const {
   auto it = vars_.find(name);
-  if (it != vars_.end()) return it->second;
+  if (it != vars_.end()) return it->second.get();
   return nullptr;
 }
 
diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h
index abc82e452d732638a2f7315022074850f299a7ea..98d103d867987fc02dc66df5ac855a14b66b8f03 100644
--- a/paddle/fluid/framework/scope.h
+++ b/paddle/fluid/framework/scope.h
@@ -47,15 +47,18 @@ class Scope {
   Scope& NewScope() const;
 
   /// Create a variable with given name if it doesn't exist.
+  /// Caller doesn't own the returned Variable.
   Variable* Var(const std::string& name);
 
   /// Create a variable with a scope-unique name.
+  /// Caller doesn't own the returned Variable.
   Variable* Var(std::string* name = nullptr);
 
   void EraseVars(const std::vector<std::string>& var_names);
 
   /// Find a variable in the scope or any of its ancestors.  Returns
   /// nullptr if cannot find.
+  /// Caller doesn't own the returned Variable.
   Variable* FindVar(const std::string& name) const;
 
   const Scope* parent() const { return parent_; }
@@ -78,13 +81,21 @@ class Scope {
   // Rename variable to a new name and return the new name
   std::string Rename(const std::string& origin_name) const;
 
-  Variable* FindVarLocally(const std::string& name) const;
-
  private:
   // Call Scope::NewScope for a sub-scope.
   explicit Scope(Scope const* parent) : parent_(parent) {}
 
-  mutable std::unordered_map<std::string, Variable*> vars_;
+  // Called by FindVar recursively.
+  // Caller doesn't own the returned Variable.
+  Variable* FindVarInternal(const std::string& name) const;
+
+  // Called by FindVarInternal and Var.
+  // Caller doesn't own the returned Variable.
+  Variable* FindVarLocally(const std::string& name) const;
+
+  mutable std::unordered_map<std::string, std::unique_ptr<Variable>> vars_;
+
+  // Scope in `kids_` are owned by this class.
   mutable std::list<Scope*> kids_;
   Scope const* parent_{nullptr};
 
diff --git a/paddle/fluid/framework/selected_rows.cc b/paddle/fluid/framework/selected_rows.cc
index 56cf6693caf4529d6e157e6e9a0d5c27d05ee0c3..06ed87e7e8a2d5324b48a466b05207042ec1b7fa 100644
--- a/paddle/fluid/framework/selected_rows.cc
+++ b/paddle/fluid/framework/selected_rows.cc
@@ -18,8 +18,8 @@ namespace paddle {
 namespace framework {
 
 struct ReAllocateVisitor {
-  ReAllocateVisitor(framework::Tensor* tensor, const framework::DDim& dims)
-      : tensor_(tensor), dims_(dims) {}
+  ReAllocateVisitor(const framework::DDim& dims, framework::Tensor* tensor)
+      : dims_(dims), tensor_(tensor) {}
 
   template <typename T>
   void operator()() const {
@@ -34,8 +34,8 @@ struct ReAllocateVisitor {
     tensor_->ShareDataWith(cpu_tensor);
   }
 
-  framework::Tensor* tensor_;
   framework::DDim dims_;
+  framework::Tensor* tensor_;
 };
 
 struct TensorCopyVisitor {
@@ -121,24 +121,29 @@ bool SelectedRows::HasKey(int64_t key) const {
 }
 
 std::vector<std::pair<int64_t, int64_t>> SelectedRows::Get(
-    std::vector<int64_t> keys, framework::Tensor* value) const {
+    const std::vector<int64_t>& keys, framework::Tensor* value) const {
   PADDLE_ENFORCE(value->IsInitialized(),
                  "The value tensor should be initialized.");
   std::vector<std::pair<int64_t, int64_t>> non_keys_pair;
-  int64_t value_width = value_->numel() / value_->dims()[0];
-  PADDLE_ENFORCE_EQ(value_width, value->numel() / value->dims()[0],
-                    "output tensor should have the same shape with table "
-                    "execpt the dims[0].");
-
-  for (size_t i = 0; i < keys.size(); ++i) {
-    int64_t index = Index(keys[i]);
-    if (index == -1) {
-      non_keys_pair.push_back(std::make_pair(keys[i], static_cast<int64_t>(i)));
-    } else {
-      framework::VisitDataType(
-          framework::ToDataType(value_->type()),
-          TensorCopyVisitor(value, i * value_width, *value_.get(),
-                            index * value_width, value_width));
+  if (keys.empty()) {
+    VLOG(3) << "keys is empty, please check data!";
+  } else {
+    int64_t value_width = value_->numel() / value_->dims()[0];
+    PADDLE_ENFORCE_EQ(value_width, value->numel() / value->dims()[0],
+                      "output tensor should have the same shape with table "
+                      "except the dims[0].");
+
+    for (size_t i = 0; i < keys.size(); ++i) {
+      int64_t index = Index(keys[i]);
+      if (index == -1) {
+        non_keys_pair.push_back(
+            std::make_pair(keys[i], static_cast<int64_t>(i)));
+      } else {
+        framework::VisitDataType(
+            framework::ToDataType(value_->type()),
+            TensorCopyVisitor(value, i * value_width, *value_.get(),
+                              index * value_width, value_width));
+      }
     }
   }
   return non_keys_pair;
@@ -153,6 +158,7 @@ bool SelectedRows::Set(int64_t key, const framework::Tensor& value) {
   }
   PADDLE_ENFORCE_EQ(value.dims()[0], static_cast<size_t>(1),
                     "The first dim of value should be 1.");
+  std::lock_guard<std::mutex> lock(*auto_grown_mutex_.get());
   auto index = Index(key);
   bool is_new_key = false;
   if (index == -1) {
@@ -164,7 +170,7 @@ bool SelectedRows::Set(int64_t key, const framework::Tensor& value) {
       auto dims = value_->dims();
       dims[0] = (dims[0] + 1) << 1;
       framework::VisitDataType(framework::ToDataType(value.type()),
-                               ReAllocateVisitor(value_.get(), dims));
+                               ReAllocateVisitor(dims, value_.get()));
     }
   }
 
diff --git a/paddle/fluid/framework/selected_rows.h b/paddle/fluid/framework/selected_rows.h
index c27c927ee751c4392840bfb71f4814991b23a8c9..7160670ddd204c20021ea87cdd67ee4721d03451 100644
--- a/paddle/fluid/framework/selected_rows.h
+++ b/paddle/fluid/framework/selected_rows.h
@@ -15,6 +15,8 @@ limitations under the License. */
 #pragma once
 
 #include <algorithm>
+#include <memory>
+#include <mutex>  // NOLINT
 #include <utility>
 #include <vector>
 
@@ -46,11 +48,13 @@ class SelectedRows {
   SelectedRows(const std::vector<int64_t>& rows, const int64_t& height)
       : rows_(rows), height_(height) {
     value_.reset(new Tensor());
+    auto_grown_mutex_.reset(new std::mutex);
   }
 
   SelectedRows() {
     height_ = 0;
     value_.reset(new Tensor());
+    auto_grown_mutex_.reset(new std::mutex);
   }
 
   platform::Place place() const { return value_->place(); }
@@ -82,7 +86,7 @@ class SelectedRows {
    * @return a list of pair which contains the non-exists key and the index in
    * the value
    */
-  std::vector<std::pair<int64_t, int64_t>> Get(std::vector<int64_t> keys,
+  std::vector<std::pair<int64_t, int64_t>> Get(const std::vector<int64_t>& keys,
                                                framework::Tensor* value) const;
 
   /*
@@ -125,6 +129,7 @@ class SelectedRows {
   Vector<int64_t> rows_;
   std::unique_ptr<Tensor> value_{nullptr};
   int64_t height_;
+  std::unique_ptr<std::mutex> auto_grown_mutex_{nullptr};
 };
 
 /*
diff --git a/paddle/fluid/framework/shape_inference.h b/paddle/fluid/framework/shape_inference.h
index 46c8feec001584a872f7f62682080e0e72c06f50..5f497cafa0f75f7c23d550ef767d55274de7c900 100644
--- a/paddle/fluid/framework/shape_inference.h
+++ b/paddle/fluid/framework/shape_inference.h
@@ -63,6 +63,7 @@ class InferShapeContext {
 
   std::vector<InferShapeVarPtr> GetInputVarPtrs(const std::string &name);
   std::vector<InferShapeVarPtr> GetOutputVarPtrs(const std::string &name);
+  virtual InferShapeVarPtr GetVarPtr(const std::string &name) = 0;
 
   // Note: In while op, we need this to be public
   void SetDims(const std::vector<std::string> &names,
@@ -81,8 +82,6 @@ class InferShapeContext {
       const std::vector<std::string> &names) const;
 
   virtual proto::VarType::Type GetVarType(const std::string &name) const = 0;
-
-  virtual InferShapeVarPtr GetVarPtr(const std::string &name) = 0;
 };
 
 }  // namespace framework
diff --git a/paddle/fluid/framework/tensor.cc b/paddle/fluid/framework/tensor.cc
index e97ada06f06d0538f17160220e3aa3f4ffc55520..c7286dacf01659f3af0927a71856e5a6496cb877 100644
--- a/paddle/fluid/framework/tensor.cc
+++ b/paddle/fluid/framework/tensor.cc
@@ -15,5 +15,102 @@ limitations under the License. */
 #include "paddle/fluid/framework/tensor.h"
 
 namespace paddle {
-namespace framework {}
+namespace framework {
+extern size_t SizeOfType(std::type_index type);
+void Tensor::check_memory_size() const {
+  PADDLE_ENFORCE_NOT_NULL(
+      holder_, "Tensor holds no memory. Call Tensor::mutable_data first.");
+  PADDLE_ENFORCE_LE(
+      numel() * SizeOfType(type()), memory_size(),
+      "Tensor's dims_ is out of bound. Call Tensor::mutable_data "
+      "first to re-allocate memory.\n"
+      "or maybe the required data-type mismatches the data already stored.");
+}
+
+size_t Tensor::memory_size() const {
+  return holder_ == nullptr ? 0UL : holder_->size() - offset_;
+}
+
+void* Tensor::mutable_data(platform::Place place, std::type_index type) {
+  if (holder_ != nullptr) {
+    holder_->set_type(type);
+  }
+  PADDLE_ENFORCE_GE(numel(), 0,
+                    "When calling this method, the Tensor's numel must be "
+                    "equal or larger than zero. "
+                    "Please check Tensor::Resize has been called first.");
+  int64_t size = numel() * SizeOfType(type);
+  /* some versions of boost::variant don't have operator!= */
+  if (holder_ == nullptr || !(holder_->place() == place) ||
+      holder_->size() < size + offset_) {
+    if (platform::is_cpu_place(place)) {
+      holder_.reset(new PlaceholderImpl<platform::CPUPlace>(
+          boost::get<platform::CPUPlace>(place), size, type));
+    } else if (platform::is_gpu_place(place) ||
+               platform::is_cuda_pinned_place(place)) {
+#ifndef PADDLE_WITH_CUDA
+      PADDLE_THROW(
+          "CUDAPlace or CUDAPinnedPlace is not supported in CPU-only mode.");
+    }
+#else
+      if (platform::is_gpu_place(place)) {
+        holder_.reset(new PlaceholderImpl<platform::CUDAPlace>(
+            boost::get<platform::CUDAPlace>(place), size, type));
+      } else if (platform::is_cuda_pinned_place(place)) {
+        holder_.reset(new PlaceholderImpl<platform::CUDAPinnedPlace>(
+            boost::get<platform::CUDAPinnedPlace>(place), size, type));
+      }
+    }
+#endif
+    offset_ = 0;
+  }
+  return reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
+                                 offset_);
+}
+
+void* Tensor::mutable_data(platform::Place place) {
+  PADDLE_ENFORCE(this->holder_ != nullptr,
+                 "Cannot invoke mutable data if current hold nothing.");
+  return mutable_data(place, holder_->type());
+}
+
+Tensor& Tensor::ShareDataWith(const Tensor& src) {
+  src.check_memory_size();
+  *this = src;
+  return *this;
+}
+
+Tensor Tensor::Slice(int begin_idx, int end_idx) const {
+  check_memory_size();
+  PADDLE_ENFORCE_GE(begin_idx, 0,
+                    "The start row index must be greater than 0.");
+  PADDLE_ENFORCE_LE(end_idx, dims_[0], "The end row index is out of bound.");
+  PADDLE_ENFORCE_LT(
+      begin_idx, end_idx,
+      "The start row index must be lesser than the end row index.");
+
+  if (dims_[0] == 1) {
+    return *this;
+  } else {
+    size_t base = numel() / dims_[0];
+    Tensor dst;
+    dst.holder_ = holder_;
+    dst.set_layout(layout_);
+    DDim dst_dims = dims_;
+    dst_dims[0] = end_idx - begin_idx;
+    dst.Resize(dst_dims);
+    dst.offset_ = offset_ + begin_idx * base * SizeOfType(type());
+    return dst;
+  }
+}
+
+Tensor& Tensor::Resize(const DDim& dims) {
+  dims_ = dims;
+  return *this;
+}
+
+const DDim& Tensor::dims() const { return dims_; }
+
+int64_t Tensor::numel() const { return product(dims_); }
+}  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h
index 6f878541e6de1deec1829145b1b325ecd176a034..ef224d68f1fc561f45e9d7a81425e62655457648 100644
--- a/paddle/fluid/framework/tensor.h
+++ b/paddle/fluid/framework/tensor.h
@@ -34,6 +34,28 @@ namespace framework {
 class LoDTensor;
 
 class Tensor {
+#ifdef PADDLE_WITH_MKLDNN
+
+ public:
+  inline mkldnn::memory::format format() const { return format_; }
+
+  inline void set_format(const mkldnn::memory::format format) {
+    format_ = format;
+  }
+
+ protected:
+  /**
+   * @brief the detail format of memory block which have layout as kMKLDNN
+   *
+   * @note MKLDNN lib support various memory format like nchw, nhwc, nChw8C,
+   *       nChw16c, etc. For a MKLDNN memory block, layout will be set as
+   *       DataLayout::kMKLDNN meanwhile detail memory format will be kept in
+   *       this field.
+   */
+
+  mkldnn::memory::format format_ = mkldnn::memory::format::format_undef;
+#endif
+
  public:
   template <typename T, size_t D, int MajorType, typename IndexType>
   friend struct EigenTensor;
@@ -54,26 +76,24 @@ class Tensor {
 
   /*! Return a pointer to mutable memory block. */
   template <typename T>
-  inline T* data();
+  T* data();
 
   /*! Return a pointer to constant memory block. */
   template <typename T>
-  inline const T* data() const;
+  const T* data() const;
 
-  inline bool IsInitialized() const;
-
-  inline void switch_place(platform::Place new_place);
+  bool IsInitialized() const;
 
   /**
    * @brief   Return a pointer to mutable memory block.
    * @note    If not exist, then allocation.
    */
   template <typename T>
-  inline T* mutable_data(platform::Place place);
+  T* mutable_data(platform::Place place);
 
-  inline void* mutable_data(platform::Place place, std::type_index type);
+  void* mutable_data(platform::Place place, std::type_index type);
 
-  inline void* mutable_data(platform::Place place);
+  void* mutable_data(platform::Place place);
 
   /**
    * @brief     Return a pointer to mutable memory block.
@@ -84,19 +104,19 @@ class Tensor {
    * @note      If not exist, then allocation.
    */
   template <typename T>
-  inline T* mutable_data(DDim dims, platform::Place place);
+  T* mutable_data(DDim dims, platform::Place place);
 
   /*! Return the dimensions of the memory block. */
-  inline const DDim& dims() const;
+  const DDim& dims() const;
 
   /*! Return the numel of the memory block. */
-  inline int64_t numel() const;
+  int64_t numel() const;
 
   /*! Resize the dimensions of the memory block. */
-  inline Tensor& Resize(const DDim& dims);
+  Tensor& Resize(const DDim& dims);
 
   /*! The internal of two tensors share the same memory block. */
-  inline Tensor& ShareDataWith(const Tensor& src);
+  Tensor& ShareDataWith(const Tensor& src);
 
   /**
    * @brief  Return a sub-tensor of the given tensor.
@@ -106,7 +126,7 @@ class Tensor {
    * @param[in] end_idx     The index of the end row(exclusive) to slice.
    *                        The index number begins from 0.
    */
-  inline Tensor Slice(int begin_idx, int end_idx) const;
+  Tensor Slice(int begin_idx, int end_idx) const;
 
   platform::Place place() const {
     PADDLE_ENFORCE_NOT_NULL(
@@ -123,11 +143,11 @@ class Tensor {
   // memory size returns the holding memory size in byte.
   size_t memory_size() const;
 
-  inline void check_memory_size() const;
+  void check_memory_size() const;
 
-  inline DataLayout layout() const { return layout_; }
+  DataLayout layout() const { return layout_; }
 
-  inline void set_layout(const DataLayout layout) { layout_ = layout; }
+  void set_layout(const DataLayout layout) { layout_ = layout; }
 
  private:
   /**
@@ -197,8 +217,10 @@ class Tensor {
    *       N,C,H,W for respectively the batch size, the number of
    *       feature maps, the height.
    */
-
-  DataLayout layout_ = DataLayout::kNHWC;
+  // Fix me: here just change the default layout to kNCHW
+  // it doesn't fix the real issue, i.e. feeder should set up tensor layout
+  // according to actual input data
+  DataLayout layout_ = DataLayout::kNCHW;
 
   /**
    * @brief   A PlaceHolder may be shared by more than one tensor.
@@ -210,15 +232,6 @@ class Tensor {
   size_t offset_;
 };
 
-inline void Tensor::switch_place(platform::Place new_place) {
-  if (holder_->place() == new_place) {
-    return;
-  }
-
-  // TODO(tonyyang-svail): do memcpy here.
-  PADDLE_THROW("Not Implemented");
-}
-
 }  // namespace framework
 }  // namespace paddle
 
diff --git a/paddle/fluid/framework/tensor_impl.h b/paddle/fluid/framework/tensor_impl.h
index f49d1a47a325b2aac6185073203df124be18b54d..96114678a9992f2975c4173c7cc003114f04d8df 100644
--- a/paddle/fluid/framework/tensor_impl.h
+++ b/paddle/fluid/framework/tensor_impl.h
@@ -13,73 +13,18 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/float16.h"
 
 namespace paddle {
 namespace framework {
-
-template <typename... T>
-struct SizeOfTypeFunctor;
-
-template <typename T>
-struct SizeOfTypeFunctor<T> {
-  size_t operator()(std::type_index type) const {
-    if (typeid(T).hash_code() == type.hash_code()) {
-      return sizeof(T);
-    } else {
-      return 0UL;
-    }
-  }
-};
-
-template <>
-struct SizeOfTypeFunctor<> {
-  size_t operator()(std::type_index type) const { return 0UL; }
-};
-
-template <typename HEAD, typename... TAIL>
-struct SizeOfTypeFunctor<HEAD, TAIL...> {
-  size_t operator()(std::type_index type) const {
-    SizeOfTypeFunctor<HEAD> head;
-    size_t head_size = head(type);
-    if (head_size != 0) {
-      return head_size;
-    }
-    SizeOfTypeFunctor<TAIL...> tail;
-    return tail(type);
-  }
-};
-
-static inline size_t SizeOfType(std::type_index type) {
-  SizeOfTypeFunctor<int, float, double, int16_t, int64_t, bool, size_t,
-                    platform::float16>
-      functor;
-  size_t size = functor(type);
-  PADDLE_ENFORCE(size != 0UL, "Cannot get size of type %s", type.name());
-  return size;
-}
-
-inline void Tensor::check_memory_size() const {
-  PADDLE_ENFORCE_NOT_NULL(
-      holder_, "Tensor holds no memory. Call Tensor::mutable_data first.");
-  PADDLE_ENFORCE_LE(
-      numel() * SizeOfType(type()), memory_size(),
-      "Tensor's dims_ is out of bound. Call Tensor::mutable_data "
-      "first to re-allocate memory.\n"
-      "or maybe the required data-type mismatches the data already stored.");
-}
-
-inline size_t Tensor::memory_size() const {
-  return holder_ == nullptr ? 0UL : holder_->size() - offset_;
-}
-
 template <typename T>
 inline const T* Tensor::data() const {
   check_memory_size();
   PADDLE_ENFORCE(std::is_same<T, void>::value ||
-                     holder_->type().hash_code() == typeid(T).hash_code(),
+                     holder_->type() == std::type_index(typeid(T)),
                  "Tensor holds the wrong type, it holds %s",
                  this->holder_->type().name());
 
@@ -93,7 +38,7 @@ template <typename T>
 inline T* Tensor::data() {
   check_memory_size();
   PADDLE_ENFORCE(std::is_same<T, void>::value ||
-                     holder_->type().hash_code() == typeid(T).hash_code(),
+                     holder_->type() == std::type_index(typeid(T)),
                  "Tensor holds the wrong type, it holds %s",
                  this->holder_->type().name());
   return reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
@@ -113,88 +58,6 @@ inline T* Tensor::mutable_data(platform::Place place) {
   return reinterpret_cast<T*>(mutable_data(place, typeid(T)));
 }
 
-inline void* Tensor::mutable_data(platform::Place place, std::type_index type) {
-  if (holder_ != nullptr) {
-    holder_->set_type(type);
-  }
-  PADDLE_ENFORCE_GE(numel(), 0,
-                    "When calling this method, the Tensor's numel must be "
-                    "equal or larger than zero. "
-                    "Please check Tensor::Resize has been called first.");
-  int64_t size = numel() * SizeOfType(type);
-  /* some versions of boost::variant don't have operator!= */
-  if (holder_ == nullptr || !(holder_->place() == place) ||
-      holder_->size() < size + offset_) {
-    if (platform::is_cpu_place(place)) {
-      holder_.reset(new PlaceholderImpl<platform::CPUPlace>(
-          boost::get<platform::CPUPlace>(place), size, type));
-    } else if (platform::is_gpu_place(place) ||
-               platform::is_cuda_pinned_place(place)) {
-#ifndef PADDLE_WITH_CUDA
-      PADDLE_THROW(
-          "CUDAPlace or CUDAPinnedPlace is not supported in CPU-only mode.");
-    }
-#else
-      if (platform::is_gpu_place(place)) {
-        holder_.reset(new PlaceholderImpl<platform::CUDAPlace>(
-            boost::get<platform::CUDAPlace>(place), size, type));
-      } else if (platform::is_cuda_pinned_place(place)) {
-        holder_.reset(new PlaceholderImpl<platform::CUDAPinnedPlace>(
-            boost::get<platform::CUDAPinnedPlace>(place), size, type));
-      }
-    }
-#endif
-    offset_ = 0;
-  }
-  return reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
-                                 offset_);
-}
-
-inline void* Tensor::mutable_data(platform::Place place) {
-  PADDLE_ENFORCE(this->holder_ != nullptr,
-                 "Cannot invoke mutable data if current hold nothing.");
-  return mutable_data(place, holder_->type());
-}
-
-inline Tensor& Tensor::ShareDataWith(const Tensor& src) {
-  src.check_memory_size();
-  *this = src;
-  return *this;
-}
-
-inline Tensor Tensor::Slice(int begin_idx, int end_idx) const {
-  check_memory_size();
-  PADDLE_ENFORCE_GE(begin_idx, 0,
-                    "The start row index must be greater than 0.");
-  PADDLE_ENFORCE_LE(end_idx, dims_[0], "The end row index is out of bound.");
-  PADDLE_ENFORCE_LT(
-      begin_idx, end_idx,
-      "The start row index must be lesser than the end row index.");
-
-  if (dims_[0] == 1) {
-    return *this;
-  } else {
-    size_t base = numel() / dims_[0];
-    Tensor dst;
-    dst.holder_ = holder_;
-    dst.set_layout(layout_);
-    DDim dst_dims = dims_;
-    dst_dims[0] = end_idx - begin_idx;
-    dst.Resize(dst_dims);
-    dst.offset_ = offset_ + begin_idx * base * SizeOfType(type());
-    return dst;
-  }
-}
-
-inline Tensor& Tensor::Resize(const DDim& dims) {
-  dims_ = dims;
-  return *this;
-}
-
-inline const DDim& Tensor::dims() const { return dims_; }
-
-inline int64_t Tensor::numel() const { return product(dims_); }
-
 inline Tensor ReshapeToMatrix(const Tensor& src, int num_col_dims) {
   Tensor res;
   res.ShareDataWith(src);
diff --git a/paddle/fluid/framework/tensor_test.cc b/paddle/fluid/framework/tensor_test.cc
index e1012de2ec36eb4a858202d56a678b6a204c2f0a..0a1cb6d5703dace5e6be73285655ecd9d2ad89fb 100644
--- a/paddle/fluid/framework/tensor_test.cc
+++ b/paddle/fluid/framework/tensor_test.cc
@@ -209,7 +209,7 @@ TEST(Tensor, ReshapeToMatrix) {
 
 TEST(Tensor, Layout) {
   framework::Tensor src;
-  ASSERT_EQ(src.layout(), framework::DataLayout::kNHWC);
+  ASSERT_EQ(src.layout(), framework::DataLayout::kNCHW);
   src.set_layout(framework::DataLayout::kAnyLayout);
   ASSERT_EQ(src.layout(), framework::DataLayout::kAnyLayout);
 }
diff --git a/paddle/fluid/framework/var_type_inference_test.cc b/paddle/fluid/framework/var_type_inference_test.cc
index 9e33003b442762210c990b35f30bc3524963b8b4..14b81ddfecb8c996ae8709910c022a074e91eb3c 100644
--- a/paddle/fluid/framework/var_type_inference_test.cc
+++ b/paddle/fluid/framework/var_type_inference_test.cc
@@ -24,8 +24,7 @@ namespace framework {
 
 class SumOpMaker : public OpProtoAndCheckerMaker {
  public:
-  SumOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() {
     AddInput("X", "").AsDuplicable();
     AddOutput("Out", "");
     AddComment("");
diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt
index 50f635a41a99b2ae292d13afde5637a3bf4e6f8c..ec16a1c600a3bafc1c4cbbd920360253c106e3a1 100644
--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
@@ -1,17 +1,23 @@
 set(FLUID_CORE_MODULES proto_desc memory lod_tensor executor init)
 
+# TODO(panyx0718): Should this be called paddle_fluid_inference_api_internal?
 cc_library(paddle_fluid_api
     SRCS io.cc
     DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
 
-# Create static library
 get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
-cc_library(paddle_fluid DEPS ${fluid_modules})
 
+if(WITH_CONTRIB)
+  set(fluid_modules "${fluid_modules}" paddle_inference_api)
+endif()
+
+# Create static library
+cc_library(paddle_fluid DEPS ${fluid_modules} paddle_fluid_api)
 # Create shared library
 cc_library(paddle_fluid_shared SHARED
     SRCS io.cc
-    DEPS ${fluid_modules})
+    DEPS ${fluid_modules} paddle_fluid_api)
+
 set_target_properties(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid)
 if(NOT APPLE)
   # TODO(liuyiqun): Temporarily disable the link flag because it is not support on Mac.
@@ -20,7 +26,9 @@ if(NOT APPLE)
 endif()
 
 if(WITH_TESTING)
+  # both tests/book and analysis depends the models that generated by python/paddle/fluid/tests/book
   add_subdirectory(tests/book)
+  add_subdirectory(analysis)
 endif()
 
 if (TENSORRT_FOUND)
diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..50835784440bfa177e38f9760bb4a47ad335a9e1
--- /dev/null
+++ b/paddle/fluid/inference/analysis/CMakeLists.txt
@@ -0,0 +1,23 @@
+set(FLUID_CORE_MODULES proto_desc memory lod_tensor executor init)
+cc_library(analysis SRCS dot.cc node.cc data_flow_graph.cc graph_traits.cc subgraph_splitter.cc fluid_to_data_flow_graph_pass.cc
+  DEPS paddle_fluid)
+cc_test(test_node SRCS node_tester.cc DEPS analysis)
+cc_test(test_dot SRCS dot_tester.cc DEPS analysis)
+
+set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests)
+
+cc_test(test_data_flow_graph SRCS data_flow_graph_tester.cc DEPS analysis ${FLUID_CORE_MODULES} paddle_fluid
+  ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model)
+set_tests_properties(test_data_flow_graph PROPERTIES DEPENDS test_word2vec)
+
+cc_test(test_subgraph_splitter
+        SRCS subgraph_splitter_tester.cc
+        DEPS analysis paddle_fluid tensor
+        ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model)
+set_tests_properties(test_subgraph_splitter PROPERTIES DEPENDS test_word2vec)
+
+cc_test(test_dfg_graphviz_draw_pass
+        SRCS dfg_graphviz_draw_pass_tester.cc
+        DEPS analysis
+        ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model)
+set_tests_properties(test_dfg_graphviz_draw_pass PROPERTIES DEPENDS test_word2vec)
diff --git a/paddle/fluid/inference/analysis/data_flow_graph.cc b/paddle/fluid/inference/analysis/data_flow_graph.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4220451e3caee62caa51af5bc33d6dd3fd891018
--- /dev/null
+++ b/paddle/fluid/inference/analysis/data_flow_graph.cc
@@ -0,0 +1,205 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/inference/analysis/data_flow_graph.h"
+#include "paddle/fluid/inference/analysis/dot.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+// It is a better idea that the inputs and outputs of this graph is set manully
+// before, but there must be a Pass that helps to prune the unnecessary ops that
+// do not contribute to the given targets, so in this pass, analysis and get the
+// inputs and outputs is OK.
+void DataFlowGraph::Build() {
+  inputs.clear();
+  outputs.clear();
+  std::unordered_set<Node *> ins;
+  std::unordered_set<Node *> outs;
+  for (auto &node : nodes.nodes()) {
+    for (auto *in : node->inlinks) {
+      ins.insert(in);
+    }
+    for (auto *out : node->outlinks) {
+      outs.insert(out);
+    }
+  }
+
+  // The nodes that in ins but not in outs is the graph's inputs
+  // similarly, the nodes that in outs but not in ins is the graphs' outputs
+  for (auto *in : ins) {
+    if (!outs.count(in)) {
+      inputs.push_back(in);
+    }
+  }
+  for (auto *out : outs) {
+    if (!outs.count(out)) {
+      outputs.push_back(out);
+    }
+  }
+}
+
+std::string DataFlowGraph::DotString() const {
+  Dot dot;
+
+  // Add nodes
+  for (size_t i = 0; i < nodes.size(); i++) {
+    const Node &node = nodes.Get(i);
+    switch (node.type()) {
+      case Node::Type::kValue:
+        dot.AddNode(node.repr(), node.dot_attrs());
+        break;
+      case Node::Type::kFunction:
+        dot.AddNode(node.repr(), node.dot_attrs());
+        break;
+      case Node::Type::kFunctionBlock:
+        dot.AddNode(node.repr(), node.dot_attrs());
+        break;
+      default:
+        PADDLE_THROW("unsupported Node type %d", static_cast<int>(node.type()));
+    }
+  }
+
+  // Add edges
+  for (size_t i = 0; i < nodes.size(); i++) {
+    const Node &node = nodes.Get(i);
+    for (auto &in : node.inlinks) {
+      dot.AddEdge(in->repr(), node.repr(), {});
+    }
+  }
+  return dot.Build();
+}
+
+//
+// NodesBFSIterator
+//
+
+GraphTraits<DataFlowGraph>::NodesBFSIterator::NodesBFSIterator(
+    const std::vector<Node *> &source)
+    : queue_(source.begin(), source.end()) {}
+
+// GraphTraits<DataFlowGraph>::NodesBFSIterator::NodesBFSIterator(
+//     GraphTraits<DataFlowGraph>::NodesBFSIterator &&other) noexcept
+//     : queue_(std::move(other.queue_)),
+//       visited_(std::move(other.visited_)) {}
+
+GraphTraits<DataFlowGraph>::NodesBFSIterator::NodesBFSIterator(
+    const GraphTraits<DataFlowGraph>::NodesBFSIterator &other)
+    : queue_(other.queue_), visited_(other.visited_) {}
+
+Node &GraphTraits<DataFlowGraph>::NodesBFSIterator::operator*() {
+  PADDLE_ENFORCE(!queue_.empty());
+  return *queue_.front();
+}
+
+Node *GraphTraits<DataFlowGraph>::NodesBFSIterator::operator->() {
+  PADDLE_ENFORCE(!queue_.empty());
+  return queue_.front();
+}
+
+GraphTraits<DataFlowGraph>::NodesBFSIterator &
+GraphTraits<DataFlowGraph>::NodesBFSIterator::operator=(
+    const GraphTraits<DataFlowGraph>::NodesBFSIterator &other) {
+  queue_ = other.queue_;
+  visited_ = other.visited_;
+  return *this;
+}
+
+GraphTraits<DataFlowGraph>::NodesBFSIterator
+    &GraphTraits<DataFlowGraph>::NodesBFSIterator::operator++() {
+  PADDLE_ENFORCE(!queue_.empty());
+  auto *cur = queue_.front();
+  visited_.insert(cur);
+  queue_.pop_front();
+  for (auto *output : cur->outlinks) {
+    if (!visited_.count(output)) {
+      queue_.push_back(output);
+      visited_.insert(output);
+    }
+  }
+  return *this;
+}
+
+bool GraphTraits<DataFlowGraph>::NodesBFSIterator::operator==(
+    const GraphTraits<DataFlowGraph>::NodesBFSIterator &other) {
+  if (queue_.empty()) return other.queue_.empty();
+  if ((!queue_.empty()) && (!other.queue_.empty())) {
+    return queue_.front() == other.queue_.front() &&
+           visited_.size() == other.visited_.size();  // here need to check the
+                                                      // equality of queue and
+    // visited. Just a light but week implementation.
+  }
+  return false;
+}
+
+//
+// NodesDFSIterator
+//
+GraphTraits<DataFlowGraph>::NodesDFSIterator::NodesDFSIterator(
+    const std::vector<Node *> &source) {
+  for (auto *x : source) stack_.push(x);
+}
+
+// GraphTraits<DataFlowGraph>::NodesDFSIterator::NodesDFSIterator(
+//     GraphTraits<DataFlowGraph>::NodesDFSIterator &&other) noexcept
+//     : stack_(std::move(other.stack_)),
+//       visited_(std::move(other.visited_)) {}
+
+GraphTraits<DataFlowGraph>::NodesDFSIterator::NodesDFSIterator(
+    const GraphTraits<DataFlowGraph>::NodesDFSIterator &other)
+    : stack_(other.stack_), visited_(other.visited_) {}
+
+Node &GraphTraits<DataFlowGraph>::NodesDFSIterator::operator*() {
+  PADDLE_ENFORCE(!stack_.empty());
+  return *stack_.top();
+}
+
+GraphTraits<DataFlowGraph>::NodesDFSIterator
+    &GraphTraits<DataFlowGraph>::NodesDFSIterator::operator++() {
+  if (stack_.empty()) return *this;
+  visited_.insert(stack_.top());
+  auto *cur = stack_.top();
+  stack_.pop();
+  for (auto *x : cur->outlinks) {
+    if (!visited_.count(x)) {
+      stack_.push(x);
+      visited_.insert(x);
+    }
+  }
+  return *this;
+}
+bool GraphTraits<DataFlowGraph>::NodesDFSIterator::operator==(
+    const GraphTraits<DataFlowGraph>::NodesDFSIterator &other) {
+  if (stack_.empty()) return other.stack_.empty();
+  if ((!stack_.empty()) && (!other.stack_.empty())) {
+    return stack_.top() == other.stack_.top();
+  }
+  return false;
+}
+
+GraphTraits<DataFlowGraph>::NodesDFSIterator &
+GraphTraits<DataFlowGraph>::NodesDFSIterator::operator=(
+    const GraphTraits<DataFlowGraph>::NodesDFSIterator &other) {
+  stack_ = other.stack_;
+  visited_ = other.visited_;
+  return *this;
+}
+Node *GraphTraits<DataFlowGraph>::NodesDFSIterator::operator->() {
+  return stack_.top();
+}
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/data_flow_graph.h b/paddle/fluid/inference/analysis/data_flow_graph.h
new file mode 100644
index 0000000000000000000000000000000000000000..913e344d371ddf3ea05a53c216e5b3bea8f11c7b
--- /dev/null
+++ b/paddle/fluid/inference/analysis/data_flow_graph.h
@@ -0,0 +1,162 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+/*
+ * Data flow graph is an pass that build the basic graph. It contains a graph
+ * and the iterators that enable the iteration over the graph.
+ */
+
+#pragma once
+
+#include <deque>
+#include <stack>
+#include <string>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+#include "paddle/fluid/inference/analysis/graph_traits.h"
+#include "paddle/fluid/inference/analysis/node.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+/*
+ * DataFlowGraph - A container of Value and Function Nodes.
+ */
+struct DataFlowGraph {
+  NodeMap nodes;
+  std::vector<Node *> inputs;
+  std::vector<Node *> outputs;
+
+  // Extract inputs and outputs of the graph.
+  void Build();
+
+  // Output a DOT graph file for debug.
+  std::string DotString() const;
+};
+
+/*
+ * An graph trait help to traverse the graph using BFS.
+ * The BFS start from a graph's inputs, the graph should be fully-connected, so
+ * that the iterator can reach the end.
+ */
+template <>
+struct GraphTraits<DataFlowGraph> {
+  // BFS iterator on nodes.
+  struct NodesBFSIterator
+      : public std::iterator<std::forward_iterator_tag, Node *> {
+    NodesBFSIterator() = default;
+    explicit NodesBFSIterator(const std::vector<Node *> &source);
+    // NodesBFSIterator(NodesBFSIterator &&other) noexcept;
+    // NOTE Heavy to use.
+    NodesBFSIterator(const NodesBFSIterator &other);
+
+    Node &operator*();
+    NodesBFSIterator &operator++();
+    Node *operator->();
+    // TODO(Superjomn) current implementation just compare the first
+    // element, need to compare the graph and all the elements in the queue and
+    // set.
+    NodesBFSIterator &operator=(const NodesBFSIterator &other);
+    bool operator==(const NodesBFSIterator &other);
+    bool operator!=(const NodesBFSIterator &other) { return !(*this == other); }
+
+   private:
+    std::deque<Node *> queue_;
+    std::unordered_set<Node *> visited_;
+  };
+
+  // DFS iterator on nodes.
+  struct NodesDFSIterator
+      : public std::iterator<std::forward_iterator_tag, Node *> {
+    NodesDFSIterator() = default;
+    explicit NodesDFSIterator(const std::vector<Node *> &source);
+    // NodesDFSIterator(NodesDFSIterator &&other) noexcept;
+    NodesDFSIterator(const NodesDFSIterator &other);
+
+    Node &operator*();
+    NodesDFSIterator &operator++();
+    // TODO(Superjomn) current implementation just compare the first
+    // element, need to compare the graph and all the elements in the queue and
+    // set.
+    NodesDFSIterator &operator=(const NodesDFSIterator &other);
+    bool operator==(const NodesDFSIterator &other);
+    bool operator!=(const NodesDFSIterator &other) { return !(*this == other); }
+    Node *operator->();
+
+   private:
+    std::stack<Node *> stack_;
+    std::unordered_set<Node *> visited_;
+  };
+
+  explicit GraphTraits(DataFlowGraph *graph) : graph_(graph) {}
+
+  // default use BFS to visit the nodes.
+  iterator_range<NodesBFSIterator> nodes() {
+    return iterator_range<NodesBFSIterator>(nodes_bfs_begin(), nodes_bfs_end());
+  }
+  iterator_range<NodesBFSIterator> nodes_in_BFS() {
+    return iterator_range<NodesBFSIterator>(nodes_bfs_begin(), nodes_bfs_end());
+  }
+  iterator_range<NodesDFSIterator> nodes_in_DFS() {
+    return iterator_range<NodesDFSIterator>(nodes_dfs_begin(), nodes_dfs_end());
+  }
+
+ private:
+  NodesBFSIterator nodes_bfs_begin() {
+    return NodesBFSIterator(graph_->inputs);
+  }
+  NodesBFSIterator nodes_bfs_end() { return NodesBFSIterator(); }
+  NodesDFSIterator nodes_dfs_begin() {
+    return NodesDFSIterator(graph_->inputs);
+  }
+  NodesDFSIterator nodes_dfs_end() { return NodesDFSIterator(); }
+
+ private:
+  DataFlowGraph *graph_;
+};
+
+// Extract the inputs and outputs of a graph. The inputs and outputs of a
+// sub-graph is the inputs nodes and output nodes that doesn't inside the
+// sub-graph.
+std::pair<
+    std::vector<Node *>,
+    std::vector<
+        Node *>> static ExtractInputAndOutputOfSubGraph(std::vector<Node *>
+                                                            &graph) {
+  std::unordered_set<Node *> nodes(graph.begin(), graph.end());
+  std::unordered_set<Node *> inputs;
+  std::unordered_set<Node *> outputs;
+  for (auto &node : graph) {
+    for (auto *in : node->inlinks) {
+      if (!nodes.count(in) && in->type() == Node::Type::kValue) {
+        inputs.insert(in);
+      }
+    }
+    for (auto *out : node->outlinks) {
+      if (!nodes.count(out) && out->type() == Node::Type::kValue) {
+        outputs.insert(out);
+      }
+    }
+  }
+  return std::make_pair(std::vector<Node *>(inputs.begin(), inputs.end()),
+                        std::vector<Node *>(outputs.begin(), outputs.end()));
+}
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/data_flow_graph_tester.cc b/paddle/fluid/inference/analysis/data_flow_graph_tester.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9d7cceeb65888b8ba3fdf39e88fc2877abd82d11
--- /dev/null
+++ b/paddle/fluid/inference/analysis/data_flow_graph_tester.cc
@@ -0,0 +1,62 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/inference/analysis/data_flow_graph.h"
+#include "paddle/fluid/inference/analysis/ut_helper.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+TEST(DataFlowGraph, BFS) {
+  auto desc = LoadProgramDesc();
+  auto dfg = ProgramDescToDFG(desc);
+  dfg.Build();
+
+  for (auto* in : dfg.inputs) {
+    LOG(INFO) << "inputs: " << in->name() << " "
+              << static_cast<int>(in->type());
+  }
+  for (auto* out : dfg.outputs) {
+    LOG(INFO) << "outputs: " << out->name() << " "
+              << static_cast<int>(out->type());
+  }
+
+  GraphTraits<DataFlowGraph> trait(&dfg);
+  auto nodes = trait.nodes();
+  size_t count = 0;
+  for (auto it = nodes.begin(); it != nodes.end(); ++it) {
+    LOG(INFO) << "visiting " << it->name();
+    ++count;
+  }
+  ASSERT_EQ(count, dfg.nodes.size());
+}
+
+TEST(DataFlowGraph, DFS) {
+  auto desc = LoadProgramDesc();
+  auto dfg = ProgramDescToDFG(desc);
+  dfg.Build();
+  GraphTraits<DataFlowGraph> trait(&dfg);
+  auto nodes = trait.nodes_in_DFS();
+  size_t count = 0;
+  for (auto it = nodes.begin(); it != nodes.end(); ++it) {
+    LOG(INFO) << "visiting " << it->name();
+    ++count;
+  }
+  ASSERT_EQ(count, dfg.nodes.size());
+}
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass_tester.cc b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass_tester.cc
new file mode 100644
index 0000000000000000000000000000000000000000..dcee75cee50ede1d2b660e88e06544440bd5ef77
--- /dev/null
+++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass_tester.cc
@@ -0,0 +1,49 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h"
+
+#include <glog/logging.h>
+#include <google/protobuf/text_format.h>
+#include <gtest/gtest.h>
+#include "paddle/fluid/framework/executor.h"
+#include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h"
+#include "paddle/fluid/inference/analysis/ut_helper.h"
+#include "paddle/fluid/inference/io.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+TEST_F(DFG_Tester, Test) {
+  framework::proto::ProgramDesc new_desc;
+  DataFlowGraph graph;
+
+  FluidToDataFlowGraphPass pass0;
+  DataFlowGraphToFluidPass pass1;
+  pass0.Initialize(desc);
+  pass1.Initialize(&new_desc);
+
+  pass0.Run(&graph);
+  pass1.Run(&graph);
+
+  pass0.Finalize();
+  pass1.Finalize();
+
+  LOG(INFO) << graph.nodes.size();
+}
+
+};  // namespace analysis
+};  // namespace inference
+};  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/device.h b/paddle/fluid/inference/analysis/device.h
new file mode 100644
index 0000000000000000000000000000000000000000..585c9923291e5f9cb6e50dbc4bcd28c374191048
--- /dev/null
+++ b/paddle/fluid/inference/analysis/device.h
@@ -0,0 +1,24 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+enum class Device { CPU, GPU };
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h
new file mode 100644
index 0000000000000000000000000000000000000000..41d4475382befa1bdaf7473520d64005a472a459
--- /dev/null
+++ b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h
@@ -0,0 +1,68 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+/*
+ * This file create an DFG_GraphvizDrawPass which helps to draw a data flow
+ * graph's structure using graphviz.
+ */
+
+#pragma once
+
+#include <fstream>
+#include <string>
+#include "paddle/fluid/inference/analysis/pass.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+/*
+ * Output a dot file and write to some place.
+ */
+class DFG_GraphvizDrawPass : public DataFlowGraphPass {
+ public:
+  DFG_GraphvizDrawPass(const std::string& dir, const std::string& id)
+      : dir_(dir), id_(id) {}
+
+  bool Initialize() override { return Pass::Initialize(); }
+  void Run(DataFlowGraph* graph) override {
+    auto content = Draw(graph);
+    std::ofstream file(GenDotPath());
+    file.write(content.c_str(), content.size());
+    file.close();
+    LOG(INFO) << "draw dot to " << GenDotPath();
+  }
+
+  bool Finalize() override { return Pass::Finalize(); }
+
+  Pass* CreatePrinterPass(std::ostream& os,
+                          const std::string& banner) const override {
+    return nullptr;
+  }
+
+ private:
+  // Path of the dot file to output.
+  std::string GenDotPath() const {
+    return dir_ + "/" + "graph_" + id_ + ".dot";
+  }
+
+  std::string Draw(DataFlowGraph* graph) { return graph->DotString(); }
+
+  std::string dir_;
+  std::string id_;
+};
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3fc1cc18b855440c54c1ed6a9ab49a104c8c21f0
--- /dev/null
+++ b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc
@@ -0,0 +1,46 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h"
+
+#include <gtest/gtest.h>
+#include <fstream>
+#include <string>
+#include "paddle/fluid/inference/analysis/ut_helper.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+TEST_F(DFG_Tester, dfg_graphviz_draw_pass_tester) {
+  auto dfg = ProgramDescToDFG(desc);
+  DFG_GraphvizDrawPass pass("./", "test");
+  pass.Initialize();
+  pass.Run(&dfg);
+
+  // test content
+  std::ifstream file("./graph_test.dot");
+  ASSERT_TRUE(file.is_open());
+
+  std::string line;
+  int no{0};
+  while (std::getline(file, line)) {
+    no++;
+  }
+  ASSERT_EQ(no, 82);
+}
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/dot.cc b/paddle/fluid/inference/analysis/dot.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d5471ffcb594a6915e9e65c0fee5adc5f5bdf40c
--- /dev/null
+++ b/paddle/fluid/inference/analysis/dot.cc
@@ -0,0 +1,23 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/analysis/dot.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+size_t Dot::counter = 0;
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/dot.h b/paddle/fluid/inference/analysis/dot.h
new file mode 100644
index 0000000000000000000000000000000000000000..4bf1840fdda8508b52d7274a338c5b1c95baf354
--- /dev/null
+++ b/paddle/fluid/inference/analysis/dot.h
@@ -0,0 +1,155 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/*
+ * This file implements some helper classes and methods for DOT programming
+ * support. It will give a visualization of the graph and that helps to debug
+ * the logics of each Pass.
+ */
+#pragma once
+
+#include <glog/logging.h>
+#include <sstream>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+/*
+ * A Dot template that helps to build a DOT graph definition.
+ */
+class Dot {
+ public:
+  static size_t counter;
+
+  struct Attr {
+    std::string key;
+    std::string value;
+
+    Attr(const std::string& key, const std::string& value)
+        : key(key), value(value) {}
+
+    std::string repr() const {
+      std::stringstream ss;
+      ss << key << "=" << '"' << value << '"';
+      return ss.str();
+    }
+  };
+
+  struct Node {
+    std::string name;
+    std::vector<Attr> attrs;
+
+    Node(const std::string& name, const std::vector<Attr>& attrs)
+        : name(name),
+          attrs(attrs),
+          id_("node_" + std::to_string(Dot::counter++)) {}
+
+    std::string id() const { return id_; }
+
+    std::string repr() const {
+      std::stringstream ss;
+      CHECK(!name.empty());
+      ss << id_;
+      for (size_t i = 0; i < attrs.size(); i++) {
+        if (i == 0) {
+          ss << "[label=" << '"' << name << '"' << " ";
+        }
+        ss << attrs[i].repr();
+        ss << ((i < attrs.size() - 1) ? " " : "]");
+      }
+      return ss.str();
+    }
+
+   private:
+    std::string id_;
+  };
+
+  struct Edge {
+    std::string source;
+    std::string target;
+    std::vector<Attr> attrs;
+
+    Edge(const std::string& source, const std::string& target,
+         const std::vector<Attr>& attrs)
+        : source(source), target(target), attrs(attrs) {}
+
+    std::string repr() const {
+      std::stringstream ss;
+      CHECK(!source.empty());
+      CHECK(!target.empty());
+      ss << source << "->" << target;
+      for (size_t i = 0; i < attrs.size(); i++) {
+        if (i == 0) {
+          ss << "[";
+        }
+        ss << attrs[i].repr();
+        ss << ((i < attrs.size() - 1) ? " " : "]");
+      }
+      return ss.str();
+    }
+  };
+
+  Dot() = default;
+
+  explicit Dot(const std::vector<Attr>& attrs) : attrs_(attrs) {}
+
+  void AddNode(const std::string& name, const std::vector<Attr>& attrs) {
+    CHECK(!nodes_.count(name)) << "duplicate Node '" << name << "'";
+    nodes_.emplace(name, Node{name, attrs});
+  }
+
+  void AddEdge(const std::string& source, const std::string& target,
+               const std::vector<Attr>& attrs) {
+    CHECK(!source.empty());
+    CHECK(!target.empty());
+    auto sid = nodes_.at(source).id();
+    auto tid = nodes_.at(target).id();
+    edges_.emplace_back(sid, tid, attrs);
+  }
+
+  // Compile to DOT language codes.
+  std::string Build() const {
+    std::stringstream ss;
+    const std::string indent = "   ";
+    ss << "digraph G {" << '\n';
+
+    // Add graph attrs
+    for (const auto& attr : attrs_) {
+      ss << indent << attr.repr() << '\n';
+    }
+    // add nodes
+    for (auto& item : nodes_) {
+      ss << indent << item.second.repr() << '\n';
+    }
+    // add edges
+    for (auto& edge : edges_) {
+      ss << indent << edge.repr() << '\n';
+    }
+    ss << "} // end G";
+    return ss.str();
+  }
+
+ private:
+  std::unordered_map<std::string, Node> nodes_;
+  std::vector<Edge> edges_;
+  std::vector<Attr> attrs_;
+};
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/dot_tester.cc b/paddle/fluid/inference/analysis/dot_tester.cc
new file mode 100644
index 0000000000000000000000000000000000000000..56ceb9bd5d6f41a601d66f6124fb7b4099c9337e
--- /dev/null
+++ b/paddle/fluid/inference/analysis/dot_tester.cc
@@ -0,0 +1,62 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/analysis/dot.h"
+
+#include <gtest/gtest.h>
+#include <memory>
+#include "paddle/fluid/inference/analysis/data_flow_graph.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+class DotTester : public ::testing::Test {
+ protected:
+  void SetUp() override {
+    std::vector<Dot::Attr> attrs({{"title", "hello"}});
+    dot.reset(new Dot(attrs));
+    dot->AddNode("a", {Dot::Attr{"shape", "box"}, Dot::Attr("color", "blue")});
+    dot->AddNode("b", {});
+    dot->AddNode("c", {});
+    dot->AddEdge("a", "b", {});
+    dot->AddEdge("b", "c", {});
+    dot->AddEdge("a", "c", {});
+  }
+
+  std::unique_ptr<Dot> dot;
+};
+
+TEST_F(DotTester, Build) {
+  auto codes = dot->Build();
+  // Output the DOT language code, the generated codes are too long to compare
+  // the string.
+  //
+  // The output is
+  //
+  // digraph G {
+  //   title="hello"
+  //   node_1
+  //   node_2
+  //   node_0[label="a" shape="box" color="blue"]
+  //   node_0->node_1
+  //   node_1->node_2
+  //   node_0->node_2
+  // } // end G
+  LOG(INFO) << '\n' << codes;
+}
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9f67c989cca4a936cd320b73efaae277263fb3e2
--- /dev/null
+++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc
@@ -0,0 +1,85 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+FluidToDataFlowGraphPass::FluidToDataFlowGraphPass() {}
+
+bool FluidToDataFlowGraphPass::Initialize() { return Pass::Initialize(); }
+
+bool FluidToDataFlowGraphPass::Initialize(
+    const framework::proto::ProgramDesc &desc) {
+  desc_ = &desc;
+  return true;
+}
+
+bool FluidToDataFlowGraphPass::Finalize() { return Pass::Finalize(); }
+
+void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) {
+  // insert vars
+  std::unordered_map<std::string, size_t> var2id;
+  auto &main_block = desc_->blocks(framework::kRootBlockIndex);
+  for (int i = 0; i < main_block.vars_size(); i++) {
+    const auto &var = main_block.vars(i);
+    auto *v = graph->nodes.Create(Node::Type::kValue);
+    v->SetName(var.name());
+    v->SetExtraInfo(const_cast<void *>(static_cast<const void *>(&var)));
+    var2id[var.name()] = v->id();
+  }
+  for (int i = 0; i < main_block.ops_size(); i++) {
+    const auto &op = main_block.ops(i);
+    auto *o = graph->nodes.Create(Node::Type::kFunction);
+    o->SetName(op.type());
+    static_cast<Function *>(o)->SetFuncType(op.type());
+    // Link to the original protobuf message's memory, make it easier to
+    // generate from a data flow graph to fluid ProgramDesc.
+    o->SetExtraInfo(const_cast<void *>(static_cast<const void *>(&op)));
+    // set inputs and outputs
+    // TODO(Superjomn) make sure the InputNames is the real variable name.
+    for (int j = 0; j < op.inputs_size(); j++) {
+      auto &in_var = op.inputs(j);
+      for (int k = 0; k < in_var.arguments_size(); k++) {
+        auto *in = graph->nodes.GetMutable(var2id.at(in_var.arguments(k)));
+        in->outlinks.push_back(o);
+        o->inlinks.push_back(in);
+      }
+    }
+    for (int j = 0; j < op.outputs_size(); j++) {
+      auto &out_var = op.outputs(j);
+      for (int k = 0; k < out_var.arguments_size(); k++) {
+        auto *out = graph->nodes.GetMutable(var2id[out_var.arguments(k)]);
+        out->inlinks.push_back(o);
+        o->outlinks.push_back(out);
+      }
+    }
+  }
+  // Analysis and extract the inputs and outputs of this graph.
+  graph->Build();
+}
+
+Pass *FluidToDataFlowGraphPass::CreatePrinterPass(
+    std::ostream &os, const std::string &banner) const {
+  return nullptr;
+}
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h
new file mode 100644
index 0000000000000000000000000000000000000000..33517e57becdffc0416f204247eac5feadb7ed82
--- /dev/null
+++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h
@@ -0,0 +1,53 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+/*
+ * This file implements the transformation from data flow graph to fluid
+ * ProgramDesc.
+ */
+
+#pragma once
+
+#include <string>
+
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/inference/analysis/data_flow_graph.h"
+#include "paddle/fluid/inference/analysis/pass.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+/*
+ * Transform a FluidDesc to a data flow graph.
+ */
+class FluidToDataFlowGraphPass final : public DataFlowGraphPass {
+ public:
+  FluidToDataFlowGraphPass();
+  bool Initialize() override;
+  bool Initialize(const framework::proto::ProgramDesc &desc) override;
+  bool Finalize() override;
+
+  void Run(DataFlowGraph *graph) override;
+
+  Pass *CreatePrinterPass(std::ostream &os,
+                          const std::string &banner) const override;
+
+ private:
+  framework::proto::ProgramDesc const *desc_;
+};
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc
new file mode 100644
index 0000000000000000000000000000000000000000..817d32c92cdbdc234eef9ed5156891c2b11ced4c
--- /dev/null
+++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc
@@ -0,0 +1,37 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h"
+
+#include <gtest/gtest.h>
+#include "paddle/fluid/inference/analysis/ut_helper.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+TEST_F(DFG_Tester, Init) {
+  FluidToDataFlowGraphPass pass;
+  pass.Initialize();
+  pass.Initialize(desc);
+  DataFlowGraph graph;
+  pass.Run(&graph);
+  ASSERT_GT(graph.nodes.size(), 0);
+  pass.Finalize();
+  LOG(INFO) << '\n' << graph.DotString();
+}
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/graph_traits.cc b/paddle/fluid/inference/analysis/graph_traits.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2ea70a1d2060e03769d67060dc6f008207342b52
--- /dev/null
+++ b/paddle/fluid/inference/analysis/graph_traits.cc
@@ -0,0 +1,15 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/inference/analysis/graph_traits.h"
diff --git a/paddle/fluid/inference/analysis/graph_traits.h b/paddle/fluid/inference/analysis/graph_traits.h
new file mode 100644
index 0000000000000000000000000000000000000000..aed2b1e8e27d94b430201d70ecf09d4acc33c8fa
--- /dev/null
+++ b/paddle/fluid/inference/analysis/graph_traits.h
@@ -0,0 +1,63 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+/*
+ * This file defines the GraphTraits<X> template class that should be specified
+ * by classes that want to be iteratable by generic graph iterators.
+ *
+ * This file also defines the marker class Inverse that is used to iterate over
+ * graphs in a graph defined, inverse ordering...
+ */
+
+#pragma once
+
+#include "paddle/fluid/inference/analysis/helper.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+/*
+ * This class should be specialized by different graph types...
+ * That's why the base class is empty.
+ */
+template <typename GraphType>
+struct GraphTraits {
+  // using NodesBFSIterator = xxx
+
+  // NodesBFSIterator nodes_begin();
+  // NodesBFSIterator nodes_end();
+};
+
+/*
+ * Inverse - This class is used as a marker class to tell the graph iterator to
+ * iterate in a graph defined Inverse order.
+ */
+template <typename GraphType>
+struct Inverse {
+  const GraphType &graph;
+
+  explicit Inverse(const GraphType &graph) : graph(graph) {}
+};
+
+/*
+ * Provide a partial specialization of GraphTraits so that the inverse of an
+ * inverse turns into the original graph.
+ */
+template <typename GraphType>
+struct GraphTraits<Inverse<Inverse<GraphType>>> : GraphTraits<GraphType> {};
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/helper.h b/paddle/fluid/inference/analysis/helper.h
new file mode 100644
index 0000000000000000000000000000000000000000..58eb0e715cb71d87179f3240de55021603cd7423
--- /dev/null
+++ b/paddle/fluid/inference/analysis/helper.h
@@ -0,0 +1,125 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/variable.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+template <typename Vec>
+int AccuDims(Vec &&vec, int size) {
+  int res = 1;
+  for (int i = 0; i < size; i++) {
+    res *= std::forward<Vec>(vec)[i];
+  }
+  return res;
+}
+
+#define SET_TYPE(type__) dic_[typeid(type__).hash_code()] = #type__;
+/*
+ * Map typeid to representation.
+ */
+struct DataTypeNamer {
+  static const DataTypeNamer &Global() {
+    static auto *x = new DataTypeNamer();
+    return *x;
+  }
+
+  template <typename T>
+  const std::string &repr() const {
+    auto x = typeid(T).hash_code();
+    PADDLE_ENFORCE(dic_.count(x), "unknown type for representation");
+    return dic_.at(x);
+  }
+
+  const std::string &repr(size_t &hash) const {  // NOLINT
+    PADDLE_ENFORCE(dic_.count(hash), "unknown type for representation");
+    return dic_.at(hash);
+  }
+
+ private:
+  DataTypeNamer() {
+    SET_TYPE(int);
+    SET_TYPE(bool);
+    SET_TYPE(float);
+  }
+
+  std::unordered_map<decltype(typeid(int).hash_code()),  // NOLINT
+                     std::string>
+      dic_;
+};
+#undef SET_TYPE
+
+template <typename IteratorT>
+class iterator_range {
+  IteratorT begin_, end_;
+
+ public:
+  template <typename Container>
+  explicit iterator_range(Container &&c) : begin_(c.begin()), end_(c.end()) {}
+
+  iterator_range(const IteratorT &begin, const IteratorT &end)
+      : begin_(begin), end_(end) {}
+
+  const IteratorT &begin() const { return begin_; }
+  const IteratorT &end() const { return end_; }
+};
+
+/*
+ * An registry helper class, with its records keeps the order they registers.
+ */
+template <typename T>
+class OrderedRegistry {
+ public:
+  T *Register(const std::string &name, T *x) {
+    PADDLE_ENFORCE(!dic_.count(name));
+    dic_[name] = data_.size();
+    data_.emplace_back(std::unique_ptr<T>(x));
+    return data_.back().get();
+  }
+
+  T *Lookup(const std::string &name) {
+    auto it = dic_.find(name);
+    if (it == dic_.end()) return nullptr;
+    return data_[it->second].get();
+  }
+
+ protected:
+  std::unordered_map<std::string, int> dic_;
+  std::vector<std::unique_ptr<T>> data_;
+};
+
+template <typename T>
+T &GetFromScope(const framework::Scope &scope, const std::string &name) {
+  framework::Variable *var = scope.FindVar(name);
+  PADDLE_ENFORCE(var != nullptr);
+  return *var->GetMutable<T>();
+}
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
+
+#define PADDLE_DISALLOW_COPY_AND_ASSIGN(type__) \
+  type__(const type__ &) = delete;              \
+  void operator=(const type__ &) = delete;
diff --git a/paddle/fluid/inference/analysis/node.cc b/paddle/fluid/inference/analysis/node.cc
new file mode 100644
index 0000000000000000000000000000000000000000..fe060526080b1ee01aa98f2ff06fb2191eddf9da
--- /dev/null
+++ b/paddle/fluid/inference/analysis/node.cc
@@ -0,0 +1,67 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/inference/analysis/node.h"
+#include "glog/logging.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+std::vector<Dot::Attr> Value::dot_attrs() const {
+  return std::vector<Dot::Attr>({Dot::Attr("style", "filled,rounded"),
+                                 Dot::Attr("shape", "box"),
+                                 Dot::Attr("fillcolor", "red")});
+}
+
+std::vector<Dot::Attr> Function::dot_attrs() const {
+  return std::vector<Dot::Attr>({Dot::Attr("style", "filled,rounded"),
+                                 Dot::Attr("shape", "diamond"),
+                                 Dot::Attr("fillcolor", "yellow")});
+}
+
+Node *NodeMap::Create(Node::Type type) {
+  switch (type) {
+    case Node::Type::kFunction:
+      nodes_.emplace_back(new Function);
+      break;
+    case Node::Type::kValue:
+      nodes_.emplace_back(new Value);
+      break;
+    default:
+      PADDLE_THROW("Not supported node type.");
+  }
+  nodes_.back()->id_ = size() - 1;
+  return nodes_.back().get();
+}
+
+Node *NodeMap::GetMutable(size_t id) {
+  PADDLE_ENFORCE_GT(size(), id);
+  return nodes_[id].get();
+}
+
+const Node &NodeMap::Get(size_t id) const {
+  PADDLE_ENFORCE_GT(size(), id);
+  return *nodes_[id].get();
+}
+
+void NodeMap::Delete(size_t id) {
+  PADDLE_ENFORCE_LT(id, size());
+  nodes_[id]->SetDeleted();
+}
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/node.h b/paddle/fluid/inference/analysis/node.h
new file mode 100644
index 0000000000000000000000000000000000000000..7972ca25c92186a8c55a76de645f4fdbb089e8d3
--- /dev/null
+++ b/paddle/fluid/inference/analysis/node.h
@@ -0,0 +1,242 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+/*
+ * This file defines the Node class and its subclasses. A Node is the basis
+ * analysis element in a computation graph.
+ * There are basically two kinds of nodes, the function node and value node.
+ */
+#pragma once
+
+#include <limits>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "paddle/fluid/inference/analysis/device.h"
+#include "paddle/fluid/inference/analysis/dot.h"
+#include "paddle/fluid/inference/analysis/helper.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+class NodeMap;
+
+/*
+ * Node Representation.
+ *
+ * This is a very important class for analysis. It is the base class of all
+ * nodes computed by a program that may be used as operands to other nodes.
+ * Node is the super class of other important classes such as Function and
+ * Value, some nodes can have a name.
+ */
+class Node {
+ public:
+  // Node type. NOTE the new node types should add here.
+  enum class Type { kNone = -1, kFunction, kValue, kFunctionBlock };
+
+  Node() = default;
+
+  struct Attr;
+
+  // Cast to a subclass type, Function for example.
+  template <typename Subclass>
+  Subclass &As() {
+    return *dynamic_cast<Subclass *>(this);
+  }
+
+  // Formatted representation of this Node.
+  virtual std::string repr() const {
+    return name() + "(" + std::to_string(id()) + ")";
+  }
+
+  // DOT node representation. One Node type can customize its own node
+  // representation.
+  virtual std::vector<Dot::Attr> dot_attrs() const {
+    return std::vector<Dot::Attr>({Dot::Attr("style", "filled")});
+  }
+
+  // Get an additional attribute and convert it to T data type. NOTE this will
+  // silently create a new attribute if not exists.
+  Attr &attr(const std::string &name) { return attrs_[name]; }
+
+  int id() const { return id_; }
+
+  bool deleted() const { return deleted_; }
+  void SetDeleted() { deleted_ = true; }
+
+  void SetName(const std::string &name) { name_ = name; }
+  const std::string &name() const { return name_; }
+
+  void SetType(Type type) { type_ = type; }
+  Type type() const { return type_; }
+
+  void *extra_info() const { return extra_info_; }
+  void SetExtraInfo(void *extra_info) { extra_info_ = extra_info; }
+
+  // Input links.
+  std::vector<Node *> inlinks;
+  // Output links.
+  std::vector<Node *> outlinks;
+
+  // A helper class to maintain the status from Pass.
+  // TODO(superjomn) add a checker here to ensure the T is primary.
+  struct Attr {
+    // NOTE T should be a primary type or a struct combined by several primary
+    // types.
+    // NOTE the STL containers should not use here.
+    // Some usages
+    // Attr attr;
+    // T data;
+    // attr.data.assign((char*)data, sizeof(data));
+
+    bool &Bool() { return As<bool>(); }
+    float &Float() { return As<float>(); }
+    int32_t &Int32() { return As<int32_t>(); }
+    int64_t &Int64() { return As<int64_t>(); }
+
+   private:
+    template <typename T>
+    T &As() {
+      // init storage in the first usage.
+      if (data_.empty()) {
+        VLOG(4) << "resize data to " << sizeof(T);
+        type_hash_ = typeid(T).hash_code();
+        data_.resize(sizeof(T));
+      }
+      PADDLE_ENFORCE(type_hash_ == typeid(T).hash_code(),
+                     "type not matched, origin is %s, want %s",
+                     DataTypeNamer::Global().repr(type_hash_),
+                     DataTypeNamer::Global().repr<T>());
+      PADDLE_ENFORCE_EQ(data_.size(), sizeof(T), "Node attr type recast error");
+      return *reinterpret_cast<T *>(&data_[0]);
+    }
+
+   private:
+    std::string data_;
+    size_t type_hash_{std::numeric_limits<size_t>::max()};
+  };
+
+  bool IsFunction() const { return type_ == Node::Type::kFunction; }
+  bool IsValue() const { return type_ == Node::Type::kValue; }
+  bool IsFunctionBlock() const { return type_ == Node::Type::kFunctionBlock; }
+
+  virtual ~Node() {}
+
+  friend class NodeMap;
+
+  PADDLE_DISALLOW_COPY_AND_ASSIGN(Node);
+
+ protected:
+  // The id number not the name is a node's unique identifier in the computation
+  // graph.
+  int id_{-1};
+  std::string name_;
+  Type type_{Type::kNone};
+  // Mark this node is deleted by some pass.
+  bool deleted_{false};
+
+  void *extra_info_;
+
+  mutable std::unordered_map<std::string, Attr> attrs_;
+};
+
+class Function;
+/*
+ * Value represents a value node, it has some attributes including dims, data
+ * type and so on.
+ */
+class Value : public Node {
+ public:
+  enum class DataType { kInt32, kInt64, kFloat32, kFloat64 };
+  using Dims = std::vector<int>;
+
+  void SetDataType(DataType data_type) { data_type_ = data_type; }
+  DataType data_type() const { return data_type_; }
+
+  void SetDims(const Dims &dims) { dims_ = dims; }
+  const Dims &dims() const { return dims_; }
+
+  Device device() const { return device_; }
+  void SetDevice(Device device) { device_ = device; }
+
+  std::vector<Dot::Attr> dot_attrs() const override;
+
+  PADDLE_DISALLOW_COPY_AND_ASSIGN(Value);
+
+ protected:
+  Value() { SetType(Node::Type::kValue); }
+  friend class NodeMap;
+
+ private:
+  DataType data_type_;
+  Dims dims_;
+  Device device_;
+};
+
+/*
+ * Function represents any kind of executable concepts that takes several Values
+ * as input, and outputs several Values.
+ */
+class Function : public Node {
+ public:
+  std::vector<Dot::Attr> dot_attrs() const override;
+
+  // Get the operator's type from Desc.
+  const std::string &func_type() const { return func_type_; }
+  // Set the operator's type.
+  void SetFuncType(const std::string &func_type) { func_type_ = func_type; }
+
+  PADDLE_DISALLOW_COPY_AND_ASSIGN(Function);
+
+ protected:
+  std::string func_type_;
+  Function() { SetType(Node::Type::kFunction); }
+  friend class NodeMap;
+};
+
+/*
+ * FunctionBlock is a Node that contains a sub-graph multiple Node.
+ */
+struct FunctionBlock : public Node {
+  std::string repr() const override { return "block-" + std::to_string(id()); }
+  std::vector<Node *> subgraph;
+};
+
+class NodeMap {
+ public:
+  // Create a new node with type.
+  Node *Create(Node::Type type);
+
+  // Get a node by its id.
+  Node *GetMutable(size_t id);
+
+  const Node &Get(size_t id) const;
+
+  void Delete(size_t id);
+
+  const std::vector<std::unique_ptr<Node>> &nodes() { return nodes_; }
+
+  size_t size() const { return nodes_.size(); }
+
+ private:
+  std::vector<std::unique_ptr<Node>> nodes_;
+  std::unordered_map<std::string, Node *> map_;
+};
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/node_tester.cc b/paddle/fluid/inference/analysis/node_tester.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ea832a3a7e47758be9b6bd59a4325ddb576ec446
--- /dev/null
+++ b/paddle/fluid/inference/analysis/node_tester.cc
@@ -0,0 +1,34 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/inference/analysis/node.h"
+
+#include <gtest/gtest.h>
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+TEST(Node, Attr) {
+  // Node is an abstract class, use Value instead for they share the same Attr
+  // logic.
+  NodeMap nodes;
+  auto* node = nodes.Create(Node::Type::kValue);
+  node->attr("v0").Int32() = 2008;
+  ASSERT_EQ(node->attr("v0").Int32(), 2008);
+}
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/pass.cc b/paddle/fluid/inference/analysis/pass.cc
new file mode 100644
index 0000000000000000000000000000000000000000..121b72c0a0aa9a0c568b04f7ee9a5bc5c1d6f5f8
--- /dev/null
+++ b/paddle/fluid/inference/analysis/pass.cc
@@ -0,0 +1,15 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/analysis/pass.h"
diff --git a/paddle/fluid/inference/analysis/pass.h b/paddle/fluid/inference/analysis/pass.h
new file mode 100644
index 0000000000000000000000000000000000000000..aa0e8667b5e4a9e6156c25fcad03bb8eee3287f6
--- /dev/null
+++ b/paddle/fluid/inference/analysis/pass.h
@@ -0,0 +1,91 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <glog/logging.h>
+#include <iosfwd>
+#include <string>
+
+#include "paddle/fluid/framework/framework.pb.h"
+#include "paddle/fluid/inference/analysis/data_flow_graph.h"
+#include "paddle/fluid/inference/analysis/helper.h"
+#include "paddle/fluid/inference/analysis/node.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+class Pass {
+ public:
+  Pass() = default;
+  virtual ~Pass() {}
+  // Virtual method overridden by subclasses to do only necessary initialization
+  // before any pass is run.
+  virtual bool Initialize() { return false; }
+  // There is some passes such as FlowToDataFlowGraphPass that needs a
+  // ProgramDesc. Here use the native ProgramDesc ProtoBuf message, so that it
+  // only couple with the proto file.
+  virtual bool Initialize(const framework::proto::ProgramDesc &desc) {
+    return false;
+  }
+  // There are some Passes such as DataFlowGraphToFluidPass that will output a
+  // ProgramDesc.
+  virtual bool Initialize(framework::proto::ProgramDesc *desc) { return false; }
+
+  // Virtual method overriden by subclasses to do any necessary clean up after
+  // all passes have run.
+  virtual bool Finalize() { return false; }
+
+  // Get a Pass appropriate to print the Node this pass operates on.
+  virtual Pass *CreatePrinterPass(std::ostream &os,
+                                  const std::string &banner) const = 0;
+
+  // Run on a single Node.
+  virtual void Run(Node *x) { LOG(FATAL) << "not valid"; }
+  // Run on a single Function.
+  virtual void Run(Function *x) { LOG(FATAL) << "not valid"; }
+  // Run on a single FunctionBlock.
+  virtual void Run(FunctionBlock *x) { LOG(FATAL) << "not valid"; }
+  // Run on a single DataFlowGraph.
+  virtual void Run(DataFlowGraph *x) { LOG(FATAL) << "not valid"; }
+};
+
+// NodePass process on any Node types.
+class NodePass : public Pass {
+ public:
+  virtual void Run(Node *node) = 0;
+};
+
+// NodePass process on any Function node types.
+class FunctionPass : public Pass {
+ public:
+  virtual void Run(Function *node) = 0;
+};
+
+// NodePass process on any FunctionBlock node types.
+class FunctionBlockPass : public Pass {
+ public:
+  virtual void Run(FunctionBlock *node) = 0;
+};
+
+// GraphPass processes on any GraphType.
+class DataFlowGraphPass : public Pass {
+ public:
+  virtual void Run(DataFlowGraph *graph) = 0;
+};
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/subgraph_splitter.cc b/paddle/fluid/inference/analysis/subgraph_splitter.cc
new file mode 100644
index 0000000000000000000000000000000000000000..43ccac96c84e987ad1f494af3e314c810fc1ffe3
--- /dev/null
+++ b/paddle/fluid/inference/analysis/subgraph_splitter.cc
@@ -0,0 +1,154 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/inference/analysis/subgraph_splitter.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+const char *SubGraphSplitter::kMarkerAttrName =
+    "_sub_graph_splitter_inside_sub_graph";
+
+std::vector<std::vector<Node *>> SubGraphSplitter::operator()() {
+  MarkNodesInsideSubGraph();
+  return ExtractSubGraphs();
+}
+
+// Mark the output variables inside a subgraph with the func.
+inline void MarkOutLinksInSubGraph(const Function *func) {
+  for (auto *var : func->outlinks) {
+    var->attr(SubGraphSplitter::kMarkerAttrName).Bool() = true;
+  }
+}
+
+void SubGraphSplitter::MarkNodesInsideSubGraph() {
+  for (auto &node : GraphTraits<DataFlowGraph>(graph_).nodes()) {
+    if (node_inside_subgraph_teller_(&node)) {
+      node.attr(kMarkerAttrName).Bool() = true;
+      if (node.type() == Node::Type::kFunction) {
+        // If a function is inside the sub-graph, mark all the output variables
+        // to be inside too, so that two marked functions will be inside a same
+        // sub-graph, lets take a example:  A_function->var->B_function, if
+        // A_function is marked, var should also be marked, so that B_function
+        // will be in the same sub-graph with A_function if B_function is
+        // marked.
+        MarkOutLinksInSubGraph(static_cast<const Function *>(&node));
+      }
+    }
+  }
+}
+
+const char *kUnionFindParent = "_sub_graph_splitter_union_find_parent_";
+
+// Use the Union Find(UF) algorithm to find fully connected sub-graphs, if node
+// a's output is node b, that is a and b is in the same sub-graph. The UF
+// algorithm will group them to the same cluster.
+using node_map_t = std::unordered_map<int, Node *>;
+// Find the ancestor id of a node.
+int UnionFindGetAncestor(const node_map_t &node_map, size_t id) {
+  int tmp = id;
+  do {
+    tmp = node_map.at(tmp)->attr(kUnionFindParent).Int32();
+  } while (node_map.at(tmp)->attr(kUnionFindParent).Int32() != tmp);
+  return tmp;
+}
+// Make this two node share the same ancestor.
+// TODO(Superjom) bad performance, make a balanced tree latter.
+void UnionFindCombine(const node_map_t &node_map, size_t a, size_t b) {
+  int a_ancestor = UnionFindGetAncestor(node_map, a);
+  int b_ancestor = UnionFindGetAncestor(node_map, b);
+  node_map.at(b_ancestor)->attr(kUnionFindParent).Int32() = a_ancestor;
+  node_map.at(a)->attr(kUnionFindParent).Int32() = a_ancestor;
+  node_map.at(b)->attr(kUnionFindParent).Int32() = a_ancestor;
+}
+
+std::vector<std::vector<Node *>> SubGraphSplitter::ExtractSubGraphs() {
+  std::vector<Node *> marked_nodes;
+  for (auto &node : GraphTraits<DataFlowGraph>(graph_).nodes()) {
+    if (node.attr(kMarkerAttrName).Bool()) {
+      marked_nodes.push_back(&node);
+    }
+  }
+  // extract sub-graphs in the marked node set, use Union Find algorithm.
+  node_map_t node_map;  // id to ptr
+  for (auto *n : marked_nodes) {
+    // n's parent == n.id means it is the ancestor
+    n->attr(kUnionFindParent).Int32() = n->id();
+    node_map[n->id()] = n;
+  }
+  std::unordered_set<Node *> visited;
+  for (auto *n : marked_nodes) {
+    for (auto *out : n->outlinks) {
+      if (node_map.count(out->id())) {
+        UnionFindCombine(node_map, n->id(), out->id());
+      }
+    }
+  }
+
+  std::unordered_map<int /*ancestor*/, std::vector<Node *>> clusters;
+  for (auto *n : marked_nodes) {
+    if (n->type() == Node::Type::kFunction) {
+      clusters[UnionFindGetAncestor(node_map,
+                                    n->attr(kUnionFindParent).Int32())]
+          .push_back(n);
+    }
+  }
+  std::vector<std::vector<Node *>> result;
+  std::for_each(clusters.begin(), clusters.end(),
+                [&](const decltype(clusters)::value_type &it) {
+                  result.push_back(it.second);
+                });
+
+  return result;
+}
+
+void SubGraphFuse::operator()() { ReplaceNodesWithSubGraphs(); }
+
+void SubGraphFuse::ReplaceNodesWithSubGraphs() {
+  auto subgraphs = SubGraphSplitter(graph_, node_inside_subgraph_teller_)();
+  for (auto &subgraph : subgraphs) {
+    // replace this sub-graph with the first node. Two steps: 1. Create a Block
+    // Node that contains this subgraph 2. Mark the nodes inside the sub-graph
+    // as deleted. 3. Replace the deleted node with the new Block Node.
+    auto *block_node = graph_->nodes.Create(Node::Type::kFunctionBlock);
+    auto io = ExtractInputAndOutputOfSubGraph(subgraph);
+    block_node->inlinks = std::move(io.first);
+    block_node->outlinks = std::move(io.second);
+    for (auto *node : subgraph) {
+      // TODO(Superjomn) need a unified mechanism to treat deleted node in each
+      // pass.
+      node->SetDeleted();
+    }
+
+    std::unordered_map<Node *, Node *>
+        delelte_node_map;  // deleted node to BlockNode
+    for (auto *n : block_node->inlinks) {
+      n->inlinks.clear();
+    }
+    for (auto *n : block_node->outlinks) {
+      n->outlinks.clear();
+    }
+    for (auto *n : block_node->inlinks) {
+      n->outlinks.push_back(block_node);
+    }
+    for (auto *n : block_node->outlinks) {
+      n->inlinks.push_back(n);
+    }
+  }
+}
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/subgraph_splitter.h b/paddle/fluid/inference/analysis/subgraph_splitter.h
new file mode 100644
index 0000000000000000000000000000000000000000..a31afbe6933da8d3c7a88142cc12d63b98b55796
--- /dev/null
+++ b/paddle/fluid/inference/analysis/subgraph_splitter.h
@@ -0,0 +1,83 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+/*
+ * This file defines the the class to partition a graph.
+ */
+
+#pragma once
+
+#include <vector>
+
+#include "paddle/fluid/inference/analysis/data_flow_graph.h"
+#include "paddle/fluid/inference/analysis/node.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+/*
+ * Detect the nodes in a sub-graph that meet some conditions. This class doesn't
+ * modify the graph.
+ */
+class SubGraphSplitter {
+ public:
+  static const char *kMarkerAttrName;
+  // Tell whether a node is inside a sub-graph.
+  using NodeInsideSubgraphTeller = std::function<bool(const Node *)>;
+
+  SubGraphSplitter(DataFlowGraph *graph, const NodeInsideSubgraphTeller &teller)
+      : graph_(graph), node_inside_subgraph_teller_(teller) {}
+
+  std::vector<std::vector<Node *>> operator()();
+
+ protected:
+  // Mark the nodes inside the accepted sub-graph using
+  // node_inside_subgraph_teller.
+  void MarkNodesInsideSubGraph();
+
+  // Merge the marked nodes into sub-graphs and return the sub-graphs.
+  std::vector<std::vector<Node *>> ExtractSubGraphs();
+
+ private:
+  DataFlowGraph *graph_;
+  NodeInsideSubgraphTeller node_inside_subgraph_teller_;
+};
+
+/*
+ * SubGraphFuse - Replace some nodes with the sub-graph node they are inside. To
+ * some extent, the TensorRT engine is just a fusion op for a model.
+ */
+class SubGraphFuse {
+ public:
+  using NodeInsideSubgraphTeller = SubGraphSplitter::NodeInsideSubgraphTeller;
+
+  SubGraphFuse(DataFlowGraph *graph, const NodeInsideSubgraphTeller &teller)
+      : graph_(graph), node_inside_subgraph_teller_(teller) {}
+
+  // The main method which run all the logic.
+  void operator()();
+
+ protected:
+  // Remove the nodes inside sub-graphs and replace with the SubGraphNode.
+  void ReplaceNodesWithSubGraphs();
+
+ private:
+  DataFlowGraph *graph_;
+  NodeInsideSubgraphTeller node_inside_subgraph_teller_;
+};
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc b/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0644c0db12e3daabba76dbaad33847f5624b157a
--- /dev/null
+++ b/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc
@@ -0,0 +1,67 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/inference/analysis/subgraph_splitter.h"
+#include "paddle/fluid/inference/analysis/ut_helper.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+TEST_F(DFG_Tester, Split) {
+  auto desc = LoadProgramDesc();
+  auto dfg = ProgramDescToDFG(desc);
+  LOG(INFO) << "spliter\n" << dfg.DotString();
+
+  SubGraphSplitter::NodeInsideSubgraphTeller teller = [](const Node* node) {
+    if (node->type() != Node::Type::kFunction) return false;
+    const auto* func = static_cast<const Function*>(node);
+    if (func->func_type() == "elementwise_add" || func->func_type() == "relu" ||
+        func->func_type() == "conv2d" || func->func_type() == "mul" ||
+        func->func_type() == "sigmoid" || func->func_type() == "softmax") {
+      LOG(INFO) << "sub-graph marked " << node->repr();
+      return true;
+    }
+    return false;
+  };
+  ASSERT_GT(dfg.nodes.size(), 5UL);
+
+  auto subgraphs = SubGraphSplitter(&dfg, teller)();
+
+  // Check the number of the marked nodes.
+  int marked_nodes = 0;
+  for (auto& node : dfg.nodes.nodes()) {
+    if (node->IsFunction() &&
+        node->attr(SubGraphSplitter::kMarkerAttrName).Bool()) {
+      ++marked_nodes;
+    }
+  }
+  EXPECT_EQ(marked_nodes, 6);
+
+  // For human debug.
+  for (auto& subgraph : subgraphs) {
+    LOG(INFO) << "subgraph size " << subgraph.size();
+    for (auto* node : subgraph) {
+      LOG(INFO) << "node " << node->repr();
+    }
+  }
+
+  ASSERT_EQ(subgraphs.size(), 1UL);
+  // The last sub-graph has 5 Functions.
+  ASSERT_EQ(subgraphs.back().size(), 6UL);
+}
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/ut_helper.h b/paddle/fluid/inference/analysis/ut_helper.h
new file mode 100644
index 0000000000000000000000000000000000000000..722fa99a48a5f2b0e778904de0c35977d0ee3cc0
--- /dev/null
+++ b/paddle/fluid/inference/analysis/ut_helper.h
@@ -0,0 +1,59 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <gflags/gflags.h>
+#include <gtest/gtest.h>
+#include <string>
+#include "paddle/fluid/framework/executor.h"
+#include "paddle/fluid/inference/analysis/data_flow_graph.h"
+#include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h"
+#include "paddle/fluid/inference/analysis/ut_helper.h"
+#include "paddle/fluid/inference/io.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+DEFINE_string(inference_model_dir, "", "inference test model dir");
+
+static framework::proto::ProgramDesc LoadProgramDesc(
+    const std::string& model_dir = FLAGS_inference_model_dir) {
+  paddle::platform::CPUPlace place;
+  paddle::framework::Executor executor(place);
+  paddle::framework::Scope scope;
+  auto program = Load(&executor, &scope, model_dir);
+  return *program->Proto();
+}
+
+static DataFlowGraph ProgramDescToDFG(
+    const framework::proto::ProgramDesc& desc) {
+  DataFlowGraph graph;
+  FluidToDataFlowGraphPass pass;
+  pass.Initialize(desc);
+  pass.Run(&graph);
+  pass.Finalize();
+  return graph;
+}
+
+class DFG_Tester : public ::testing::Test {
+ protected:
+  void SetUp() override { desc = LoadProgramDesc(FLAGS_inference_model_dir); }
+
+  framework::proto::ProgramDesc desc;
+};
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/engine.h b/paddle/fluid/inference/engine.h
index 6b0ac92fa908427a89a6a5fa74dacc3b24abd1c3..ce2b8161715a3fa2278ce950dbac82c6d0042bef 100644
--- a/paddle/fluid/inference/engine.h
+++ b/paddle/fluid/inference/engine.h
@@ -14,11 +14,15 @@ limitations under the License. */
 
 #pragma once
 
+#include <string>
 #include "paddle/fluid/framework/framework.pb.h"
 
 namespace paddle {
 namespace inference {
 
+struct Buffer;
+enum class DeviceType { UNK = -1, CPU, GPU };
+
 /*
  * EngineBase is the base class of all inference engines. An inference engine
  * takes a paddle program as input, and outputs the result in fluid Tensor
@@ -45,8 +49,20 @@ class EngineBase {
   // Execute the engine, that will run the inference network.
   virtual void Execute(int batch_size) = 0;
 
+  // Return the IO buffer that allocated in engine. One can read/write directly
+  // on the buffer. If the buffer's buffer is nullptr, one can also allocate
+  // memory and maintain it outside the engine.
+  virtual Buffer& buffer(const std::string& name) = 0;
+
   virtual ~EngineBase() {}
 };  // class EngineBase
 
+struct Buffer {
+  void* buffer{nullptr};               // buffer should be allocated only once.
+  size_t max_size;                     // buffer allocated space.
+  size_t size;                         // data size.
+  DeviceType device{DeviceType::UNK};  // tells which device this buffer is on.
+};
+
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/CMakeLists.txt b/paddle/fluid/inference/tensorrt/CMakeLists.txt
index c8b656394b403c4965e01e96c9215d9406091907..b52d083f280e5e7713600a7b748dedd37aca0a1e 100644
--- a/paddle/fluid/inference/tensorrt/CMakeLists.txt
+++ b/paddle/fluid/inference/tensorrt/CMakeLists.txt
@@ -1,5 +1,4 @@
+nv_library(tensorrt_engine SRCS engine.cc DEPS framework_proto)
 nv_test(test_tensorrt SRCS test_tensorrt.cc DEPS dynload_cuda device_context dynamic_loader)
-nv_test(test_tensorrt_engine SRCS test_engine.cc engine.cc DEPS dynload_cuda)
-nv_test(test_io_converter SRCS test_io_converter.cc io_converter.cc DEPS dynload_cuda dynamic_loader lod_tensor)
-set(ENGINE_FILE ${CMAKE_CURRENT_SOURCE_DIR}/engine.cc)
+nv_test(test_tensorrt_engine SRCS test_engine.cc DEPS dynload_cuda tensorrt_engine)
 add_subdirectory(convert)
diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
index 572e29515f89685c528023cf6569ef8e559d6874..748f5a084e8c880df215a60fe51c835ba5cd3110 100644
--- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
+++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
@@ -1,3 +1,15 @@
-nv_test(test_tensorrt_op_converter SRCS test_op_converter.cc mul_op.cc conv2d_op.cc DEPS ${FLUID_CORE_MODULES})
-nv_test(test_tensorrt_activation_op SRCS test_activation_op.cc ${ENGINE_FILE} activation_op.cc 
-  DEPS ${FLUID_CORE_MODULES} activation_op)
+# Add TRT tests
+nv_library(tensorrt_converter
+  SRCS mul_op.cc conv2d_op.cc fc_op.cc
+  DEPS tensorrt_engine mul_op)
+
+nv_test(test_op_converter SRCS test_op_converter.cc DEPS
+  ${FLUID_CORE_MODULES} tensorrt_engine tensorrt_converter)
+
+nv_test(test_io_converter SRCS test_io_converter.cc io_converter.cc DEPS dynload_cuda dynamic_loader lod_tensor)
+nv_test(test_trt_mul_op SRCS test_mul_op.cc mul_op.cc
+        DEPS ${FLUID_CORE_MODULES} tensorrt_engine mul_op SERIAL)
+nv_test(test_trt_fc_op SRCS test_fc_op.cc fc_op.cc
+        DEPS ${FLUID_CORE_MODULES} tensorrt_engine mul_op SERIAL)
+nv_test(test_trt_activation_op SRCS test_activation_op.cc activation_op.cc
+        DEPS ${FLUID_CORE_MODULES} tensorrt_engine activation_op SERIAL)
diff --git a/paddle/fluid/inference/tensorrt/convert/activation_op.cc b/paddle/fluid/inference/tensorrt/convert/activation_op.cc
index 543784289cfc51048057e467d36fdd1f334eb903..e1cace9cc1b06f036f52e82b7b86c99a02d50f50 100644
--- a/paddle/fluid/inference/tensorrt/convert/activation_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/activation_op.cc
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 
 namespace paddle {
@@ -21,20 +22,29 @@ namespace tensorrt {
 class ReluOpConverter : public OpConverter {
  public:
   ReluOpConverter() {}
-  void operator()(const framework::OpDesc& op) override {
+  void operator()(const framework::proto::OpDesc& op,
+                  const framework::Scope& scope, bool test_mode) override {
+    // Here the two nullptr looks strange, that's because the
+    // framework::OpDesc's constructor is strange.
+    framework::OpDesc op_desc(op, nullptr);
     LOG(INFO) << "convert a fluid relu op to tensorrt activation layer whose "
                  "type is Relu";
     const nvinfer1::ITensor* input_tensor =
-        engine_->GetITensor(op.Input("X")[0]);
+        engine_->GetITensor(op_desc.Input("X")[0]);
     nvinfer1::IActivationLayer* layer = TRT_ENGINE_ADD_LAYER(
         engine_, Activation, *const_cast<nvinfer1::ITensor*>(input_tensor),
         nvinfer1::ActivationType::kRELU);
-    engine_->SetITensor(op.Output("Out")[0], layer->getOutput(0));
+    auto output_name = op_desc.Output("Out")[0];
+    engine_->SetITensor(output_name, layer->getOutput(0));
+    if (test_mode) {  // the test framework can not determine which is the
+                      // output, so place the declaration inside.
+      engine_->DeclareOutput(output_name);
+    }
   }
 };
 
-REGISTER_TRT_OP_CONVERTER(relu, ReluOpConverter);
-
 }  // namespace tensorrt
 }  // namespace inference
 }  // namespace paddle
+
+REGISTER_TRT_OP_CONVERTER(relu, ReluOpConverter);
diff --git a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
index 431500b90e144e2a30fe705b72e93452f806ca65..8e7e23377d4b2fe7afd51f1f58048fc4ed3c6d99 100644
--- a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
@@ -21,14 +21,15 @@ namespace tensorrt {
 class Conv2dOpConverter : public OpConverter {
  public:
   Conv2dOpConverter() {}
-  void operator()(const framework::OpDesc& op) override {
+  void operator()(const framework::proto::OpDesc& op,
+                  const framework::Scope& scope, bool test_mode) override {
     LOG(INFO)
         << "convert a fluid conv2d op to tensorrt conv layer without bias";
   }
 };
 
-REGISTER_TRT_OP_CONVERTER(conv2d, Conv2dOpConverter);
-
 }  // namespace tensorrt
 }  // namespace inference
 }  // namespace paddle
+
+REGISTER_TRT_OP_CONVERTER(conv2d, Conv2dOpConverter);
diff --git a/paddle/fluid/inference/tensorrt/convert/fc_op.cc b/paddle/fluid/inference/tensorrt/convert/fc_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..bb603efaf30bb72d74b5583abc45d01a16c076a3
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/convert/fc_op.cc
@@ -0,0 +1,121 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+#include "paddle/fluid/inference/tensorrt/engine.h"
+#include "paddle/fluid/platform/place.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+// Reorder the elements from istrides to ostrides, borrowed from TRT convert in
+// tensorflow.
+// https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/tensorrt/convert/convert_nodes.cc#L318
+template <typename T>
+void Reorder2(nvinfer1::DimsHW shape, const T* idata, nvinfer1::DimsHW istrides,
+              T* odata, nvinfer1::DimsHW ostrides) {
+  for (int h = 0; h < shape.h(); ++h) {
+    for (int w = 0; w < shape.w(); ++w) {
+      odata[h * ostrides.h() + w * ostrides.w()] =
+          idata[h * ostrides.h() + w * ostrides.w()];
+    }
+  }
+}
+
+// Reorder the data layout from CK to KC.
+void ReorderCKtoKC(TensorRTEngine::Weight& iweights,
+                   TensorRTEngine::Weight* oweights) {
+  int c = iweights.dims[0];
+  int k = iweights.dims[1];
+  oweights->dims.assign({k, c});
+  nvinfer1::DimsHW istrides = {1, k};
+  nvinfer1::DimsHW ostrides = {c, 1};
+  Reorder2({k, c}, static_cast<float const*>(iweights.get().values), istrides,
+           static_cast<float*>(const_cast<void*>(oweights->get().values)),
+           ostrides);
+}
+
+/*
+ * FC converter convert a MUL op in Fluid to a FC layer in TRT.
+ */
+class FcOpConverter : public OpConverter {
+ public:
+  void operator()(const framework::proto::OpDesc& op,
+                  const framework::Scope& scope, bool test_mode) override {
+    VLOG(4) << "convert a fluid fc op to tensorrt fc layer without bias";
+
+    framework::OpDesc op_desc(op, nullptr);
+    PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
+    PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1);  // Y is a weight
+    PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
+
+    // Declare inputs
+    auto* X = engine_->GetITensor(op_desc.Input("X").front());
+
+    // Declare weights
+    auto* Y_v = scope.FindVar(op_desc.Input("Y").front());
+    PADDLE_ENFORCE_NOT_NULL(Y_v);
+    auto* Y_t = Y_v->GetMutable<framework::LoDTensor>();
+    // This may trigger a GPU->CPU copy, because TRT's weight can only be
+    // assigned from CPU memory, that can't be avoided.
+    auto* weight_data = Y_t->mutable_data<float>(platform::CPUPlace());
+    PADDLE_ENFORCE_EQ(Y_t->dims().size(), 2UL);  // a matrix
+    size_t n_output = Y_t->dims()[1];
+
+    framework::LoDTensor tmp;
+    tmp.Resize(Y_t->dims());
+    memcpy(tmp.mutable_data<float>(platform::CPUPlace()), Y_t->data<float>(),
+           Y_t->dims()[0] * Y_t->dims()[1]);
+
+    TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT,
+                                  static_cast<void*>(weight_data),
+                                  Y_t->memory_size() / sizeof(float)};
+    TensorRTEngine::Weight tmp_weight(nvinfer1::DataType::kFLOAT,
+                                      static_cast<void*>(tmp.data<float>()),
+                                      Y_t->memory_size() / sizeof(float));
+    weight.dims.assign({Y_t->dims()[0], Y_t->dims()[1]});
+    tmp_weight.dims = weight.dims;
+
+    // The data layout of TRT FC layer's weight is different from fluid's FC,
+    // need to reorder the elements.
+    ReorderCKtoKC(tmp_weight, &weight);
+
+    // Currently, the framework can only handle one fluid op -> one TRT layer,
+    // but fc fuses `mul` and `bias` (2 fluid ops), so here is a trick, just
+    // handle `mul`, leave `add` as another layer.
+    // DEBUG
+    TensorRTEngine::Weight bias{nvinfer1::DataType::kFLOAT, nullptr, 0};
+
+    auto* layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected,
+                                       *const_cast<nvinfer1::ITensor*>(X),
+                                       n_output, weight.get(), bias.get());
+
+    auto output_name = op_desc.Output("Out").front();
+    engine_->SetITensor(output_name, layer->getOutput(0));
+    if (test_mode) {
+      engine_->DeclareOutput(output_name);
+    }
+  }
+};
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
+
+REGISTER_TRT_OP_CONVERTER(fc, FcOpConverter);
+USE_OP(mul);
diff --git a/paddle/fluid/inference/tensorrt/convert/io_converter.cc b/paddle/fluid/inference/tensorrt/convert/io_converter.cc
new file mode 100644
index 0000000000000000000000000000000000000000..854f434d93e81237dc85c5df62debcf3b3824b78
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/convert/io_converter.cc
@@ -0,0 +1,74 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/inference/tensorrt/convert/io_converter.h"
+#include <cuda.h>
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+using platform::is_gpu_place;
+using platform::is_cpu_place;
+
+class DefaultIOConverter : public EngineIOConverter {
+ public:
+  DefaultIOConverter() {}
+  // NOTE out is GPU memory.
+  virtual void operator()(const LoDTensor& in, void* out,
+                          size_t max_size) override {
+    PADDLE_ENFORCE(out != nullptr);
+    PADDLE_ENFORCE(stream_ != nullptr);
+    const auto& place = in.place();
+    size_t size = in.memory_size();
+    PADDLE_ENFORCE_LE(size, max_size);
+    if (is_cpu_place(place)) {
+      PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(out, in.data<float>(), size,
+                                           cudaMemcpyHostToDevice, *stream_));
+    } else if (is_gpu_place(place)) {
+      PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(out, in.data<float>(), size,
+                                           cudaMemcpyDeviceToDevice, *stream_));
+    } else {
+      PADDLE_THROW("Unknown device for converter");
+    }
+    cudaStreamSynchronize(*stream_);
+  }
+  // NOTE in is GPU memory.
+  virtual void operator()(const void* in, LoDTensor* out,
+                          size_t max_size) override {
+    PADDLE_ENFORCE(in != nullptr);
+    PADDLE_ENFORCE(stream_ != nullptr);
+    const auto& place = out->place();
+    size_t size = out->memory_size();
+    PADDLE_ENFORCE_LE(size, max_size);
+    if (is_cpu_place(place)) {
+      PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(out->data<float>(), in, size,
+                                           cudaMemcpyDeviceToHost, *stream_));
+    } else if (is_gpu_place(place)) {
+      PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(out->data<float>(), in, size,
+                                           cudaMemcpyDeviceToDevice, *stream_));
+    } else {
+      PADDLE_THROW("Unknown device for converter");
+    }
+    cudaStreamSynchronize(*stream_);
+  }
+};
+
+// fluid LodTensor <-> tensorrt ITensor
+REGISTER_TENSORRT_IO_CONVERTER(default, DefaultIOConverter);
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/convert/io_converter.h b/paddle/fluid/inference/tensorrt/convert/io_converter.h
new file mode 100644
index 0000000000000000000000000000000000000000..71c48e085d25d2bc6720d93735f661f9e3af7b40
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/convert/io_converter.h
@@ -0,0 +1,82 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <string>
+#include <unordered_map>
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/inference/utils/singleton.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+using framework::LoDTensor;
+
+/*
+ * Convert Input from Fluid to TensorRT Engine.
+ * Convert Output from TensorRT Engine to Fluid.
+ *
+ * Note that TensorRT's ITensor follows row major, NCHW. Fluid is also row
+ * major,
+ * so in the default case just need to copy the data.
+ */
+class EngineIOConverter {
+ public:
+  EngineIOConverter() {}
+
+  virtual void operator()(const LoDTensor& in, void* out, size_t max_size) {}
+  virtual void operator()(const void* in, LoDTensor* out, size_t max_size) {}
+
+  void SetStream(cudaStream_t* stream) { stream_ = stream; }
+
+  static void ConvertInput(const std::string& op_type, const LoDTensor& in,
+                           void* out, size_t max_size, cudaStream_t* stream) {
+    PADDLE_ENFORCE(stream != nullptr);
+    auto* converter = Registry<EngineIOConverter>::Lookup(
+        op_type, "default" /* default_type */);
+    PADDLE_ENFORCE_NOT_NULL(converter);
+    converter->SetStream(stream);
+    (*converter)(in, out, max_size);
+  }
+
+  static void ConvertOutput(const std::string& op_type, const void* in,
+                            LoDTensor* out, size_t max_size,
+                            cudaStream_t* stream) {
+    PADDLE_ENFORCE(stream != nullptr);
+    auto* converter = Registry<EngineIOConverter>::Lookup(
+        op_type, "default" /* default_type */);
+    PADDLE_ENFORCE_NOT_NULL(converter);
+    converter->SetStream(stream);
+    (*converter)(in, out, max_size);
+  }
+
+  virtual ~EngineIOConverter() {}
+
+ protected:
+  cudaStream_t* stream_{nullptr};
+};
+
+#define REGISTER_TENSORRT_IO_CONVERTER(op_type__, Converter__)        \
+  struct trt_io_##op_type__##_converter {                             \
+    trt_io_##op_type__##_converter() {                                \
+      Registry<EngineIOConverter>::Register<Converter__>(#op_type__); \
+    }                                                                 \
+  };                                                                  \
+  trt_io_##op_type__##_converter trt_io_##op_type__##_converter__;
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/convert/mul_op.cc b/paddle/fluid/inference/tensorrt/convert/mul_op.cc
index f9834ab156c9dcc11f4e89075b7bf5457cf00268..3c342957360ad4192d838147bf37e84d233c2629 100644
--- a/paddle/fluid/inference/tensorrt/convert/mul_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/mul_op.cc
@@ -18,16 +18,36 @@ namespace paddle {
 namespace inference {
 namespace tensorrt {
 
+/*
+ * MulOp, IMatrixMultiplyLayer in TRT. This Layer doesn't has weights.
+ */
 class MulOpConverter : public OpConverter {
  public:
-  MulOpConverter() {}
-  void operator()(const framework::OpDesc& op) override {
-    LOG(INFO) << "convert a fluid mul op to tensorrt fc layer without bias";
+  void operator()(const framework::proto::OpDesc& op,
+                  const framework::Scope& scope, bool test_mode) override {
+    VLOG(4) << "convert a fluid mul op to tensorrt mul layer without bias";
+
+    framework::OpDesc op_desc(op, nullptr);
+    // Declare inputs
+    auto* input1 = engine_->GetITensor(op_desc.Input("X")[0]);
+    auto* input2 = engine_->GetITensor(op_desc.Input("Y")[0]);
+    // Both the input1 and input2 do not need transpose.
+    auto* layer = TRT_ENGINE_ADD_LAYER(
+        engine_, MatrixMultiply, *const_cast<nvinfer1::ITensor*>(input1), false,
+        *const_cast<nvinfer1::ITensor*>(input2), false);
+
+    auto output_name = op_desc.Output("Out")[0];
+    engine_->SetITensor(output_name, layer->getOutput(0));
+    if (test_mode) {  // the test framework can not determine which is the
+                      // output, so place the declaration inside.
+      engine_->DeclareOutput(output_name);
+    }
   }
 };
 
-REGISTER_TRT_OP_CONVERTER(mul, MulOpConverter);
-
 }  // namespace tensorrt
 }  // namespace inference
 }  // namespace paddle
+
+USE_OP(mul);
+REGISTER_TRT_OP_CONVERTER(mul, MulOpConverter);
diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h
index f8ca219bb83c6db3ba640464233c6cbd337926a6..c7a5a49dd02d0db022fabff5c3ae1c7800bac25c 100644
--- a/paddle/fluid/inference/tensorrt/convert/op_converter.h
+++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h
@@ -17,8 +17,10 @@ limitations under the License. */
 #include <string>
 #include <unordered_map>
 #include "paddle/fluid/framework/block_desc.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/inference/tensorrt/engine.h"
+#include "paddle/fluid/inference/utils/singleton.h"
 
 namespace paddle {
 namespace inference {
@@ -30,36 +32,45 @@ namespace tensorrt {
 class OpConverter {
  public:
   OpConverter() {}
-  virtual void operator()(const framework::OpDesc& op) {}
-
-  void Execute(const framework::OpDesc& op, TensorRTEngine* engine) {
-    std::string type = op.Type();
-    auto it = converters_.find(type);
-    PADDLE_ENFORCE(it != converters_.end(), "no OpConverter for optype [%s]",
-                   type);
-    it->second->SetEngine(engine);
-    (*it->second)(op);
-  }
-
-  static OpConverter& Global() {
-    static auto* x = new OpConverter;
-    return *x;
-  }
-
-  template <typename T>
-  void Register(const std::string& key) {
-    converters_[key] = new T;
-  }
 
-  // convert fluid op to tensorrt layer
-  void ConvertOp(const framework::OpDesc& op, TensorRTEngine* engine) {
-    OpConverter::Global().Execute(op, engine);
+  // Converter logic for an op.
+  virtual void operator()(const framework::proto::OpDesc& op,
+                          const framework::Scope& scope,
+                          bool test_mode = false) {}
+
+  // Convert a single fluid operator and add the corresponding layer to TRT.
+  // test_mode: whether the instance executes in an unit test.
+  void ConvertOp(const framework::proto::OpDesc& op,
+                 const std::unordered_set<std::string>& parameters,
+                 const framework::Scope& scope, TensorRTEngine* engine,
+                 bool test_mode = false) {
+    framework::OpDesc op_desc(op, nullptr);
+
+    OpConverter* it{nullptr};
+
+    if (op_desc.Type() == "mul") {
+      PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1UL);
+      std::string Y = op_desc.Input("Y")[0];
+      if (parameters.count(Y)) {
+        it = Registry<OpConverter>::Lookup("fc");
+      }
+    }
+    if (!it) {
+      it = Registry<OpConverter>::Lookup(op_desc.Type());
+    }
+    PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]",
+                            op_desc.Type());
+    it->SetEngine(engine);
+    (*it)(op, scope, test_mode);
   }
 
   // convert fluid block to tensorrt network
-  void ConvertBlock(const framework::BlockDesc& block, TensorRTEngine* engine) {
-    for (auto op : block.AllOps()) {
-      OpConverter::Global().Execute(*op, engine);
+  void ConvertBlock(const framework::proto::BlockDesc& block,
+                    const std::unordered_set<std::string>& parameters,
+                    const framework::Scope& scope, TensorRTEngine* engine) {
+    for (int i = 0; i < block.ops_size(); i++) {
+      const auto& op = block.ops(i);
+      ConvertOp(op, parameters, scope, engine);
     }
   }
 
@@ -70,6 +81,9 @@ class OpConverter {
   // TensorRT engine
   TensorRTEngine* engine_{nullptr};
 
+ protected:
+  bool test_mode_;
+
  private:
   // registered op converter map, whose key is the fluid op type, and value is
   // the pointer position of corresponding OpConverter class.
@@ -78,13 +92,24 @@ class OpConverter {
   framework::Scope* scope_{nullptr};
 };
 
-#define REGISTER_TRT_OP_CONVERTER(op_type__, Converter__)      \
-  struct trt_##op_type__##_converter {                         \
-    trt_##op_type__##_converter() {                            \
-      OpConverter::Global().Register<Converter__>(#op_type__); \
-    }                                                          \
-  };                                                           \
-  trt_##op_type__##_converter trt_##op_type__##_converter__;
+#define REGISTER_TRT_OP_CONVERTER(op_type__, Converter__)                      \
+  struct trt_##op_type__##_converter : public ::paddle::framework::Registrar { \
+    trt_##op_type__##_converter() {                                            \
+      ::paddle::inference::                                                    \
+          Registry<paddle::inference::tensorrt::OpConverter>::Register<        \
+              ::paddle::inference::tensorrt::Converter__>(#op_type__);         \
+    }                                                                          \
+  };                                                                           \
+  trt_##op_type__##_converter trt_##op_type__##_converter__;                   \
+  int TouchConverterRegister_##op_type__() {                                   \
+    trt_##op_type__##_converter__.Touch();                                     \
+    return 0;                                                                  \
+  }
+
+#define USE_TRT_CONVERTER(op_type__)                                    \
+  extern int TouchConverterRegister_##op_type__();                      \
+  static int use_op_converter_trt_##op_type__ __attribute__((unused)) = \
+      TouchConverterRegister_##op_type__();
 
 }  // namespace tensorrt
 }  // namespace inference
diff --git a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc
index 0f390bee1f5967547ea21e6d42a1b4300de58d04..0a02a7bebf9efbd0555707e6cfa701ef1e7d9659 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc
@@ -1,94 +1,47 @@
 /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
 
-    http://www.apache.org/licenses/LICENSE-2.0
+   http://www.apache.org/licenses/LICENSE-2.0
 
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
 
 #include <gtest/gtest.h>
-#include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
-#include "paddle/fluid/platform/device_context.h"
-#include "paddle/fluid/platform/place.h"
-
-USE_OP(relu);
+#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
 
 namespace paddle {
 namespace inference {
 namespace tensorrt {
 
-void compare(float input, float expect) {
+TEST(ReluOpConverter, main) {
   framework::Scope scope;
-  platform::CUDAPlace place;
-  platform::CUDADeviceContext ctx(place);
-
-  // init fluid op and variable
-  auto x_var = scope.Var("X");
-  auto x_tensor = x_var->GetMutable<framework::LoDTensor>();
-  x_tensor->Resize({1, 1});
-  std::vector<float> init;
-  init.push_back(input);
-  framework::TensorFromVector(init, ctx, x_tensor);
-
-  auto out_var = scope.Var("Out");
-  auto out_tensor = out_var->GetMutable<framework::LoDTensor>();
-  out_tensor->Resize({1, 1});
-  out_tensor->mutable_data<float>(place);
-
-  framework::OpDesc op_desc;
-  op_desc.SetType("relu");
-  op_desc.SetInput("X", {"X"});
-  op_desc.SetOutput("Out", {"Out"});
-
-  auto relu_op = framework::OpRegistry::CreateOp(op_desc);
-
-  // run fluid op
-  relu_op->Run(scope, place);
-  std::vector<float> out1;
-  framework::TensorToVector(*out_tensor, ctx, &out1);
-
-  // init tensorrt op
-  cudaStream_t stream;
-  ASSERT_EQ(0, cudaStreamCreate(&stream));
-  TensorRTEngine* engine = new TensorRTEngine(1, 1 << 10, &stream);
-  engine->InitNetwork();
-  engine->DeclareInput("X", nvinfer1::DataType::kFLOAT,
-                       nvinfer1::DimsCHW{1, 1, 1});
-
-  OpConverter op_converter;
-  op_converter.ConvertOp(op_desc, engine);
-
-  engine->DeclareOutput("Out");
-  engine->FreezeNetwork();
-  engine->SetInputFromCPU("X", &input, 1 * sizeof(float));
-
-  // run tensorrt op
-  engine->Execute(1);
-
-  float out2;
-  engine->GetOutputInCPU("Out", &out2, 1 * sizeof(float));
-
-  ASSERT_EQ(out1[0], out2);
-  ASSERT_EQ(out1[0], expect);
-
-  delete engine;
-  cudaStreamDestroy(stream);
-}
-
-TEST(OpConverter, ConvertRelu) {
-  compare(1, 1);   // relu(1) = 1
-  compare(-5, 0);  // relu(-5) = 0
+  std::unordered_set<std::string> parameters;
+  TRTConvertValidation validator(10, parameters, scope, 1000);
+  validator.DeclInputVar("relu-X", nvinfer1::Dims2(10, 6));
+  validator.DeclOutputVar("relu-Out", nvinfer1::Dims2(10, 6));
+
+  // Prepare Op description
+  framework::OpDesc desc;
+  desc.SetType("relu");
+  desc.SetInput("X", {"relu-X"});
+  desc.SetOutput("Out", {"relu-Out"});
+
+  LOG(INFO) << "set OP";
+  validator.SetOp(*desc.Proto());
+  LOG(INFO) << "execute";
+
+  validator.Execute(10);
 }
 
 }  // namespace tensorrt
 }  // namespace inference
 }  // namespace paddle
+
+USE_OP(relu);
diff --git a/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc b/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a30253072ac581ceca85ca10151a176f87a7cb39
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc
@@ -0,0 +1,46 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+TEST(fc_op, test) {
+  std::unordered_set<std::string> parameters({"mul-Y"});
+  framework::Scope scope;
+  TRTConvertValidation validator(20, parameters, scope, 1000);
+
+  validator.DeclInputVar("mul-X", nvinfer1::Dims4(8, 3, 1, 1));
+  validator.DeclParamVar("mul-Y", nvinfer1::Dims2(3, 2));
+  validator.DeclOutputVar("mul-Out", nvinfer1::Dims2(8, 2));
+
+  // Prepare Op description
+  framework::OpDesc desc;
+  desc.SetType("mul");
+  desc.SetInput("X", {"mul-X"});
+  desc.SetInput("Y", {"mul-Y"});
+  desc.SetOutput("Out", {"mul-Out"});
+
+  validator.SetOp(*desc.Proto());
+
+  validator.Execute(10);
+}
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/convert/test_io_converter.cc b/paddle/fluid/inference/tensorrt/convert/test_io_converter.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8f91309a0a00d5131268f026c319e25ba3cb964a
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/convert/test_io_converter.cc
@@ -0,0 +1,76 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/inference/tensorrt/convert/io_converter.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+void IOConverterTester(const platform::DeviceContext& ctx) {
+  cudaStream_t stream;
+  ASSERT_EQ(0, cudaStreamCreate(&stream));
+
+  // init fluid in_tensor
+  framework::LoDTensor in_tensor;
+  in_tensor.Resize({10, 10});
+  auto place = ctx.GetPlace();
+  in_tensor.mutable_data<float>(place);
+  std::vector<float> init;
+  for (int64_t i = 0; i < 10 * 10; ++i) {
+    init.push_back(i);
+  }
+  framework::TensorFromVector(init, ctx, &in_tensor);
+
+  // init tensorrt buffer
+  void* buffer;
+  size_t size = in_tensor.memory_size();
+  ASSERT_EQ(cudaMalloc(&buffer, size), 0);
+
+  // convert fluid in_tensor to tensorrt buffer
+  EngineIOConverter::ConvertInput("test", in_tensor, buffer, size, &stream);
+
+  // convert tensorrt buffer to fluid out_tensor
+  framework::LoDTensor out_tensor;
+  out_tensor.Resize({10, 10});
+  out_tensor.mutable_data<float>(place);
+  EngineIOConverter::ConvertOutput("test", buffer, &out_tensor, size, &stream);
+
+  // compare in_tensor and out_tensor
+  std::vector<float> result;
+  framework::TensorToVector(out_tensor, ctx, &result);
+  EXPECT_EQ(init.size(), result.size());
+  for (size_t i = 0; i < init.size(); i++) {
+    EXPECT_EQ(init[i], result[i]);
+  }
+  cudaStreamDestroy(stream);
+}
+
+TEST(EngineIOConverterTester, DefaultCPU) {
+  platform::CPUPlace place;
+  platform::CPUDeviceContext ctx(place);
+  IOConverterTester(ctx);
+}
+
+TEST(EngineIOConverterTester, DefaultGPU) {
+  platform::CUDAPlace place;
+  platform::CUDADeviceContext ctx(place);
+  IOConverterTester(ctx);
+}
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc b/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1ce1130e5d660d717a1262a1fbdb4b620462c0b3
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc
@@ -0,0 +1,49 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+TEST(MulOpConverter, main) {
+  framework::Scope scope;
+  std::unordered_set<std::string> parameters;
+  TRTConvertValidation validator(10, parameters, scope, 1000);
+  validator.DeclInputVar("mul-X", nvinfer1::Dims2(10, 6));
+  validator.DeclInputVar("mul-Y", nvinfer1::Dims2(6, 10));
+  validator.DeclOutputVar("mul-Out", nvinfer1::Dims2(10, 10));
+
+  // Prepare Op description
+  framework::OpDesc desc;
+  desc.SetType("mul");
+  desc.SetInput("X", {"mul-X"});
+  desc.SetInput("Y", {"mul-Y"});
+  desc.SetOutput("Out", {"mul-Out"});
+
+  LOG(INFO) << "set OP";
+  validator.SetOp(*desc.Proto());
+  LOG(INFO) << "execute";
+
+  validator.Execute(10);
+}
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
+
+USE_OP(mul);
diff --git a/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc b/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc
index 5c5ac10394d3cb76ac4b8409c7fa79eb5e9a90b2..9b79f86b0edba983019bd932f52b08711ff36d41 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc
@@ -12,26 +12,29 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+
 #include <gtest/gtest.h>
 #include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 
 namespace paddle {
 namespace inference {
 namespace tensorrt {
 
-TEST(BlockConverter, ConvertBlock) {
+TEST(OpConverter, ConvertBlock) {
   framework::ProgramDesc prog;
   auto* block = prog.MutableBlock(0);
-  auto* mul_op = block->AppendOp();
-  mul_op->SetType("mul");
   auto* conv2d_op = block->AppendOp();
   conv2d_op->SetType("conv2d");
 
   OpConverter converter;
-  converter.ConvertBlock(*block, nullptr /*TensorRTEngine*/);
+  framework::Scope scope;
+  converter.ConvertBlock(*block->Proto(), {}, scope,
+                         nullptr /*TensorRTEngine*/);
 }
 
 }  // namespace tensorrt
 }  // namespace inference
 }  // namespace paddle
+
+USE_TRT_CONVERTER(conv2d)
diff --git a/paddle/fluid/inference/tensorrt/convert/ut_helper.h b/paddle/fluid/inference/tensorrt/convert/ut_helper.h
new file mode 100644
index 0000000000000000000000000000000000000000..236d169017f65e4c9d513c3ca4511daa2dfee06e
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/convert/ut_helper.h
@@ -0,0 +1,173 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+/*
+ * This file implements a UT framework to make the validation of transforming
+ * Fluid Op to TRT Layer.
+ */
+
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/inference/analysis/helper.h"
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+#include "paddle/fluid/inference/tensorrt/engine.h"
+#include "paddle/fluid/inference/utils/singleton.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+/*
+ * Get a random float value between [low, high]
+ */
+float random(float low, float high) {
+  static std::random_device rd;
+  static std::mt19937 mt(rd());
+  std::uniform_real_distribution<double> dist(1.0, 10.0);
+  return dist(mt);
+}
+
+void RandomizeTensor(framework::LoDTensor* tensor, const platform::Place& place,
+                     const platform::DeviceContext& ctx) {
+  auto dims = tensor->dims();
+  size_t num_elements = analysis::AccuDims(dims, dims.size());
+  PADDLE_ENFORCE_GT(num_elements, 0);
+  auto* data = tensor->mutable_data<float>(place);
+  for (size_t i = 0; i < num_elements; i++) {
+    *(data + i) = random(0., 1.);
+  }
+}
+
+/*
+ * Help to validate the correctness between Fluid Op and the corresponding TRT
+ * layer.
+ */
+class TRTConvertValidation {
+ public:
+  TRTConvertValidation() = delete;
+
+  TRTConvertValidation(int batch_size,
+                       const std::unordered_set<std::string>& parameters,
+                       framework::Scope& scope, int workspace_size = 1 << 10)
+      : parameters_(parameters), scope_(scope) {
+    // create engine.
+    engine_.reset(new TensorRTEngine(10, 1 << 10, &stream_));
+    engine_->InitNetwork();
+
+    PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream_), 0);
+  }
+
+  // Declare a Variable as input with random initialization.
+  void DeclInputVar(const std::string& name, const nvinfer1::Dims& dims) {
+    DeclVar(name, dims);
+    // Declare TRT inputs.
+    engine_->DeclareInput(name, nvinfer1::DataType::kFLOAT, dims);
+  }
+
+  // Declare a parameter varaible in the scope.
+  void DeclParamVar(const std::string& name, const nvinfer1::Dims& dims) {
+    DeclVar(name, dims);
+  }
+
+  void DeclOutputVar(const std::string& name, const nvinfer1::Dims& dims) {
+    DeclVar(name, dims);
+  }
+
+  // Declare a variable in a fluid Scope.
+  void DeclVar(const std::string& name, const nvinfer1::Dims& dims) {
+    platform::CPUPlace place;
+    platform::CPUDeviceContext ctx(place);
+
+    // Init Fluid tensor.
+    std::vector<int> dim_vec(dims.d, dims.d + dims.nbDims);
+    auto* x = scope_.Var(name);
+    auto* x_tensor = x->GetMutable<framework::LoDTensor>();
+    x_tensor->Resize(framework::make_ddim(dim_vec));
+    RandomizeTensor(x_tensor, place, ctx);
+  }
+
+  void SetOp(const framework::proto::OpDesc& desc) {
+    op_ = framework::OpRegistry::CreateOp(desc);
+
+    Singleton<OpConverter>::Global().ConvertOp(
+        desc, parameters_, scope_, engine_.get(), true /*test_mode*/);
+
+    engine_->FreezeNetwork();
+
+    // Declare outputs.
+    op_desc_.reset(new framework::OpDesc(desc, nullptr));
+
+    // Set Inputs.
+    for (const auto& input : op_desc_->InputArgumentNames()) {
+      if (parameters_.count(input)) continue;
+      auto* var = scope_.FindVar(input);
+      PADDLE_ENFORCE(var);
+      auto tensor = var->GetMutable<framework::LoDTensor>();
+
+      engine_->SetInputFromCPU(
+          input, static_cast<void*>(tensor->data<void>()),
+          sizeof(float) *
+              analysis::AccuDims(tensor->dims(), tensor->dims().size()));
+    }
+  }
+
+  void Execute(int batch_size) {
+    // Execute Fluid Op
+    platform::CPUPlace place;
+    platform::CPUDeviceContext ctx(place);
+    op_->Run(scope_, place);
+    // Execute TRT.
+    engine_->Execute(batch_size);
+    cudaStreamSynchronize(*engine_->stream());
+
+    ASSERT_FALSE(op_desc_->OutputArgumentNames().empty());
+    const size_t output_space_size = 200;
+    for (const auto& output : op_desc_->OutputArgumentNames()) {
+      std::vector<float> fluid_out;
+      std::vector<float> trt_out(output_space_size);
+      engine_->GetOutputInCPU(output, &trt_out[0],
+                              output_space_size * sizeof(float));
+      cudaStreamSynchronize(*engine_->stream());
+
+      auto* var = scope_.FindVar(output);
+      auto tensor = var->GetMutable<framework::LoDTensor>();
+      framework::TensorToVector(*tensor, ctx, &fluid_out);
+      // Compare two output
+      ASSERT_FALSE(fluid_out.empty());
+      for (size_t i = 0; i < fluid_out.size(); i++) {
+        // Loose the threshold for CI in different machine model.
+        EXPECT_LT(std::abs(fluid_out[i] - trt_out[i]), 2e-5);
+      }
+    }
+  }
+
+  framework::Scope& scope() { return scope_; }
+
+ private:
+  std::unique_ptr<TensorRTEngine> engine_;
+  cudaStream_t stream_;
+  std::unique_ptr<framework::OperatorBase> op_;
+  std::unique_ptr<framework::OpDesc> op_desc_;
+  const std::unordered_set<std::string>& parameters_;
+  framework::Scope& scope_;
+};
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc
index df123a59079acc5f549e733b412ab302aa397a92..596e0fe9da3d272ecb1c0f8dbef09a75d08a4b1a 100644
--- a/paddle/fluid/inference/tensorrt/engine.cc
+++ b/paddle/fluid/inference/tensorrt/engine.cc
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <cuda.h>
 #include <glog/logging.h>
 #include <string>
+#include "paddle/fluid/inference/analysis/helper.h"
 #include "paddle/fluid/inference/tensorrt/helper.h"
 #include "paddle/fluid/platform/enforce.h"
 
@@ -30,16 +31,25 @@ void TensorRTEngine::Build(const DescType& paddle_model) {
 }
 
 void TensorRTEngine::Execute(int batch_size) {
-  infer_context_->enqueue(batch_size, buffers_.data(), *stream_, nullptr);
+  std::vector<void*> buffers;
+  for (auto& buf : buffers_) {
+    PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated");
+    PADDLE_ENFORCE_GT(buf.max_size, 0);
+    PADDLE_ENFORCE(buf.device == DeviceType::GPU);
+    buffers.push_back(buf.buffer);
+  }
+  infer_context_->enqueue(batch_size, buffers.data(), *stream_, nullptr);
   cudaStreamSynchronize(*stream_);
 }
 
 TensorRTEngine::~TensorRTEngine() {
+  cudaStreamSynchronize(*stream_);
   // clean buffer
-  for (auto& buffer : buffers_) {
-    if (buffer != nullptr) {
-      PADDLE_ENFORCE_EQ(0, cudaFree(buffer));
-      buffer = nullptr;
+  for (auto& buf : buffers_) {
+    if (buf.device == DeviceType::GPU && buf.buffer != nullptr) {
+      PADDLE_ENFORCE_EQ(0, cudaFree(buf.buffer));
+      buf.buffer = nullptr;
+      buf.max_size = 0;
     }
   }
 }
@@ -59,28 +69,37 @@ void TensorRTEngine::FreezeNetwork() {
   infer_context_.reset(infer_engine_->createExecutionContext());
 
   // allocate GPU buffers.
-  buffers_.resize(buffer_sizes_.size(), nullptr);
+  buffers_.resize(buffer_sizes_.size());
   for (auto& item : buffer_sizes_) {
     if (item.second == 0) {
       auto slot_offset = infer_engine_->getBindingIndex(item.first.c_str());
+      auto dims = infer_engine_->getBindingDimensions(slot_offset);
       item.second = kDataTypeSize[static_cast<int>(
                         infer_engine_->getBindingDataType(slot_offset))] *
-                    AccumDims(infer_engine_->getBindingDimensions(slot_offset));
+                    analysis::AccuDims(dims.d, dims.nbDims);
     }
-    PADDLE_ENFORCE_EQ(0, cudaMalloc(&buffer(item.first), item.second));
+    auto& buf = buffer(item.first);
+    CHECK(buf.buffer == nullptr);  // buffer should be allocated only once.
+    PADDLE_ENFORCE_EQ(0, cudaMalloc(&buf.buffer, item.second));
+    VLOG(4) << "buffer malloc " << item.first << " " << item.second << " "
+            << buf.buffer;
+    buf.size = buf.max_size = item.second;
+    buf.device = DeviceType::GPU;
   }
 }
 
 nvinfer1::ITensor* TensorRTEngine::DeclareInput(const std::string& name,
                                                 nvinfer1::DataType dtype,
-                                                const nvinfer1::Dims& dim) {
+                                                const nvinfer1::Dims& dims) {
   PADDLE_ENFORCE_EQ(0, buffer_sizes_.count(name), "duplicate input name %s",
                     name);
 
   PADDLE_ENFORCE(infer_network_ != nullptr, "should initnetwork first");
-  auto* input = infer_network_->addInput(name.c_str(), dtype, dim);
+  auto* input = infer_network_->addInput(name.c_str(), dtype, dims);
   PADDLE_ENFORCE(input, "infer network add input %s failed", name);
-  buffer_sizes_[name] = kDataTypeSize[static_cast<int>(dtype)] * AccumDims(dim);
+  buffer_sizes_[name] = kDataTypeSize[static_cast<int>(dtype)] *
+                        analysis::AccuDims(dims.d, dims.nbDims);
+  PADDLE_ENFORCE(input->isNetworkInput());
   TensorRTEngine::SetITensor(name, input);
   return input;
 }
@@ -91,9 +110,12 @@ void TensorRTEngine::DeclareOutput(const nvinfer1::ILayer* layer, int offset,
                     name);
 
   auto* output = layer->getOutput(offset);
+  SetITensor(name, output);
   PADDLE_ENFORCE(output != nullptr);
   output->setName(name.c_str());
+  PADDLE_ENFORCE(!output->isNetworkInput());
   infer_network_->markOutput(*output);
+  PADDLE_ENFORCE(output->isNetworkOutput());
   // output buffers' size can only be decided latter, set zero here to mark this
   // and will reset latter.
   buffer_sizes_[name] = 0;
@@ -106,6 +128,7 @@ void TensorRTEngine::DeclareOutput(const std::string& name) {
   auto* output = TensorRTEngine::GetITensor(name);
   PADDLE_ENFORCE(output != nullptr);
   output->setName(name.c_str());
+  PADDLE_ENFORCE(!output->isNetworkInput());
   infer_network_->markOutput(*output);
   // output buffers' size can only be decided latter, set zero here to mark this
   // and will reset latter.
@@ -113,7 +136,21 @@ void TensorRTEngine::DeclareOutput(const std::string& name) {
 }
 
 void* TensorRTEngine::GetOutputInGPU(const std::string& name) {
-  return buffer(name);
+  return buffer(name).buffer;
+}
+
+void TensorRTEngine::GetOutputInGPU(const std::string& name, void* dst,
+                                    size_t max_size) {
+  // determine data size
+  auto it = buffer_sizes_.find(name);
+  PADDLE_ENFORCE(it != buffer_sizes_.end());
+  PADDLE_ENFORCE_GT(it->second, 0);
+  PADDLE_ENFORCE_GE(max_size, it->second);
+  auto& buf = buffer(name);
+  PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before");
+  PADDLE_ENFORCE_EQ(cudaMemcpyAsync(dst, buf.buffer, it->second,
+                                    cudaMemcpyDeviceToDevice, *stream_),
+                    0);
 }
 
 void TensorRTEngine::GetOutputInCPU(const std::string& name, void* dst,
@@ -123,11 +160,13 @@ void TensorRTEngine::GetOutputInCPU(const std::string& name, void* dst,
   PADDLE_ENFORCE(it != buffer_sizes_.end());
   PADDLE_ENFORCE_GT(it->second, 0);
   PADDLE_ENFORCE_GE(max_size, it->second);
-  PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(dst, buffer(name), it->second,
+  auto& buf = buffer(name);
+  PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before");
+  PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(dst, buf.buffer, it->second,
                                        cudaMemcpyDeviceToHost, *stream_));
 }
 
-void*& TensorRTEngine::buffer(const std::string& name) {
+Buffer& TensorRTEngine::buffer(const std::string& name) {
   PADDLE_ENFORCE(infer_engine_ != nullptr, "call FreezeNetwork first.");
   auto it = buffer_sizes_.find(name);
   PADDLE_ENFORCE(it != buffer_sizes_.end());
@@ -135,24 +174,36 @@ void*& TensorRTEngine::buffer(const std::string& name) {
   return buffers_[slot_offset];
 }
 
-void TensorRTEngine::SetInputFromCPU(const std::string& name, void* data,
+void TensorRTEngine::SetInputFromCPU(const std::string& name, const void* data,
+                                     size_t size) {
+  auto& buf = buffer(name);
+  PADDLE_ENFORCE_NOT_NULL(buf.buffer);
+  PADDLE_ENFORCE_LE(size, buf.max_size, "buffer is too small");
+  PADDLE_ENFORCE(buf.device == DeviceType::GPU);
+  PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(buf.buffer, data, size,
+                                       cudaMemcpyHostToDevice, *stream_));
+}
+
+void TensorRTEngine::SetInputFromGPU(const std::string& name, const void* data,
                                      size_t size) {
-  void* buf = buffer(name);
-  cudaMemcpyAsync(buf, data, size, cudaMemcpyHostToDevice, *stream_);
-  PADDLE_ENFORCE_EQ(
-      0, cudaMemcpyAsync(buf, data, size, cudaMemcpyHostToDevice, *stream_));
+  auto& buf = buffer(name);
+  PADDLE_ENFORCE_NOT_NULL(buf.buffer);
+  PADDLE_ENFORCE_LE(size, buf.max_size, "buffer is too small");
+  PADDLE_ENFORCE(buf.device == DeviceType::GPU);
+  PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(buf.buffer, data, size,
+                                       cudaMemcpyDeviceToDevice, *stream_));
 }
 
 void TensorRTEngine::SetITensor(const std::string& name,
                                 nvinfer1::ITensor* tensor) {
   PADDLE_ENFORCE(tensor != nullptr);
-  PADDLE_ENFORCE_EQ(0, itensor_map_.count(name), "duplicate itensor name %s",
+  PADDLE_ENFORCE_EQ(0, itensor_map_.count(name), "duplicate ITensor name %s",
                     name);
   itensor_map_[name] = tensor;
 }
 
 nvinfer1::ITensor* TensorRTEngine::GetITensor(const std::string& name) {
-  PADDLE_ENFORCE(itensor_map_.count(name), "no itensor %s", name);
+  PADDLE_ENFORCE(itensor_map_.count(name), "no ITensor %s", name);
   return itensor_map_[name];
 }
 
diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h
index ec919b943d3281dd675b15e2f14adb7b3487f46f..b60f00de9fa5fc8f8f4537379bf9ee9c8bb6f31c 100644
--- a/paddle/fluid/inference/tensorrt/engine.h
+++ b/paddle/fluid/inference/tensorrt/engine.h
@@ -21,6 +21,7 @@ limitations under the License. */
 #include <vector>
 #include "paddle/fluid/inference/engine.h"
 #include "paddle/fluid/inference/tensorrt/helper.h"
+#include "paddle/fluid/inference/utils/singleton.h"
 
 namespace paddle {
 namespace inference {
@@ -37,13 +38,15 @@ class TensorRTEngine : public EngineBase {
   // Weight is model parameter.
   class Weight {
    public:
-    Weight(nvinfer1::DataType dtype, void* value, int num_elem) {
+    Weight(nvinfer1::DataType dtype, void* value, size_t num_elem) {
       w_.type = dtype;
       w_.values = value;
       w_.count = num_elem;
     }
     const nvinfer1::Weights& get() { return w_; }
 
+    std::vector<int64_t> dims;
+
    private:
     nvinfer1::Weights w_;
   };
@@ -87,16 +90,20 @@ class TensorRTEngine : public EngineBase {
   // these memory directly for acceleration, for example, output the converted
   // data directly to the buffer to save data copy overhead.
   // NOTE this should be used after calling `FreezeNetwork`.
-  void*& buffer(const std::string& name);
+  Buffer& buffer(const std::string& name) override;
+
+  cudaStream_t* stream() { return stream_; }
 
   // Fill an input from CPU memory with name and size.
-  void SetInputFromCPU(const std::string& name, void* data, size_t size);
+  void SetInputFromCPU(const std::string& name, const void* data, size_t size);
   // TODO(Superjomn) is this method necessary given that buffer(xxx) can be
   // accessed directly. Fill an input from GPU memory with name and size.
-  void SetInputFromGPU(const std::string& name, void* data, size_t size);
+  void SetInputFromGPU(const std::string& name, const void* data, size_t size);
   // Get an output called name, the output of tensorrt is in GPU, so this method
-  // will just return the output's GPU memory address.
+  // Return the output's GPU memory address without copy.
   void* GetOutputInGPU(const std::string& name);
+  // Copy data into dst inside the GPU device.
+  void GetOutputInGPU(const std::string& name, void* dst, size_t max_size);
   // LOW EFFICENCY! Get output to CPU, this will trigger a memory copy from GPU
   // to CPU.
   void GetOutputInCPU(const std::string& name, void* dst, size_t max_size);
@@ -116,7 +123,7 @@ class TensorRTEngine : public EngineBase {
   cudaStream_t* stream_;
   nvinfer1::ILogger& logger_;
 
-  std::vector<void*> buffers_;
+  std::vector<Buffer> buffers_;
   // max data size for the buffers.
   std::unordered_map<std::string /*name*/, size_t /*max size*/> buffer_sizes_;
   std::unordered_map<std::string /*name*/, nvinfer1::ITensor* /*ITensor*/>
@@ -125,7 +132,11 @@ class TensorRTEngine : public EngineBase {
   // TensorRT related internal members
   template <typename T>
   struct Destroyer {
-    void operator()(T* x) { x->destroy(); }
+    void operator()(T* x) {
+      if (x) {
+        x->destroy();
+      }
+    }
   };
   template <typename T>
   using infer_ptr = std::unique_ptr<T, Destroyer<T>>;
@@ -149,6 +160,27 @@ class TensorRTEngine : public EngineBase {
 #define TRT_ENGINE_ADD_LAYER(engine__, layer__, ARGS...) \
   engine__->network()->add##layer__(ARGS);
 
+/*
+ * Helper to control the TensorRT engine's creation and deletion.
+ */
+class TRT_EngineManager {
+ public:
+  TensorRTEngine* Create(int max_batch, int max_workspace,
+                         cudaStream_t* stream) {
+    engines_.emplace_back(new TensorRTEngine(max_batch, max_workspace, stream));
+    return engines_.back().get();
+  }
+
+  void DeleteALl() {
+    for (auto& ptr : engines_) {
+      ptr.reset(nullptr);
+    }
+  }
+
+ private:
+  std::vector<std::unique_ptr<TensorRTEngine>> engines_;
+};
+
 }  // namespace tensorrt
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/helper.h b/paddle/fluid/inference/tensorrt/helper.h
index 2b402cce60762d774cd7b371e448b2b88794b6a8..b6e7968108403c9c9c192759c44eac040d1c5073 100644
--- a/paddle/fluid/inference/tensorrt/helper.h
+++ b/paddle/fluid/inference/tensorrt/helper.h
@@ -26,15 +26,6 @@ namespace tensorrt {
 
 namespace dy = paddle::platform::dynload;
 
-static size_t AccumDims(nvinfer1::Dims dims) {
-  size_t num = dims.nbDims == 0 ? 0 : 1;
-  for (int i = 0; i < dims.nbDims; i++) {
-    PADDLE_ENFORCE_GT(dims.d[i], 0);
-    num *= dims.d[i];
-  }
-  return num;
-}
-
 // TensorRT data type to size
 const int kDataTypeSize[] = {
     4,  // kFLOAT
diff --git a/paddle/fluid/inference/tensorrt/io_converter.cc b/paddle/fluid/inference/tensorrt/io_converter.cc
deleted file mode 100644
index 2baac96c26453af7e70e541d80b437df3d5c2657..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tensorrt/io_converter.cc
+++ /dev/null
@@ -1,57 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/inference/tensorrt/io_converter.h"
-#include <cuda.h>
-#include "paddle/fluid/platform/enforce.h"
-
-namespace paddle {
-namespace inference {
-namespace tensorrt {
-
-using platform::is_gpu_place;
-using platform::is_cpu_place;
-
-class DefaultInputConverter : public EngineInputConverter {
- public:
-  DefaultInputConverter() {}
-  // NOTE out is GPU memory.
-  virtual void operator()(const LoDTensor& in, void* out,
-                          size_t max_size) override {
-    PADDLE_ENFORCE(out != nullptr);
-    PADDLE_ENFORCE_LE(in.memory_size(), max_size);
-    const auto& place = in.place();
-    if (is_cpu_place(place)) {
-      PADDLE_ENFORCE(stream_ != nullptr);
-      PADDLE_ENFORCE_EQ(0,
-                        cudaMemcpyAsync(out, in.data<float>(), in.memory_size(),
-                                        cudaMemcpyHostToDevice, *stream_));
-
-    } else if (is_gpu_place(place)) {
-      PADDLE_ENFORCE_EQ(0,
-                        cudaMemcpyAsync(out, in.data<float>(), in.memory_size(),
-                                        cudaMemcpyHostToHost, *stream_));
-
-    } else {
-      PADDLE_THROW("Unknown device for converter");
-    }
-    cudaStreamSynchronize(*stream_);
-  }
-};
-
-REGISTER_TENSORRT_INPUT_CONVERTER(mul, DefaultInputConverter);
-
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/io_converter.h b/paddle/fluid/inference/tensorrt/io_converter.h
deleted file mode 100644
index 6ea61cbbac05f106f736b7d6a13912157c5ef48c..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tensorrt/io_converter.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <unordered_map>
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/inference/utils/singleton.h"
-
-namespace paddle {
-namespace inference {
-namespace tensorrt {
-
-using framework::LoDTensor;
-
-/*
- * Convert Input from Fluid to an Engine.
- * TensorRT's ITensor follows row major, NCHW. Fluid is also row major, so in
- * most cases just need to copy the data.
- */
-class EngineInputConverter {
- public:
-  EngineInputConverter() {}
-
-  virtual void operator()(const LoDTensor& in, void* out, size_t max_size) {}
-
-  void SetStream(cudaStream_t* stream) { stream_ = stream; }
-
-  static void Run(const std::string& in_op_type, const LoDTensor& in, void* out,
-                  size_t max_size, cudaStream_t* stream) {
-    PADDLE_ENFORCE(stream != nullptr);
-    auto* converter = Registry<EngineInputConverter>::Lookup(in_op_type);
-    PADDLE_ENFORCE_NOT_NULL(converter);
-    converter->SetStream(stream);
-    (*converter)(in, out, max_size);
-  }
-
-  virtual ~EngineInputConverter() {}
-
- protected:
-  cudaStream_t* stream_{nullptr};
-};
-
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
-
-#define REGISTER_TENSORRT_INPUT_CONVERTER(in_op_type__, Converter__) \
-  struct trt_input_##in_op_type__##_converter {                      \
-    trt_input_##in_op_type__##_converter() {                         \
-      ::paddle::inference::Registry<EngineInputConverter>::Register< \
-          Converter__>(#in_op_type__);                               \
-    }                                                                \
-  };                                                                 \
-  trt_input_##in_op_type__##_converter trt_input_##in_op_type__##_converter__;
diff --git a/paddle/fluid/inference/tensorrt/test_engine.cc b/paddle/fluid/inference/tensorrt/test_engine.cc
index a08b78f930d30d674247a713fadd3e42e5ada350..e635f0f87d577a1f1ac74687ee60f762be525418 100644
--- a/paddle/fluid/inference/tensorrt/test_engine.cc
+++ b/paddle/fluid/inference/tensorrt/test_engine.cc
@@ -77,6 +77,37 @@ TEST_F(TensorRTEngineTest, add_layer) {
   ASSERT_EQ(y_cpu, x_v * 2 + 3);
 }
 
+TEST_F(TensorRTEngineTest, add_layer_multi_dim) {
+  // Weight in CPU memory.
+  // It seems tensorrt FC use col-major: [[1.0, 3.3], [1.1, 4.4]]
+  // instead of row-major, which is [[1.0, 1.1], [3.3, 4.4]]
+  float raw_weight[4] = {1.0, 1.1, 3.3, 4.4};
+  float raw_bias[2] = {1.3, 2.4};
+
+  TensorRTEngine::Weight weight(nvinfer1::DataType::kFLOAT, raw_weight, 4);
+  TensorRTEngine::Weight bias(nvinfer1::DataType::kFLOAT, raw_bias, 2);
+  auto* x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT,
+                                  nvinfer1::DimsCHW{1, 2, 1});
+  auto* fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *x, 2,
+                                        weight.get(), bias.get());
+  PADDLE_ENFORCE(fc_layer != nullptr);
+
+  engine_->DeclareOutput(fc_layer, 0, "y");
+  engine_->FreezeNetwork();
+  ASSERT_EQ(engine_->engine()->getNbBindings(), 2);
+
+  float x_v[2] = {1.0, 2.0};
+  engine_->SetInputFromCPU("x", reinterpret_cast<void*>(&x_v),
+                           2 * sizeof(float));
+  engine_->Execute(1);
+
+  LOG(INFO) << "to get output";
+  float y_cpu[2] = {-1., -1.};
+  engine_->GetOutputInCPU("y", &y_cpu[0], sizeof(float) * 2);
+  ASSERT_EQ(y_cpu[0], 4.5);
+  ASSERT_EQ(y_cpu[1], 14.5);
+}
+
 }  // namespace tensorrt
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/test_io_converter.cc b/paddle/fluid/inference/tensorrt/test_io_converter.cc
deleted file mode 100644
index 365e9366862bee25c70dba0cdd92f318ab3ee90f..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/tensorrt/test_io_converter.cc
+++ /dev/null
@@ -1,53 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/inference/tensorrt/io_converter.h"
-
-#include <gtest/gtest.h>
-
-namespace paddle {
-namespace inference {
-namespace tensorrt {
-
-class EngineInputConverterTester : public ::testing::Test {
- public:
-  void SetUp() override { tensor.Resize({10, 10}); }
-
-  framework::LoDTensor tensor;
-};
-
-TEST_F(EngineInputConverterTester, DefaultCPU) {
-  void* buffer;
-  tensor.mutable_data<float>(platform::CPUPlace());
-  ASSERT_EQ(cudaMalloc(&buffer, tensor.memory_size()), 0);
-
-  cudaStream_t stream;
-  EngineInputConverter::Run("mul", tensor, buffer, tensor.memory_size(),
-                            &stream);
-}
-
-TEST_F(EngineInputConverterTester, DefaultGPU) {
-  void* buffer;
-  tensor.mutable_data<float>(platform::CUDAPlace());
-  ASSERT_EQ(cudaMalloc(&buffer, tensor.memory_size()), 0);
-
-  cudaStream_t stream;
-  EngineInputConverter::Run("mul", tensor, buffer, tensor.memory_size(),
-                            &stream);
-}
-
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/tests/book/CMakeLists.txt b/paddle/fluid/inference/tests/book/CMakeLists.txt
index cc179a86256e6b552c08a091402157bdcc86b383..2fa5a9540ba1311c7f87e6675a53044b23dd8276 100644
--- a/paddle/fluid/inference/tests/book/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/book/CMakeLists.txt
@@ -36,5 +36,13 @@ inference_test(label_semantic_roles)
 inference_test(recognize_digits ARGS mlp conv)
 inference_test(recommender_system)
 #inference_test(rnn_encoder_decoder)
-inference_test(understand_sentiment ARGS conv)
+#inference_test(understand_sentiment ARGS conv)
 inference_test(word2vec)
+
+# This is an unly work around to make this test run
+# TODO(TJ): clean me up
+cc_test(test_inference_nlp
+  SRCS test_inference_nlp.cc
+  DEPS paddle_fluid
+  ARGS
+  --model_path=${PADDLE_BINARY_DIR}/python/paddle/fluid/tests/book/recognize_digits_mlp.inference.model)
diff --git a/paddle/fluid/inference/tests/book/test_inference_image_classification.cc b/paddle/fluid/inference/tests/book/test_inference_image_classification.cc
index c4fd1e298b0daea85db2a407d04ad2d7bcdee0f0..987da18116cc6f4902bd66ae317f2470a8bc5057 100644
--- a/paddle/fluid/inference/tests/book/test_inference_image_classification.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_image_classification.cc
@@ -16,12 +16,12 @@ limitations under the License. */
 #include "gtest/gtest.h"
 #include "paddle/fluid/inference/tests/test_helper.h"
 
-DEFINE_string(data_set, "cifar10", "Data set to test");
 DEFINE_string(dirname, "", "Directory of the inference model.");
 DEFINE_string(fp16_dirname, "", "Directory of the float16 inference model.");
 DEFINE_int32(batch_size, 1, "Batch size of input data");
 DEFINE_int32(repeat, 1, "Running the inference program repeat times");
 DEFINE_bool(skip_cpu, false, "Skip the cpu test");
+DEFINE_bool(use_mkldnn, false, "Use MKLDNN to run inference");
 
 TEST(inference, image_classification) {
   if (FLAGS_dirname.empty() || FLAGS_batch_size < 1 || FLAGS_repeat < 1) {
@@ -35,19 +35,19 @@ TEST(inference, image_classification) {
   // 0. Call `paddle::framework::InitDevices()` initialize all the devices
   // In unittests, this is done in paddle/testing/paddle_gtest_main.cc
 
+  const bool is_combined = false;
+  std::vector<std::vector<int64_t>> feed_target_shapes =
+      GetFeedTargetShapes(dirname, is_combined);
+
   paddle::framework::LoDTensor input;
   // Use normilized image pixels as input data,
   // which should be in the range [0.0, 1.0].
-  if (FLAGS_data_set == "cifar10") {
-    SetupTensor<float>(&input, {FLAGS_batch_size, 3, 32, 32},
-                       static_cast<float>(0), static_cast<float>(1));
-  } else if (FLAGS_data_set == "imagenet") {
-    SetupTensor<float>(&input, {FLAGS_batch_size, 3, 224, 224},
-                       static_cast<float>(0), static_cast<float>(1));
-  } else {
-    LOG(FATAL) << "Only cifar10 or imagenet is supported.";
-  }
-
+  feed_target_shapes[0][0] = FLAGS_batch_size;
+  paddle::framework::DDim input_dims =
+      paddle::framework::make_ddim(feed_target_shapes[0]);
+  LOG(INFO) << input_dims;
+  SetupTensor<float>(&input, input_dims, static_cast<float>(0),
+                     static_cast<float>(1));
   std::vector<paddle::framework::LoDTensor*> cpu_feeds;
   cpu_feeds.push_back(&input);
 
@@ -59,8 +59,10 @@ TEST(inference, image_classification) {
     // Run inference on CPU
     LOG(INFO) << "--- CPU Runs: ---";
     LOG(INFO) << "Batch size is " << FLAGS_batch_size;
+    LOG(INFO) << "FLAGS_use_mkldnn: " << FLAGS_use_mkldnn;
     TestInference<paddle::platform::CPUPlace, false, true>(
-        dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat);
+        dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, is_combined,
+        FLAGS_use_mkldnn);
     LOG(INFO) << output1.dims();
   }
 
@@ -73,7 +75,7 @@ TEST(inference, image_classification) {
   LOG(INFO) << "--- GPU Runs: ---";
   LOG(INFO) << "Batch size is " << FLAGS_batch_size;
   TestInference<paddle::platform::CUDAPlace, false, true>(
-      dirname, cpu_feeds, cpu_fetchs2, FLAGS_repeat);
+      dirname, cpu_feeds, cpu_fetchs2, FLAGS_repeat, is_combined);
   LOG(INFO) << output2.dims();
 
   if (!FLAGS_skip_cpu) {
diff --git a/paddle/fluid/inference/tests/book/test_inference_nlp.cc b/paddle/fluid/inference/tests/book/test_inference_nlp.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a0e83a17058a4edcb8f23f23ce155e644ae0cf3b
--- /dev/null
+++ b/paddle/fluid/inference/tests/book/test_inference_nlp.cc
@@ -0,0 +1,231 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <sys/time.h>
+#include <time.h>
+#include <fstream>
+#include <thread>  // NOLINT
+#include "gflags/gflags.h"
+#include "gtest/gtest.h"
+#include "paddle/fluid/inference/tests/test_helper.h"
+#ifdef PADDLE_WITH_MKLML
+#include <mkl_service.h>
+#include <omp.h>
+#endif
+
+DEFINE_string(model_path, "", "Directory of the inference model.");
+DEFINE_string(data_file, "", "File of input index data.");
+DEFINE_int32(repeat, 100, "Running the inference program repeat times");
+DEFINE_bool(use_mkldnn, false, "Use MKLDNN to run inference");
+DEFINE_bool(prepare_vars, true, "Prepare variables before executor");
+DEFINE_int32(num_threads, 1, "Number of threads should be used");
+
+inline double GetCurrentMs() {
+  struct timeval time;
+  gettimeofday(&time, NULL);
+  return 1e+3 * time.tv_sec + 1e-3 * time.tv_usec;
+}
+
+// This function just give dummy data for recognize_digits model.
+size_t DummyData(std::vector<paddle::framework::LoDTensor>* out) {
+  paddle::framework::LoDTensor input;
+  SetupTensor<float>(&input, {1, 1, 28, 28}, -1.f, 1.f);
+  out->emplace_back(input);
+  return 1;
+}
+
+// Load the input word index data from file and save into LodTensor.
+// Return the size of words.
+size_t LoadData(std::vector<paddle::framework::LoDTensor>* out,
+                const std::string& filename) {
+  if (filename.empty()) {
+    return DummyData(out);
+  }
+
+  size_t sz = 0;
+  std::fstream fin(filename);
+  std::string line;
+  out->clear();
+  while (getline(fin, line)) {
+    std::istringstream iss(line);
+    std::vector<int64_t> ids;
+    std::string field;
+    while (getline(iss, field, ' ')) {
+      ids.push_back(stoi(field));
+    }
+    if (ids.size() >= 1024) {
+      // Synced with NLP guys, they will ignore input larger then 1024
+      continue;
+    }
+
+    paddle::framework::LoDTensor words;
+    paddle::framework::LoD lod{{0, ids.size()}};
+    words.set_lod(lod);
+    int64_t* pdata = words.mutable_data<int64_t>(
+        {static_cast<int64_t>(ids.size()), 1}, paddle::platform::CPUPlace());
+    memcpy(pdata, ids.data(), words.numel() * sizeof(int64_t));
+    out->emplace_back(words);
+    sz += ids.size();
+  }
+  return sz;
+}
+
+// Split input data samples into small pieces jobs as balanced as possible,
+// according to the number of threads.
+void SplitData(
+    const std::vector<paddle::framework::LoDTensor>& datasets,
+    std::vector<std::vector<const paddle::framework::LoDTensor*>>* jobs,
+    const int num_threads) {
+  size_t s = 0;
+  jobs->resize(num_threads);
+  while (s < datasets.size()) {
+    for (auto it = jobs->begin(); it != jobs->end(); it++) {
+      it->emplace_back(&datasets[s]);
+      s++;
+      if (s >= datasets.size()) {
+        break;
+      }
+    }
+  }
+}
+
+void ThreadRunInfer(
+    const int tid, paddle::framework::Scope* scope,
+    const std::vector<std::vector<const paddle::framework::LoDTensor*>>& jobs) {
+  // maybe framework:ProgramDesc is not thread-safe
+  auto& sub_scope = scope->NewScope();
+  auto place = paddle::platform::CPUPlace();
+  auto executor = paddle::framework::Executor(place);
+  auto inference_program =
+      paddle::inference::Load(&executor, scope, FLAGS_model_path);
+
+  auto ctx = executor.Prepare(*inference_program, /*block_id*/ 0);
+  executor.CreateVariables(*inference_program, &sub_scope, /*block_id*/ 0);
+
+  const std::vector<std::string>& feed_target_names =
+      inference_program->GetFeedTargetNames();
+  const std::vector<std::string>& fetch_target_names =
+      inference_program->GetFetchTargetNames();
+
+  PADDLE_ENFORCE_EQ(fetch_target_names.size(), 1UL);
+  std::map<std::string, paddle::framework::LoDTensor*> fetch_targets;
+  paddle::framework::LoDTensor outtensor;
+  fetch_targets[fetch_target_names[0]] = &outtensor;
+
+  std::map<std::string, const paddle::framework::LoDTensor*> feed_targets;
+  PADDLE_ENFORCE_EQ(feed_target_names.size(), 1UL);
+
+  auto& inputs = jobs[tid];
+  auto start_ms = GetCurrentMs();
+  for (size_t i = 0; i < inputs.size(); ++i) {
+    feed_targets[feed_target_names[0]] = inputs[i];
+    executor.RunPreparedContext(ctx.get(), &sub_scope, &feed_targets,
+                                &fetch_targets, false /*create_local_scope*/);
+  }
+  auto stop_ms = GetCurrentMs();
+  scope->DeleteScope(&sub_scope);
+  LOG(INFO) << "Tid: " << tid << ", process " << inputs.size()
+            << " samples, avg time per sample: "
+            << (stop_ms - start_ms) / inputs.size() << " ms";
+}
+
+TEST(inference, nlp) {
+  if (FLAGS_model_path.empty()) {
+    LOG(FATAL) << "Usage: ./example --model_path=path/to/your/model";
+  }
+  if (FLAGS_data_file.empty()) {
+    LOG(WARNING) << "No data file provided, will use dummy data!"
+                 << "Note: if you use nlp model, please provide data file.";
+  }
+  LOG(INFO) << "Model Path: " << FLAGS_model_path;
+  LOG(INFO) << "Data File: " << FLAGS_data_file;
+
+  std::vector<paddle::framework::LoDTensor> datasets;
+  size_t num_total_words = LoadData(&datasets, FLAGS_data_file);
+  LOG(INFO) << "Number of samples (seq_len<1024): " << datasets.size();
+  LOG(INFO) << "Total number of words: " << num_total_words;
+
+  // 0. Call `paddle::framework::InitDevices()` initialize all the devices
+  std::unique_ptr<paddle::framework::Scope> scope(
+      new paddle::framework::Scope());
+
+#ifdef PADDLE_WITH_MKLML
+  // only use 1 thread number per std::thread
+  omp_set_dynamic(0);
+  omp_set_num_threads(1);
+  mkl_set_num_threads(1);
+#endif
+
+  double start_ms = 0, stop_ms = 0;
+  if (FLAGS_num_threads > 1) {
+    std::vector<std::vector<const paddle::framework::LoDTensor*>> jobs;
+    SplitData(datasets, &jobs, FLAGS_num_threads);
+    std::vector<std::unique_ptr<std::thread>> threads;
+    start_ms = GetCurrentMs();
+    for (int i = 0; i < FLAGS_num_threads; ++i) {
+      threads.emplace_back(
+          new std::thread(ThreadRunInfer, i, scope.get(), std::ref(jobs)));
+    }
+    for (int i = 0; i < FLAGS_num_threads; ++i) {
+      threads[i]->join();
+    }
+    stop_ms = GetCurrentMs();
+  } else {
+    // 1. Define place, executor, scope
+    auto place = paddle::platform::CPUPlace();
+    auto executor = paddle::framework::Executor(place);
+
+    // 2. Initialize the inference_program and load parameters
+    std::unique_ptr<paddle::framework::ProgramDesc> inference_program;
+    inference_program = InitProgram(&executor, scope.get(), FLAGS_model_path,
+                                    /*model combined*/ false);
+    if (FLAGS_use_mkldnn) {
+      EnableMKLDNN(inference_program);
+    }
+    // always prepare context
+    std::unique_ptr<paddle::framework::ExecutorPrepareContext> ctx;
+    ctx = executor.Prepare(*inference_program, 0);
+    if (FLAGS_prepare_vars) {
+      executor.CreateVariables(*inference_program, scope.get(), 0);
+    }
+    // preapre fetch
+    const std::vector<std::string>& fetch_target_names =
+        inference_program->GetFetchTargetNames();
+    PADDLE_ENFORCE_EQ(fetch_target_names.size(), 1UL);
+    std::map<std::string, paddle::framework::LoDTensor*> fetch_targets;
+    paddle::framework::LoDTensor outtensor;
+    fetch_targets[fetch_target_names[0]] = &outtensor;
+
+    // prepare feed
+    const std::vector<std::string>& feed_target_names =
+        inference_program->GetFeedTargetNames();
+    PADDLE_ENFORCE_EQ(feed_target_names.size(), 1UL);
+    std::map<std::string, const paddle::framework::LoDTensor*> feed_targets;
+
+    // feed data and run
+    start_ms = GetCurrentMs();
+    for (size_t i = 0; i < datasets.size(); ++i) {
+      feed_targets[feed_target_names[0]] = &(datasets[i]);
+      executor.RunPreparedContext(ctx.get(), scope.get(), &feed_targets,
+                                  &fetch_targets, !FLAGS_prepare_vars);
+    }
+    stop_ms = GetCurrentMs();
+    LOG(INFO) << "Tid: 0, process " << datasets.size()
+              << " samples, avg time per sample: "
+              << (stop_ms - start_ms) / datasets.size() << " ms";
+  }
+  LOG(INFO) << "Total inference time with " << FLAGS_num_threads
+            << " threads : " << (stop_ms - start_ms) / 1000.0
+            << " sec, QPS: " << datasets.size() / ((stop_ms - start_ms) / 1000);
+}
diff --git a/paddle/fluid/inference/tests/test_helper.h b/paddle/fluid/inference/tests/test_helper.h
index af2a7a5620487a10c1df6152fc4e4bf67b150752..01b8dc0be662da22fe15a79cd9abfe5fa92c9577 100644
--- a/paddle/fluid/inference/tests/test_helper.h
+++ b/paddle/fluid/inference/tests/test_helper.h
@@ -89,11 +89,68 @@ void CheckError(const paddle::framework::LoDTensor& output1,
   EXPECT_EQ(count, 0U) << "There are " << count << " different elements.";
 }
 
+std::unique_ptr<paddle::framework::ProgramDesc> InitProgram(
+    paddle::framework::Executor* executor, paddle::framework::Scope* scope,
+    const std::string& dirname, const bool is_combined = false) {
+  std::unique_ptr<paddle::framework::ProgramDesc> inference_program;
+  if (is_combined) {
+    // All parameters are saved in a single file.
+    // Hard-coding the file names of program and parameters in unittest.
+    // The file names should be consistent with that used in Python API
+    //  `fluid.io.save_inference_model`.
+    std::string prog_filename = "__model_combined__";
+    std::string param_filename = "__params_combined__";
+    inference_program =
+        paddle::inference::Load(executor, scope, dirname + "/" + prog_filename,
+                                dirname + "/" + param_filename);
+  } else {
+    // Parameters are saved in separate files sited in the specified
+    // `dirname`.
+    inference_program = paddle::inference::Load(executor, scope, dirname);
+  }
+  return inference_program;
+}
+
+std::vector<std::vector<int64_t>> GetFeedTargetShapes(
+    const std::string& dirname, const bool is_combined = false) {
+  auto place = paddle::platform::CPUPlace();
+  auto executor = paddle::framework::Executor(place);
+  auto* scope = new paddle::framework::Scope();
+
+  auto inference_program = InitProgram(&executor, scope, dirname, is_combined);
+  auto& global_block = inference_program->Block(0);
+
+  const std::vector<std::string>& feed_target_names =
+      inference_program->GetFeedTargetNames();
+  std::vector<std::vector<int64_t>> feed_target_shapes;
+  for (size_t i = 0; i < feed_target_names.size(); ++i) {
+    auto* var = global_block.FindVar(feed_target_names[i]);
+    std::vector<int64_t> var_shape = var->GetShape();
+    feed_target_shapes.push_back(var_shape);
+  }
+
+  delete scope;
+  return feed_target_shapes;
+}
+
+void EnableMKLDNN(
+    const std::unique_ptr<paddle::framework::ProgramDesc>& program) {
+  for (size_t bid = 0; bid < program->Size(); ++bid) {
+    auto* block = program->MutableBlock(bid);
+    for (auto* op : block->AllOps()) {
+      if (op->HasAttr("use_mkldnn")) {
+        op->SetAttr("use_mkldnn", true);
+      }
+    }
+  }
+}
+
 template <typename Place, bool CreateVars = true, bool PrepareContext = false>
 void TestInference(const std::string& dirname,
                    const std::vector<paddle::framework::LoDTensor*>& cpu_feeds,
                    const std::vector<paddle::framework::LoDTensor*>& cpu_fetchs,
-                   const int repeat = 1, const bool is_combined = false) {
+                   const int repeat = 1, const bool is_combined = false,
+                   const bool use_mkldnn = false) {
   // 1. Define place, executor, scope
   auto place = Place();
   auto executor = paddle::framework::Executor(place);
@@ -105,7 +162,7 @@ void TestInference(const std::string& dirname,
     state = paddle::platform::ProfilerState::kCPU;
   } else {
 #ifdef PADDLE_WITH_CUDA
-    state = paddle::platform::ProfilerState::kCUDA;
+    state = paddle::platform::ProfilerState::kAll;
     // The default device_id of paddle::platform::CUDAPlace is 0.
     // Users can get the device_id using:
     //   int device_id = place.GetDeviceId();
@@ -124,26 +181,14 @@ void TestInference(const std::string& dirname,
     paddle::platform::RecordEvent record_event(
         "init_program",
         paddle::platform::DeviceContextPool::Instance().Get(place));
-
-    if (is_combined) {
-      // All parameters are saved in a single file.
-      // Hard-coding the file names of program and parameters in unittest.
-      // The file names should be consistent with that used in Python API
-      //  `fluid.io.save_inference_model`.
-      std::string prog_filename = "__model_combined__";
-      std::string param_filename = "__params_combined__";
-      inference_program = paddle::inference::Load(
-          &executor, scope, dirname + "/" + prog_filename,
-          dirname + "/" + param_filename);
-    } else {
-      // Parameters are saved in separate files sited in the specified
-      // `dirname`.
-      inference_program = paddle::inference::Load(&executor, scope, dirname);
+    inference_program = InitProgram(&executor, scope, dirname, is_combined);
+    if (use_mkldnn) {
+      EnableMKLDNN(inference_program);
     }
   }
   // Disable the profiler and print the timing information
   paddle::platform::DisableProfiler(paddle::platform::EventSortingKey::kDefault,
-                                    "load_program_profiler.txt");
+                                    "load_program_profiler");
   paddle::platform::ResetProfiler();
 
   // 3. Get the feed_target_names and fetch_target_names
@@ -179,10 +224,10 @@ void TestInference(const std::string& dirname,
     if (PrepareContext) {
       ctx = executor.Prepare(*inference_program, 0);
       executor.RunPreparedContext(ctx.get(), scope, &feed_targets,
-                                  &fetch_targets, CreateVars);
+                                  &fetch_targets, true, CreateVars);
     } else {
       executor.Run(*inference_program, scope, &feed_targets, &fetch_targets,
-                   CreateVars);
+                   true, CreateVars);
     }
 
     // Enable the profiler
@@ -207,8 +252,7 @@ void TestInference(const std::string& dirname,
 
     // Disable the profiler and print the timing information
     paddle::platform::DisableProfiler(
-        paddle::platform::EventSortingKey::kDefault,
-        "run_inference_profiler.txt");
+        paddle::platform::EventSortingKey::kDefault, "run_inference_profiler");
     paddle::platform::ResetProfiler();
   }
 
diff --git a/paddle/fluid/inference/utils/singleton.h b/paddle/fluid/inference/utils/singleton.h
index f05921067c45f156319375b613f51101cfda8e90..cfb89e704457a11a3cd6e89dba5efad5acae0bce 100644
--- a/paddle/fluid/inference/utils/singleton.h
+++ b/paddle/fluid/inference/utils/singleton.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 
+#include <string>
 #include <unordered_map>
 #include "paddle/fluid/platform/enforce.h"
 
@@ -49,9 +50,15 @@ struct Registry {
     items_[name] = new ItemChild;
   }
 
-  static ItemParent* Lookup(const std::string& name) {
+  static ItemParent* Lookup(const std::string& name,
+                            const std::string& default_name = "") {
     auto it = items_.find(name);
-    if (it == items_.end()) return nullptr;
+    if (it == items_.end()) {
+      if (default_name == "")
+        return nullptr;
+      else
+        return items_.find(default_name)->second;
+    }
     return it->second;
   }
 
diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index 256aded8ca234a24229e11f27b9e3e25728ad293..5e86b16ba1ff69c798372a144fb3bf699768f2e6 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -166,6 +166,8 @@ function(op_library TARGET)
       # NOTE(*): activation use macro to regist the kernels, set use_op manually.
       if(${TARGET} STREQUAL "activation")
         file(APPEND ${pybind_file} "USE_OP(relu);\n")
+      elseif(${TARGET} STREQUAL "fake_dequantize")
+        file(APPEND ${pybind_file} "USE_OP(fake_dequantize_max_abs);\n")
       else()
         file(APPEND ${pybind_file} "USE_OP(${TARGET});\n")
       endif()
@@ -184,6 +186,7 @@ endif()
 
 add_subdirectory(detail)
 if(WITH_DISTRIBUTE)
+    
     set(DISTRIBUTE_DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf)
     set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
     op_library(send_op DEPS ${DISTRIBUTE_DEPS})
@@ -197,17 +200,36 @@ if(WITH_DISTRIBUTE)
     op_library(send_vars_op DEPS ${DISTRIBUTE_DEPS})
     set_source_files_properties(send_vars_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
     op_library(send_barrier_op DEPS ${DISTRIBUTE_DEPS})
+    op_library(fetch_barrier_op DEPS ${DISTRIBUTE_DEPS})
     set_source_files_properties(send_barrier_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-    set_source_files_properties(send_recv_op_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-    cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS prefetch_op send_op listen_and_serv_op sum_op executor)
+    set_source_files_properties(fetch_barrier_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+    #set_source_files_properties(send_recv_op_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+    #cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS prefetch_op send_op
+    #        listen_and_serv_op sum_op executor SERIAL)
+    if(WITH_GPU)
+        set_source_files_properties(test_send_nccl_id.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+        cc_test(test_send_nccl_id SRCS test_send_nccl_id.cc DEPS send_op
+                listen_and_serv_op executor SERIAL)
+        op_library(gen_nccl_id_op DEPS nccl_common sendrecvop_grpc)
+        set_source_files_properties(gen_nccl_id_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+    else()
+        set(DEPS_OPS ${DEPS_OPS} gen_nccl_id_op)
+    endif()
 else()
-    set(DEPS_OPS ${DEPS_OPS} send_op prefetch_op recv_op listen_and_serv_op send_vars_op send_barrier_op)
+    set(DEPS_OPS ${DEPS_OPS} send_op prefetch_op recv_op listen_and_serv_op send_vars_op send_barrier_op fetch_barrier_op gen_nccl_id_op)
 endif()
 
 op_library(cross_entropy_op DEPS cross_entropy)
 op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax)
 op_library(softmax_op DEPS softmax)
 op_library(sequence_softmax_op DEPS softmax)
+if (WITH_GPU AND TENSORRT_FOUND)
+    op_library(tensorrt_engine_op DEPS tensorrt_engine)
+    nv_test(test_tensorrt_engine_op SRCS tensorrt_engine_op_test.cc
+      DEPS tensorrt_engine_op tensorrt_engine tensorrt_converter)
+else()
+    set(DEPS_OPS ${DEPS_OPS} tensorrt_engine_op)
+endif()
 op_library(sum_op DEPS selected_rows_functor)
 op_library(sgd_op DEPS selected_rows_functor)
 op_library(print_op DEPS lod_tensor)
@@ -268,6 +290,11 @@ foreach(src ${READER_LIBRARY})
     set(OP_LIBRARY ${src} ${OP_LIBRARY})
 endforeach()
 
+add_subdirectory(detection)
+foreach(src ${DETECTION_LIBRARY})
+    set(OP_LIBRARY ${src} ${OP_LIBRARY})
+endforeach()
+
 set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library")
 
 cc_test(gather_test SRCS gather_test.cc DEPS tensor)
diff --git a/paddle/fluid/operators/accuracy_op.cc b/paddle/fluid/operators/accuracy_op.cc
index ac10d759fecb56635d1303fd383a5f9ea18f0a4d..42fcace17926641b5caf677eb3c8ba5222e37190 100644
--- a/paddle/fluid/operators/accuracy_op.cc
+++ b/paddle/fluid/operators/accuracy_op.cc
@@ -63,8 +63,7 @@ class AccuracyOp : public framework::OperatorWithKernel {
 
 class AccuracyOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  AccuracyOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     // TODO(typhoonzero): support both inference value and indices.
     AddInput("Out", "The network output of topk (inferences)");
     AddInput("Indices", "The the network output of topk (indices)");
diff --git a/paddle/fluid/operators/activation_mkldnn_op.cc b/paddle/fluid/operators/activation_mkldnn_op.cc
index ab7c61227114fe7a0ce2ff2515dd560706058b64..46ed99bcf2234f7621d9f00eb48c846d8a355795 100644
--- a/paddle/fluid/operators/activation_mkldnn_op.cc
+++ b/paddle/fluid/operators/activation_mkldnn_op.cc
@@ -15,6 +15,7 @@
 #include "mkldnn.hpp"
 #include "paddle/fluid/operators/activation_op.h"
 #include "paddle/fluid/operators/mkldnn_activation_op.h"
+#include "paddle/fluid/platform/mkldnn_helper.h"
 
 namespace paddle {
 namespace operators {
@@ -23,6 +24,18 @@ using paddle::framework::Tensor;
 using paddle::platform::MKLDNNDeviceContext;
 
 namespace {
+std::string gethash(const mkldnn::memory::dims &operand_dims,
+                    const mkldnn::algorithm algorithm) {
+  auto dim2str = [](const mkldnn::memory::dims &operand_dims) {
+    std::string dstr = "";
+    for (size_t i = 0; i < operand_dims.size(); ++i) {
+      dstr += std::to_string(operand_dims[i]) + "-";
+    }
+    return dstr;
+  };
+  return dim2str(operand_dims) + std::to_string(algorithm);
+}
+
 template <typename T, typename ExecContext>
 void eltwise_forward(const ExecContext &ctx, mkldnn::algorithm algorithm,
                      const T alpha = 0, const T beta = 0) {
@@ -37,42 +50,70 @@ void eltwise_forward(const ExecContext &ctx, mkldnn::algorithm algorithm,
   const auto *src_data = src->template data<T>();
 
   auto *dst = ctx.template Output<Tensor>("Out");
-  const T *dst_data = dst->template mutable_data<T>(ctx.GetPlace());
+  T *dst_data = dst->template mutable_data<T>(ctx.GetPlace());
 
   // get memory dim
   PADDLE_ENFORCE(src->dims().size() == 2 || src->dims().size() == 4,
                  "Input dim must be with 2 or 4");
   std::vector<int> src_tz = framework::vectorize2int(src->dims());
 
-  // create memory description
-  auto data_md = src_tz.size() == 2
-                     ? platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32,
-                                               mkldnn::memory::format::nc)
-                     : platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32,
-                                               mkldnn::memory::format::nchw);
-
-  // create memory primitives
-  auto src_memory =
-      mkldnn::memory({data_md, mkldnn_engine},
-                     static_cast<void *>(const_cast<float *>(src_data)));
-  auto dst_memory =
-      mkldnn::memory({data_md, mkldnn_engine},
-                     static_cast<void *>(const_cast<float *>(dst_data)));
-
-  auto forward_desc = mkldnn::eltwise_forward::desc(
-      mkldnn::prop_kind::forward_training, algorithm, data_md, alpha, beta);
-
-  // save prim desc into global device context to be referred in backward path
-  const std::string key = ctx.op().Output("Out");
-  const std::string key_eltwise_pd = key + "@eltwise_pd";
-  auto forward_pd = std::make_shared<mkldnn::eltwise_forward::primitive_desc>(
-      forward_desc, mkldnn_engine);
-  dev_ctx.SetBlob(key_eltwise_pd, forward_pd);
-
-  auto eltwise = mkldnn::eltwise_forward(*forward_pd, src_memory, dst_memory);
+  const std::string key = gethash(src_tz, algorithm);
+  const std::string key_src_data =
+      key + ctx.op().Output("Out") + "@eltwise_fwd_src_data";
+  const std::string key_src_mem = key + "@eltwise_fwd_src_mem";
+  const std::string key_dst_mem = key + "@eltwise_fwd_dst_mem";
+  const std::string key_fwd = key + "@eltwise_fwd";
+
+  auto p_fwd = std::static_pointer_cast<mkldnn::eltwise_forward>(
+      dev_ctx.GetBlob(key_fwd));
+
+  // save input data to be referred in backward path
+  auto p_src_data = std::make_shared<const T *>(src_data);
+  dev_ctx.SetBlob(key_src_data, p_src_data);
+
+  if (p_fwd == nullptr) {
+    // create memory description
+    auto data_md = src_tz.size() == 2
+                       ? platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32,
+                                                 mkldnn::memory::format::nc)
+                       : platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32,
+                                                 mkldnn::memory::format::nchw);
+
+    // create memory primitives
+    auto p_src_mem = std::make_shared<mkldnn::memory>(mkldnn::memory(
+        {data_md, mkldnn_engine}, platform::to_void_cast(src_data)));
+    dev_ctx.SetBlob(key_src_mem, p_src_mem);
+
+    auto p_dst_mem = std::make_shared<mkldnn::memory>(mkldnn::memory(
+        {data_md, mkldnn_engine}, platform::to_void_cast(dst_data)));
+    dev_ctx.SetBlob(key_dst_mem, p_dst_mem);
+
+    auto fwd_desc = mkldnn::eltwise_forward::desc(
+        mkldnn::prop_kind::forward_training, algorithm, data_md, alpha, beta);
+    auto p_fwd_pd = std::make_shared<mkldnn::eltwise_forward::primitive_desc>(
+        fwd_desc, mkldnn_engine);
+    const std::string key_fwd_pd = key + "eltwise_fwd_pd";
+    dev_ctx.SetBlob(key_fwd_pd, p_fwd_pd);
+    p_fwd = std::make_shared<mkldnn::eltwise_forward>(
+        *p_fwd_pd, *(p_src_mem.get()), *(p_dst_mem.get()));
+    dev_ctx.SetBlob(key_fwd, p_fwd);
+  } else {
+    // primitives already exist
+    auto p_src_mem =
+        std::static_pointer_cast<mkldnn::memory>(dev_ctx.GetBlob(key_src_mem));
+    PADDLE_ENFORCE(p_src_mem != nullptr,
+                   "Fail to find eltwise p_src_mem in device context.");
+    auto p_dst_mem =
+        std::static_pointer_cast<mkldnn::memory>(dev_ctx.GetBlob(key_dst_mem));
+    PADDLE_ENFORCE(p_dst_mem != nullptr,
+                   "Fail to find eltwise p_src_mem in device context.");
+
+    p_src_mem->set_data_handle(platform::to_void_reinterpret_cast(src_data));
+    p_dst_mem->set_data_handle(dst_data);
+  }
 
   // push primitive to stream and wait until it's executed
-  std::vector<mkldnn::primitive> pipeline = {eltwise};
+  std::vector<mkldnn::primitive> pipeline = {*(p_fwd.get())};
   mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
 }
 
@@ -83,8 +124,7 @@ void eltwise_grad(const ExecContext &ctx, mkldnn::algorithm algorithm,
   const auto &mkldnn_engine = dev_ctx.GetEngine();
 
   // get buffers
-  const auto *x = ctx.template Input<Tensor>("X");
-  const auto *src = x->template data<T>();
+  const auto *out = ctx.template Input<Tensor>("Out");
 
   auto *dout = ctx.template Input<Tensor>(framework::GradVarName("Out"));
   const auto *diff_dst = dout->template data<T>();
@@ -94,45 +134,73 @@ void eltwise_grad(const ExecContext &ctx, mkldnn::algorithm algorithm,
   const T *diff_src = dx->template mutable_data<T>(ctx.GetPlace());
 
   // get memory dim
-  std::vector<int> src_tz = framework::vectorize2int(x->dims());
-
-  // create memory description
-  auto data_md = src_tz.size() == 2
-                     ? platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32,
-                                               mkldnn::memory::format::nc)
-                     : platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32,
-                                               mkldnn::memory::format::nchw);
-
-  // create memory primitives
-  auto src_memory = mkldnn::memory(
-      {data_md, mkldnn_engine}, static_cast<void *>(const_cast<float *>(src)));
-  auto diff_src_memory =
-      mkldnn::memory({data_md, mkldnn_engine},
-                     static_cast<void *>(const_cast<float *>(diff_src)));
-  auto diff_dst_memory =
-      mkldnn::memory({data_md, mkldnn_engine},
-                     static_cast<void *>(const_cast<float *>(diff_dst)));
-
-  auto backward_desc =
-      mkldnn::eltwise_backward::desc(algorithm, data_md, data_md, alpha, beta);
-
-  // retrieve eltwise primitive desc from device context
-  const std::string key = ctx.op().Input("Out");
-  const std::string key_eltwise_pd = key + "@eltwise_pd";
-  const std::shared_ptr<void> forward_pd = dev_ctx.GetBlob(key_eltwise_pd);
-  PADDLE_ENFORCE(forward_pd != nullptr,
-                 "Fail to find eltwise_pd in device context");
-  auto *p_forward_pd =
-      static_cast<mkldnn::eltwise_forward::primitive_desc *>(forward_pd.get());
-
-  auto eltwise_bwd_prim_desc = mkldnn::eltwise_backward::primitive_desc(
-      backward_desc, mkldnn_engine, *p_forward_pd);
-
-  auto eltwise_bwd = mkldnn::eltwise_backward(eltwise_bwd_prim_desc, src_memory,
-                                              diff_dst_memory, diff_src_memory);
+  std::vector<int> src_tz = framework::vectorize2int(out->dims());
+
+  const std::string key = gethash(src_tz, algorithm);
+  const std::string key_diff_src_mem = key + "@eltwise_diff_src_mem";
+  const std::string key_diff_dst_mem = key + "@eltwise_diff_dst_mem";
+  const std::string key_grad = key + "@eltwise_grad";
+
+  const std::string key_src_data =
+      key + ctx.op().Input("Out") + "@eltwise_fwd_src_data";
+  const auto p_src_data =
+      std::static_pointer_cast<T *>(dev_ctx.GetBlob(key_src_data));
+
+  const std::string key_src_mem = key + "@eltwise_fwd_src_mem";
+  auto p_src_mem =
+      std::static_pointer_cast<mkldnn::memory>(dev_ctx.GetBlob(key_src_mem));
+  p_src_mem->set_data_handle(*p_src_data.get());
+
+  auto p_grad = std::static_pointer_cast<mkldnn::eltwise_forward::primitive>(
+      dev_ctx.GetBlob(key_grad));
+
+  if (p_grad == nullptr) {
+    // create memory description
+    auto data_md = src_tz.size() == 2
+                       ? platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32,
+                                                 mkldnn::memory::format::nc)
+                       : platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32,
+                                                 mkldnn::memory::format::nchw);
+
+    // create memory primitives
+    std::shared_ptr<void> p_diff_src_mem =
+        std::make_shared<mkldnn::memory>(mkldnn::memory(
+            {data_md, mkldnn_engine}, platform::to_void_cast(diff_src)));
+    dev_ctx.SetBlob(key_diff_src_mem, p_diff_src_mem);
+    std::shared_ptr<void> p_diff_dst_mem =
+        std::make_shared<mkldnn::memory>(mkldnn::memory(
+            {data_md, mkldnn_engine}, platform::to_void_cast(diff_dst)));
+    dev_ctx.SetBlob(key_diff_dst_mem, p_diff_dst_mem);
+
+    auto bwd_desc = mkldnn::eltwise_backward::desc(algorithm, data_md, data_md,
+                                                   alpha, beta);
+
+    const std::string key_fwd_pd = key + "eltwise_fwd_pd";
+    auto *p_fwd_pd = static_cast<mkldnn::eltwise_forward::primitive_desc *>(
+        dev_ctx.GetBlob(key_fwd_pd).get());
+
+    auto eltwise_bwd_prim_desc = mkldnn::eltwise_backward::primitive_desc(
+        bwd_desc, mkldnn_engine, *p_fwd_pd);
+
+    p_grad = std::make_shared<mkldnn::eltwise_backward>(
+        eltwise_bwd_prim_desc, *static_cast<mkldnn::memory *>(p_src_mem.get()),
+        *(static_cast<mkldnn::memory *>(p_diff_dst_mem.get())),
+        *(static_cast<mkldnn::memory *>(p_diff_src_mem.get())));
+  } else {
+    // primitives already exist
+    auto p_diff_src_mem = std::static_pointer_cast<mkldnn::memory>(
+        dev_ctx.GetBlob(key_diff_src_mem));
+    auto p_diff_dst_mem = std::static_pointer_cast<mkldnn::memory>(
+        dev_ctx.GetBlob(key_diff_dst_mem));
+
+    p_diff_src_mem->set_data_handle(
+        platform::to_void_reinterpret_cast(diff_src));
+    p_diff_dst_mem->set_data_handle(
+        platform::to_void_reinterpret_cast(diff_dst));
+  }
 
   // push primitive to stream and wait until it's executed
-  std::vector<mkldnn::primitive> pipeline = {eltwise_bwd};
+  std::vector<mkldnn::primitive> pipeline = {*(p_grad.get())};
   mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
 }
 }  // anonymous namespace
@@ -154,35 +222,35 @@ struct MKLDNNActivationGradFunc : public BaseActivationFunctor<T> {
 };
 
 template <typename T>
-using ReluMkldnnFunctor =
+using ReluMKLDNNFunctor =
     MKLDNNActivationFunc<T, mkldnn::algorithm::eltwise_relu>;
 
 template <typename T>
-using TanhMkldnnFunctor =
+using TanhMKLDNNFunctor =
     MKLDNNActivationFunc<T, mkldnn::algorithm::eltwise_tanh>;
 
 template <typename T>
-using SqrtMkldnnFunctor =
+using SqrtMKLDNNFunctor =
     MKLDNNActivationFunc<T, mkldnn::algorithm::eltwise_sqrt>;
 
 template <typename T>
-using AbsMkldnnFunctor =
+using AbsMKLDNNFunctor =
     MKLDNNActivationFunc<T, mkldnn::algorithm::eltwise_abs>;
 
 template <typename T>
-using ReluMkldnnGradFunctor =
+using ReluMKLDNNGradFunctor =
     MKLDNNActivationGradFunc<T, mkldnn::algorithm::eltwise_relu>;
 
 template <typename T>
-using TanhMkldnnGradFunctor =
+using TanhMKLDNNGradFunctor =
     MKLDNNActivationGradFunc<T, mkldnn::algorithm::eltwise_tanh>;
 
 template <typename T>
-using SqrtMkldnnGradFunctor =
+using SqrtMKLDNNGradFunctor =
     MKLDNNActivationGradFunc<T, mkldnn::algorithm::eltwise_sqrt>;
 
 template <typename T>
-using AbsMkldnnGradFunctor =
+using AbsMKLDNNGradFunctor =
     MKLDNNActivationGradFunc<T, mkldnn::algorithm::eltwise_abs>;
 }  // namespace operators
 }  // namespace paddle
@@ -197,9 +265,9 @@ namespace ops = paddle::operators;
       ops::MKLDNNActivationGradKernel<ops::grad_functor<float>>);
 
 #define FOR_EACH_MKLDNN_KERNEL_FUNCTOR(__macro)            \
-  __macro(relu, ReluMkldnnFunctor, ReluMkldnnGradFunctor); \
-  __macro(tanh, TanhMkldnnFunctor, TanhMkldnnGradFunctor); \
-  __macro(sqrt, SqrtMkldnnFunctor, SqrtMkldnnGradFunctor); \
-  __macro(abs, AbsMkldnnFunctor, AbsMkldnnGradFunctor);
+  __macro(relu, ReluMKLDNNFunctor, ReluMKLDNNGradFunctor); \
+  __macro(tanh, TanhMKLDNNFunctor, TanhMKLDNNGradFunctor); \
+  __macro(sqrt, SqrtMKLDNNFunctor, SqrtMKLDNNGradFunctor); \
+  __macro(abs, AbsMKLDNNFunctor, AbsMKLDNNGradFunctor);
 
 FOR_EACH_MKLDNN_KERNEL_FUNCTOR(REGISTER_ACTIVATION_MKLDNN_KERNEL);
diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc
index 87ef55c50b0be46492a695928625d140345d415d..96e4c0e04cc30db6d0b86376434d5ea02694ae21 100644
--- a/paddle/fluid/operators/activation_op.cc
+++ b/paddle/fluid/operators/activation_op.cc
@@ -19,19 +19,18 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-#define REGISTER_ACTIVATION_OP_MAKER(OP_NAME, OP_COMMENT)                  \
-  class OP_NAME##OpMaker                                                   \
-      : public ::paddle::framework::OpProtoAndCheckerMaker {               \
-   public:                                                                 \
-    OP_NAME##OpMaker(OpProto *proto, OpAttrChecker *op_checker)            \
-        : ::paddle::framework::OpProtoAndCheckerMaker(proto, op_checker) { \
-      AddInput("X", "Input of " #OP_NAME "operator");                      \
-      AddOutput("Out", "Output of" #OP_NAME "operator");                   \
-      AddAttr<bool>("use_mkldnn",                                          \
-                    "(bool, default false) Only used in mkldnn kernel")    \
-          .SetDefault(false);                                              \
-      AddComment(#OP_COMMENT);                                             \
-    }                                                                      \
+#define REGISTER_ACTIVATION_OP_MAKER(OP_NAME, OP_COMMENT)               \
+  class OP_NAME##OpMaker                                                \
+      : public ::paddle::framework::OpProtoAndCheckerMaker {            \
+   public:                                                              \
+    void Make() override {                                              \
+      AddInput("X", "Input of " #OP_NAME " operator");                  \
+      AddOutput("Out", "Output of " #OP_NAME " operator");              \
+      AddAttr<bool>("use_mkldnn",                                       \
+                    "(bool, default false) Only used in mkldnn kernel") \
+          .SetDefault(false);                                           \
+      AddComment(OP_COMMENT);                                           \
+    }                                                                   \
   }
 
 #define REGISTER_ACTIVATION_OP_GRAD_MAKER(OP_NAME, KERNEL_TYPE)              \
@@ -42,7 +41,7 @@ namespace operators {
                                                                              \
    protected:                                                                \
     std::unique_ptr<::paddle::framework::OpDesc> Apply() const override {    \
-      auto *op = new ::paddle::framework::OpDesc();                          \
+      auto* op = new ::paddle::framework::OpDesc();                          \
       op->SetType(#KERNEL_TYPE "_grad");                                     \
       op->SetInput("Out", Output("Out"));                                    \
       op->SetInput(::paddle::framework::GradVarName("Out"),                  \
@@ -55,23 +54,52 @@ namespace operators {
     }                                                                        \
   }
 
+framework::OpKernelType GetKernelType(const framework::ExecutionContext& ctx,
+                                      const framework::OperatorWithKernel& oper,
+                                      const std::string& name) {
+  framework::LibraryType library{framework::LibraryType::kPlain};
+
+  framework::DataLayout layout = framework::DataLayout::kAnyLayout;
+#ifdef PADDLE_WITH_MKLDNN
+  auto it = oper.Attrs().find("use_mkldnn");
+  if (library == framework::LibraryType::kPlain && it != oper.Attrs().end() &&
+      platform::CanMKLDNNBeUsed(ctx)) {
+    library = framework::LibraryType::kMKLDNN;
+    layout = framework::DataLayout::kMKLDNN;
+  }
+#endif
+  return framework::OpKernelType(
+      framework::ToDataType(ctx.Input<framework::Tensor>(name)->type()),
+      ctx.GetPlace(), layout, library);
+}
+
 class ActivationOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
-  void InferShape(framework::InferShapeContext *ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
     ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
     ctx->ShareLoD("X", /*->*/ "Out");
   }
+
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return GetKernelType(ctx, *this, "X");
+  }
 };
 
 class ActivationOpGrad : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
-  void InferShape(framework::InferShapeContext *ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
     ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("Out"));
   }
+
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return GetKernelType(ctx, *this, "Out");
+  }
 };
 
 __attribute__((unused)) constexpr char SigmoidDoc[] = R"DOC(
@@ -204,8 +232,7 @@ $$out = \frac{x}{1 + |x|}$$
 
 class LeakyReluOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  LeakyReluOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X", "Input of LeakyRelu operator");
     AddOutput("Out", "Output of LeakyRelu operator");
     AddAttr<float>("alpha", "The small negative slope").SetDefault(0.02f);
@@ -220,8 +247,7 @@ $out = \max(x, \alpha * x)$
 
 class SoftShrinkOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  SoftShrinkOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X", "Input of Softshrink operator");
     AddOutput("Out", "Output of Softshrink operator");
     AddAttr<float>("lambda", "non-negative offset").SetDefault(0.5f);
@@ -242,8 +268,7 @@ $$
 
 class HardShrinkOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  HardShrinkOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X", "Input of HardShrink operator");
     AddOutput("Out", "Output of HardShrink operator");
     AddAttr<float>("threshold", "The value of threshold for HardShrink")
@@ -265,8 +290,7 @@ $$
 
 class BReluOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  BReluOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X", "Input of BRelu operator");
     AddOutput("Out", "Output of BRelu operator");
     AddAttr<float>("t_min", "The min marginal value of BRelu")
@@ -284,8 +308,7 @@ $out = \max(\min(x, t_{min}), t_{max})$
 
 class SoftReluOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  SoftReluOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X", "Input of SoftRelu operator");
     AddOutput("Out", "Output of SoftRelu operator");
     AddAttr<float>("threshold", "The threshold value of SoftRelu")
@@ -301,8 +324,7 @@ $out = \ln(1 + \exp(\max(\min(x, threshold), threshold))$
 
 class ELUOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  ELUOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X", "Input of ELU operator");
     AddOutput("Out", "Output of ELU operator");
     AddAttr<float>("alpha", "The alpha value of ELU").SetDefault(1.0f);
@@ -320,8 +342,7 @@ $out = \max(0, x) + \min(0, \alpha * (e^x - 1))$
 
 class Relu6OpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  Relu6OpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X", "Input of Relu6 operator");
     AddOutput("Out", "Output of Relu6 operator");
     AddAttr<float>("threshold", "The threshold value of Relu6")
@@ -337,8 +358,7 @@ $out = \min(\max(0, x), 6)$
 
 class PowOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  PowOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X", "Input of Pow operator");
     AddOutput("Out", "Output of Pow operator");
     AddAttr<float>("factor", "The exponential factor of Pow").SetDefault(1.0f);
@@ -353,8 +373,7 @@ $out = x^{factor}$
 
 class STanhOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  STanhOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X", "Input of STanh operator");
     AddOutput("Out", "Output of STanh operator");
     AddAttr<float>("scale_a", "The scale parameter of a for the input")
@@ -372,8 +391,7 @@ $$out = b * \frac{e^{a * x} - e^{-a * x}}{e^{a * x} + e^{-a * x}}$$
 
 class ThresholdedReluOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  ThresholdedReluOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X", "Input of ThresholdedRelu operator");
     AddOutput("Out", "Output of ThresholdedRelu operator");
     AddAttr<float>("threshold", "The threshold location of activation")
@@ -394,8 +412,7 @@ $$
 
 class HardSigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  HardSigmoidOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X", "Input of HardSigmoid operator");
     AddOutput("Out", "Output of HardSigmoid operator");
     AddAttr<float>("slope", "Slope for linear approximation of sigmoid")
@@ -420,8 +437,7 @@ It is recommended to use the defaults for this activation.
 
 class SwishOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  SwishOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X", "Input of Swish operator");
     AddOutput("Out", "Output of Swish operator");
     AddAttr<float>("beta", "Constant beta of swish operator").SetDefault(1.0f);
diff --git a/paddle/fluid/operators/adadelta_op.cc b/paddle/fluid/operators/adadelta_op.cc
index 7bdb3f274aa9bacb6b261e0d0cd00b72f1d409ae..d1970515f58969948b1d2db5847e4344112f77f9 100644
--- a/paddle/fluid/operators/adadelta_op.cc
+++ b/paddle/fluid/operators/adadelta_op.cc
@@ -66,8 +66,7 @@ class AdadeltaOp : public framework::OperatorWithKernel {
 
 class AdadeltaOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  AdadeltaOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("Param", "(Tensor) Input parameter");
     AddInput("Grad", "(Tensor) Input gradient");
     AddInput("AvgSquaredGrad", "(Tensor) Input average of squared gradient");
diff --git a/paddle/fluid/operators/adagrad_op.cc b/paddle/fluid/operators/adagrad_op.cc
index 1227129429addb0ed412c7f1755fd39c9ca77157..a3ef9ad9f91f1f626bd33876693ecc17ad76b96b 100644
--- a/paddle/fluid/operators/adagrad_op.cc
+++ b/paddle/fluid/operators/adagrad_op.cc
@@ -67,8 +67,7 @@ class AdagradOp : public framework::OperatorWithKernel {
 
 class AdagradOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  AdagradOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("Param", "(Tensor) Input parameter");
     AddInput("Grad", "(Tensor) Input gradient");
     AddInput("Moment", "(Tensor) Second moment");
diff --git a/paddle/fluid/operators/adam_op.cc b/paddle/fluid/operators/adam_op.cc
index f12f0c6663d1785b8af852244ffe32358fb1b693..99b0239855d6241b064a5883c2be3d58078b3b61 100644
--- a/paddle/fluid/operators/adam_op.cc
+++ b/paddle/fluid/operators/adam_op.cc
@@ -80,8 +80,7 @@ class AdamOp : public framework::OperatorWithKernel {
 
 class AdamOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  AdamOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("Param", "(Tensor) Input parameter");
     AddInput("Grad", "(Tensor) Input gradient");
     AddInput("LearningRate", "(Tensor) Learning rate");
diff --git a/paddle/fluid/operators/adamax_op.cc b/paddle/fluid/operators/adamax_op.cc
index 608b855d58a2f701fbb8631cb5f24768a61f3deb..32062574bcf71ff96e451eaa6865b6bbfc3b1c80 100644
--- a/paddle/fluid/operators/adamax_op.cc
+++ b/paddle/fluid/operators/adamax_op.cc
@@ -74,8 +74,7 @@ class AdamaxOp : public framework::OperatorWithKernel {
 
 class AdamaxOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  AdamaxOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("Param", "(Tensor) Input parameter");
     AddInput("Grad", "(Tensor) Input gradient");
     AddInput("LearningRate", "(Tensor) Learning rate");
diff --git a/paddle/fluid/operators/array_to_lod_tensor_op.cc b/paddle/fluid/operators/array_to_lod_tensor_op.cc
index 5db2e4540ef170079328f24ac8d30f7b1901fa1e..149226e92d4d08a25c211bce686ff03c5d7ddf40 100644
--- a/paddle/fluid/operators/array_to_lod_tensor_op.cc
+++ b/paddle/fluid/operators/array_to_lod_tensor_op.cc
@@ -123,8 +123,7 @@ class ArrayToLoDTensorOp : public framework::OperatorBase {
 
 class ArrayToLoDTensorOpProtoMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  ArrayToLoDTensorOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X",
              "(std::vector<LodTensor>) A vector of tensors that is going to "
              "be casted to a big LoDTensor.");
diff --git a/paddle/fluid/operators/assign_op.cc b/paddle/fluid/operators/assign_op.cc
index d372213e1b6008b0c4227103dd40730f86a84301..d9294048a9e89662958fd5c6af4fcbe5da3814c2 100644
--- a/paddle/fluid/operators/assign_op.cc
+++ b/paddle/fluid/operators/assign_op.cc
@@ -94,8 +94,7 @@ class AssignOp : public framework::OperatorBase {
 
 class AssignOpProtoMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  AssignOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X",
              "(LoDTensor, SelectedRows or LoDTensorArray) The input variable "
              "could be LoDTensor, SelectedRows or LoDTensorArray.")
diff --git a/paddle/fluid/operators/assign_value_op.cc b/paddle/fluid/operators/assign_value_op.cc
index 993610fdedde4bafd99f59a0adeeeef4526eb089..4ad6f3443db33fd14b67091d14fd877b951730ff 100644
--- a/paddle/fluid/operators/assign_value_op.cc
+++ b/paddle/fluid/operators/assign_value_op.cc
@@ -45,8 +45,7 @@ class AssignValueOp : public framework::OperatorWithKernel {
 
 class AssignValueOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  AssignValueOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddOutput("Out", "(Tensor) Output tensor of assign_value operator.");
     AddAttr<std::vector<int>>("shape",
                               "(vector<int>) "
diff --git a/paddle/fluid/operators/auc_op.cc b/paddle/fluid/operators/auc_op.cc
index a168eaeab56128b75bbe97d7ccf843a081b5dced..c9871a9fe6b3b0d0cf671c2d155715f92c94fd8f 100644
--- a/paddle/fluid/operators/auc_op.cc
+++ b/paddle/fluid/operators/auc_op.cc
@@ -50,8 +50,7 @@ class AucOp : public framework::OperatorWithKernel {
 
 class AucOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  AucOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("Out",
              "A floating point 2D tensor, values are in the range [0, 1]."
              "Each row is sorted in descending order. This input should be the"
diff --git a/paddle/fluid/operators/average_accumulates_op.cc b/paddle/fluid/operators/average_accumulates_op.cc
index b21deaf9258567c05a8816b14ac7d6462964e8ba..25864e95d7e290c7f684501893e99c828c511979 100644
--- a/paddle/fluid/operators/average_accumulates_op.cc
+++ b/paddle/fluid/operators/average_accumulates_op.cc
@@ -111,8 +111,7 @@ class AverageAccumulatesOp : public framework::OperatorWithKernel {
 
 class AverageAccumulatesOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  AverageAccumulatesOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("param", "(Tensor), The parameter to be accumulated.");
     AddInput("in_sum_1",
              "(Tensor), A tensor used to store the parameter "
diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc
index b4bd40d0311bf10ec1fddabab2ee131fe02baf52..d7e0af28c1bfa6a9073b25b0a301234cc5d194f5 100644
--- a/paddle/fluid/operators/batch_norm_op.cc
+++ b/paddle/fluid/operators/batch_norm_op.cc
@@ -111,14 +111,16 @@ class BatchNormOp : public framework::OperatorWithKernel {
                       "Variance input should be of float type");
 
     framework::LibraryType library_{framework::LibraryType::kPlain};
+    // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
+    framework::DataLayout layout = framework::DataLayout::kAnyLayout;
+
 #ifdef PADDLE_WITH_MKLDNN
     if (library_ == framework::LibraryType::kPlain &&
         platform::CanMKLDNNBeUsed(ctx)) {
       library_ = framework::LibraryType::kMKLDNN;
+      layout = framework::DataLayout::kMKLDNN;
     }
 #endif
-    // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
-    framework::DataLayout layout = framework::DataLayout::kAnyLayout;
     return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout,
                                    library_);
   }
@@ -126,8 +128,7 @@ class BatchNormOp : public framework::OperatorWithKernel {
 
 class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  BatchNormOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddAttr<bool>("is_test", "").SetDefault(false);
     AddAttr<float>("momentum", "").SetDefault(0.9);
     AddAttr<float>("epsilon", "")
@@ -368,17 +369,18 @@ class BatchNormGradOp : public framework::OperatorWithKernel {
     }
 
     framework::LibraryType library_{framework::LibraryType::kPlain};
+    // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
+    framework::DataLayout layout_ = framework::DataLayout::kAnyLayout;
 #ifdef PADDLE_WITH_MKLDNN
     if (library_ == framework::LibraryType::kPlain &&
         platform::CanMKLDNNBeUsed(ctx)) {
       library_ = framework::LibraryType::kMKLDNN;
+      layout_ = framework::DataLayout::kMKLDNN;
     }
 #endif
-    // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
-    framework::DataLayout layout = framework::DataLayout::kAnyLayout;
     return framework::OpKernelType(
         framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace(),
-        layout, library_);
+        layout_, library_);
   }
 };
 
diff --git a/paddle/fluid/operators/batch_size_like.h b/paddle/fluid/operators/batch_size_like.h
index dd51a11fbe6ad5e528197b67536518c4b31fa355..483c9f8c2191fa4eb98b91112f9d6753e2fbddc3 100644
--- a/paddle/fluid/operators/batch_size_like.h
+++ b/paddle/fluid/operators/batch_size_like.h
@@ -53,8 +53,7 @@ class BatchSizeLikeOp : public framework::OperatorWithKernel {
 
 class BatchSizeLikeOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  BatchSizeLikeOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() final {
     AddInput("Input",
              "(Tensor) Tensor "
              "whose input_dim_idx'th dimension specifies the batch_size");
@@ -68,7 +67,11 @@ class BatchSizeLikeOpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<int>("output_dim_idx",
                  "(int, default 0) The index of output's batch size dimension")
         .SetDefault(0);
+    Apply();
   }
+
+ protected:
+  virtual void Apply() = 0;
 };
 
 }  // namespace operators
diff --git a/paddle/fluid/operators/beam_search_decode_op.cc b/paddle/fluid/operators/beam_search_decode_op.cc
index 68fb988afd8af4e9ac3acb4506c1c31fcf85e5a3..c3dd22119ddab8ecf9213ee274e4cbd4f05e78fd 100644
--- a/paddle/fluid/operators/beam_search_decode_op.cc
+++ b/paddle/fluid/operators/beam_search_decode_op.cc
@@ -134,8 +134,7 @@ class BeamSearchDecodeOp : public framework::OperatorBase {
 
 class BeamSearchDecodeOpProtoMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  BeamSearchDecodeOpProtoMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("Ids",
              "(LodTensorArray)"
              "score of the candidate words in each step");
diff --git a/paddle/fluid/operators/beam_search_op.cc b/paddle/fluid/operators/beam_search_op.cc
index cff097cca13f3b92c7efe4b69259fdf7c75b3760..df0b50881f4e3ec6f57bdb2b63033931059c486e 100644
--- a/paddle/fluid/operators/beam_search_op.cc
+++ b/paddle/fluid/operators/beam_search_op.cc
@@ -197,8 +197,7 @@ std::string ItemToString(const BeamSearch::Item &item) {
 
 class BeamSearchOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  BeamSearchOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     // inputs and outputs stored in proto
     AddInput("pre_ids", "ids in previous step");
     AddInput("ids", "a LoDTensor of shape of [None,k]");
diff --git a/paddle/fluid/operators/beam_search_op.h b/paddle/fluid/operators/beam_search_op.h
index 9b51db8a45186c2a90cf8b2eb7966d0aaea04028..46bc4f6f936929050276e8b3b93f1eddd62ac638 100644
--- a/paddle/fluid/operators/beam_search_op.h
+++ b/paddle/fluid/operators/beam_search_op.h
@@ -14,10 +14,6 @@ limitations under the License. */
 
 #pragma once
 
-#ifdef PADDLE_WITH_TESTING
-#include "gtest/gtest.h"
-#endif
-
 #include <string>
 #include <vector>
 #include "paddle/fluid/framework/lod_tensor.h"
diff --git a/paddle/fluid/operators/bilinear_interp_op.cc b/paddle/fluid/operators/bilinear_interp_op.cc
index 69f79bf93be8ac7df9cab43b84cf755f2f3dfeaa..3321adf2743c28f6eeca8b5cc91ef89beed6b97c 100644
--- a/paddle/fluid/operators/bilinear_interp_op.cc
+++ b/paddle/fluid/operators/bilinear_interp_op.cc
@@ -34,18 +34,34 @@ class BilinearInterpOp : public framework::OperatorWithKernel {
     int out_w = ctx->Attrs().Get<int>("out_w");
     PADDLE_ENFORCE_EQ(dim_x.size(), 4, "X's dimension must be 4");
 
+    if (ctx->HasInput("OutSize")) {
+      auto out_size_dim = ctx->GetInputDim("OutSize");
+      PADDLE_ENFORCE_EQ(out_size_dim.size(), 1,
+                        "OutSize's dimension size must be 1");
+      PADDLE_ENFORCE_EQ(out_size_dim[0], 2, "OutSize's dim[0] must be 2");
+    }
     std::vector<int64_t> dim_out({dim_x[0], dim_x[1], out_h, out_w});
     ctx->SetOutputDim("Out", framework::make_ddim(dim_out));
   }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace());
+  }
 };
 
 class BilinearInterpOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  BilinearInterpOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X",
              "(Tensor) The input tensor of bilinear interpolation, "
              "This is a 4-D tensor with shape of (N x C x h x w)");
+    AddInput("OutSize",
+             "(Tensor) This is a 1-D tensor with two number. "
+             "The first number is height and the second number is width.")
+        .AsDispensable();
     AddOutput("Out",
               "(Tensor) The dimension of output is (N x C x out_h x out_w]");
 
@@ -79,6 +95,12 @@ class BilinearInterpOpGrad : public framework::OperatorWithKernel {
       ctx->SetOutputDim(framework::GradVarName("X"), dim_x);
     }
   }
+
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace());
+  }
 };
 
 }  // namespace operators
diff --git a/paddle/fluid/operators/bilinear_interp_op.cu b/paddle/fluid/operators/bilinear_interp_op.cu
index 510190f1aaf02960284216a1bedd409011088499..4c1971538495c6f111e9db18f4014786f6f0dd58 100644
--- a/paddle/fluid/operators/bilinear_interp_op.cu
+++ b/paddle/fluid/operators/bilinear_interp_op.cu
@@ -102,10 +102,21 @@ class BilinearInterpOpCUDAKernel : public framework::OpKernel<T> {
     auto* input_t = ctx.Input<Tensor>("X");      // float tensor
     auto* output_t = ctx.Output<Tensor>("Out");  // float tensor
     auto* input = input_t->data<T>();
-    auto* output = output_t->mutable_data<T>(ctx.GetPlace());
 
     int out_h = ctx.Attr<int>("out_h");
     int out_w = ctx.Attr<int>("out_w");
+    auto out_dims = output_t->dims();
+    auto out_size_t = ctx.Input<Tensor>("OutSize");
+    if (out_size_t != nullptr) {
+      Tensor sizes;
+      framework::TensorCopy(*out_size_t, platform::CPUPlace(), &sizes);
+      auto size_data = sizes.data<int>();
+      out_h = size_data[0];
+      out_w = size_data[1];
+    }
+    auto* output = output_t->mutable_data<T>(
+        {out_dims[0], out_dims[1], out_h, out_w}, ctx.GetPlace());
+
     int batch_size = input_t->dims()[0];
     int channels = input_t->dims()[1];
     int in_h = input_t->dims()[2];
@@ -139,8 +150,8 @@ class BilinearInterpGradOpCUDAKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext& ctx) const override {
     auto* d_input_t = ctx.Output<Tensor>(framework::GradVarName("X"));
     auto* d_output_t = ctx.Input<Tensor>(framework::GradVarName("Out"));
-    auto* d_input = d_input_t->mutable_data<T>(ctx.GetPlace());
     auto* d_output = d_output_t->data<T>();
+    auto* d_input = d_input_t->mutable_data<T>(ctx.GetPlace());
 
     auto& device_ctx =
         ctx.template device_context<platform::CUDADeviceContext>();
@@ -149,6 +160,16 @@ class BilinearInterpGradOpCUDAKernel : public framework::OpKernel<T> {
 
     int out_h = ctx.Attr<int>("out_h");
     int out_w = ctx.Attr<int>("out_w");
+
+    auto out_size_t = ctx.Input<Tensor>("OutSize");
+    if (out_size_t != nullptr) {
+      Tensor sizes;
+      framework::TensorCopy(*out_size_t, platform::CPUPlace(), &sizes);
+      auto size_data = sizes.data<int>();
+      out_h = size_data[0];
+      out_w = size_data[1];
+    }
+
     int batch_size = d_input_t->dims()[0];
     int channels = d_input_t->dims()[1];
     int in_h = d_input_t->dims()[2];
diff --git a/paddle/fluid/operators/bilinear_interp_op.h b/paddle/fluid/operators/bilinear_interp_op.h
index f6cd77e4d49b53ecde6a84908cdffc7e1e02ac6a..8b03cd5a0635584a45782fe5a4823c37fe4fa8e8 100644
--- a/paddle/fluid/operators/bilinear_interp_op.h
+++ b/paddle/fluid/operators/bilinear_interp_op.h
@@ -24,11 +24,18 @@ class BilinearInterpKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext& ctx) const override {
     auto* input_t = ctx.Input<Tensor>("X");      // float tensor
     auto* output_t = ctx.Output<Tensor>("Out");  // float tensor
+    auto out_dims = output_t->dims();
     auto* input = input_t->data<T>();
-    auto* output = output_t->mutable_data<T>(ctx.GetPlace());
-
     int out_h = ctx.Attr<int>("out_h");
     int out_w = ctx.Attr<int>("out_w");
+    auto out_size_t = ctx.Input<Tensor>("OutSize");
+    if (out_size_t != nullptr) {
+      auto out_size_data = out_size_t->data<int>();
+      out_h = out_size_data[0];
+      out_w = out_size_data[1];
+    }
+    auto* output = output_t->mutable_data<T>(
+        {out_dims[0], out_dims[1], out_h, out_w}, ctx.GetPlace());
     int batch_size = input_t->dims()[0];
     int channels = input_t->dims()[1];
     int in_h = input_t->dims()[2];
@@ -83,9 +90,8 @@ class BilinearInterpGradKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext& ctx) const override {
     auto* d_input_t = ctx.Output<Tensor>(framework::GradVarName("X"));
     auto* d_output_t = ctx.Input<Tensor>(framework::GradVarName("Out"));
-    auto* d_input = d_input_t->mutable_data<T>(ctx.GetPlace());
     auto* d_output = d_output_t->data<T>();
-
+    auto* d_input = d_input_t->mutable_data<T>(ctx.GetPlace());
     auto& device_ctx =
         ctx.template device_context<platform::CPUDeviceContext>();
     math::SetConstant<platform::CPUDeviceContext, T> zero;
@@ -93,6 +99,14 @@ class BilinearInterpGradKernel : public framework::OpKernel<T> {
 
     int out_h = ctx.Attr<int>("out_h");
     int out_w = ctx.Attr<int>("out_w");
+
+    auto out_size_t = ctx.Input<Tensor>("OutSize");
+    if (out_size_t != nullptr) {
+      auto out_size_data = out_size_t->data<int>();
+      out_h = out_size_data[0];
+      out_w = out_size_data[1];
+    }
+
     int batch_size = d_input_t->dims()[0];
     int channels = d_input_t->dims()[1];
     int in_h = d_input_t->dims()[2];
diff --git a/paddle/fluid/operators/bilinear_tensor_product_op.cc b/paddle/fluid/operators/bilinear_tensor_product_op.cc
index e910ad92d1051aa89fdb3290a977ff376378a227..8d261a118a75ee16027faf60341cefd30c3cdbba 100644
--- a/paddle/fluid/operators/bilinear_tensor_product_op.cc
+++ b/paddle/fluid/operators/bilinear_tensor_product_op.cc
@@ -65,8 +65,7 @@ class BilinearTensorProductOp : public framework::OperatorWithKernel {
 
 class BilinearTensorProductOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  BilinearTensorProductOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X", "The first input of bilinear_tensor_product operator.");
     AddInput("Y", "The second input of bilinear_tensor_product operator.");
     AddInput("Weight",
diff --git a/paddle/fluid/operators/cast_op.cc b/paddle/fluid/operators/cast_op.cc
index dd0068d571f72c9c22334e523cd091fe4c8da5a6..8d6a498dc941e44688ec8a2b49a6e080608f9b85 100644
--- a/paddle/fluid/operators/cast_op.cc
+++ b/paddle/fluid/operators/cast_op.cc
@@ -21,8 +21,7 @@ namespace operators {
 
 class CastOpProtoMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  CastOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X", "The input tensor of cast op");
     AddOutput("Out", "The output tensor of cast op");
     AddAttr<int>("out_dtype", "output data type");
@@ -90,4 +89,5 @@ REGISTER_OP_CPU_KERNEL(cast, ops::CastOpKernel<CPU, float>,
                        ops::CastOpKernel<CPU, int>,
                        ops::CastOpKernel<CPU, int64_t>,
                        ops::CastOpKernel<CPU, bool>,
+                       ops::CastOpKernel<CPU, uint8_t>,
                        ops::CastOpKernel<CPU, paddle::platform::float16>);
diff --git a/paddle/fluid/operators/cast_op.cu b/paddle/fluid/operators/cast_op.cu
index c486c5850e25fcf4370f02cb145c244743a4cc4b..657d162878c108760585ca9bd58e2fd34bf1fef3 100644
--- a/paddle/fluid/operators/cast_op.cu
+++ b/paddle/fluid/operators/cast_op.cu
@@ -21,5 +21,5 @@ using CastOpKernel =
 
 REGISTER_OP_CUDA_KERNEL(cast, CastOpKernel<float>, CastOpKernel<double>,
                         CastOpKernel<int>, CastOpKernel<int64_t>,
-                        CastOpKernel<bool>,
+                        CastOpKernel<bool>, CastOpKernel<uint8_t>,
                         CastOpKernel<paddle::platform::float16>);
diff --git a/paddle/fluid/operators/channel_close_op.cc b/paddle/fluid/operators/channel_close_op.cc
index 5892650c49e2e9d7345fb94465d124cff57f0a6f..8e2db250a069c488ee98f618bc03df6485022456 100644
--- a/paddle/fluid/operators/channel_close_op.cc
+++ b/paddle/fluid/operators/channel_close_op.cc
@@ -50,8 +50,7 @@ class ChannelCloseOpOpInferShape : public framework::InferShapeBase {
 
 class ChannelCloseOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  ChannelCloseOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput(kChannel,
              "The Channel Variable that should be closed by"
              " the ChannelClose Op.");
diff --git a/paddle/fluid/operators/channel_create_op.cc b/paddle/fluid/operators/channel_create_op.cc
index b2fdfd0e1f24ed071bb57b7de8f99b2d5e1d3196..a7f59e4088e3fb328e5b5a83eed65f0f90edb9f0 100644
--- a/paddle/fluid/operators/channel_create_op.cc
+++ b/paddle/fluid/operators/channel_create_op.cc
@@ -91,8 +91,7 @@ class ChannelCreateOpOpInferShape : public framework::InferShapeBase {
 
 class ChannelCreateOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  ChannelCreateOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddOutput(kOutput,
               "The object of a Channel type created by ChannelCreate Op.");
     AddAttr<int>("capacity", "The size of the buffer of Channel.")
diff --git a/paddle/fluid/operators/channel_recv_op.cc b/paddle/fluid/operators/channel_recv_op.cc
index 25c5c3c95ef6899589c98570df6ecbf9b3241d89..101015e837e28b504b71d919abd5f908a102c812 100644
--- a/paddle/fluid/operators/channel_recv_op.cc
+++ b/paddle/fluid/operators/channel_recv_op.cc
@@ -72,8 +72,7 @@ class ChannelRecvOp : public framework::OperatorBase {
 
 class ChannelRecvOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  ChannelRecvOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput(Channel,
              "(Channel) A variable which \"receives\" the a value sent"
              "to it by a channel_send op.")
diff --git a/paddle/fluid/operators/channel_send_op.cc b/paddle/fluid/operators/channel_send_op.cc
index 66d33617ede5bef8a95de14f5b447c0910fe3eb4..67d6deb511d883ac69426ddd34be2199367cd4c7 100644
--- a/paddle/fluid/operators/channel_send_op.cc
+++ b/paddle/fluid/operators/channel_send_op.cc
@@ -57,8 +57,7 @@ class ChannelSendOp : public framework::OperatorBase {
 
 class ChannelSendOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  ChannelSendOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput(Channel,
              "(Channel) A variable which \"sends\" the passed in value to "
              "a listening receiver.")
diff --git a/paddle/fluid/operators/chunk_eval_op.cc b/paddle/fluid/operators/chunk_eval_op.cc
index 95440ff89e883e754795c67cd58a08f1131df368..62636bb2f9078768180ab1e0016e3565617d24d2 100644
--- a/paddle/fluid/operators/chunk_eval_op.cc
+++ b/paddle/fluid/operators/chunk_eval_op.cc
@@ -66,8 +66,7 @@ class ChunkEvalOp : public framework::OperatorWithKernel {
 
 class ChunkEvalOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  ChunkEvalOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("Inference",
              "(Tensor, default: Tensor<int64_t>). "
              "Predictions from the network.");
diff --git a/paddle/fluid/operators/clip_by_norm_op.cc b/paddle/fluid/operators/clip_by_norm_op.cc
index f43726b4793f284f14226f90c94ac6eebf632bd5..c87bded034e382c981d119e8499d6780e288031f 100644
--- a/paddle/fluid/operators/clip_by_norm_op.cc
+++ b/paddle/fluid/operators/clip_by_norm_op.cc
@@ -37,8 +37,7 @@ class ClipByNormOp : public framework::OperatorWithKernel {
 
 class ClipByNormOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  ClipByNormOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X",
              "(Tensor) The input of clip_by_norm op."
              "The number of dimensions must be between [1, 9].");
diff --git a/paddle/fluid/operators/clip_op.cc b/paddle/fluid/operators/clip_op.cc
index c71139fc7c01a696299296e43d06cf195fb3d03f..a679f7e2536a0a44148193f423f5ffe11b5e35fc 100644
--- a/paddle/fluid/operators/clip_op.cc
+++ b/paddle/fluid/operators/clip_op.cc
@@ -38,8 +38,7 @@ class ClipOp : public framework::OperatorWithKernel {
 template <typename AttrType>
 class ClipOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  ClipOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X",
              "(Tensor)The input of clip op."
              "The number of dimensions must be between [1, 9].");
diff --git a/paddle/fluid/operators/compare_op.cc b/paddle/fluid/operators/compare_op.cc
index 3a6a357e81949014a70e5bae1ee0e1c8b9d0c2ce..3a4819f3dec9704a4a7c8910dd22e80fda082335 100644
--- a/paddle/fluid/operators/compare_op.cc
+++ b/paddle/fluid/operators/compare_op.cc
@@ -21,8 +21,7 @@ namespace operators {
 template <typename OpComment>
 class CompareOpProtoMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  CompareOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     OpComment comment;
     AddInput("X",
              string::Sprintf("(LoDTensor) the left hand operand of %s operator",
diff --git a/paddle/fluid/operators/concat_op.cc b/paddle/fluid/operators/concat_op.cc
index 3bb3bd4eb15881afb5ae42beb944b76b5e8207cb..38337f9aa52435c445420047957500d21069506a 100644
--- a/paddle/fluid/operators/concat_op.cc
+++ b/paddle/fluid/operators/concat_op.cc
@@ -63,8 +63,7 @@ class ConcatOp : public framework::OperatorWithKernel {
 
 class ConcatOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  ConcatOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X", "Input tensors of concat operator.").AsDuplicable();
     AddOutput("Out", "Output tensor of concat operator.");
     AddAttr<int>("axis",
diff --git a/paddle/fluid/operators/conditional_block_op.cc b/paddle/fluid/operators/conditional_block_op.cc
index 27f74a789beef02d31ebceb9b909e97ebd68232a..5984f80d04bdeb232f8e24264ae979725af24ef4 100644
--- a/paddle/fluid/operators/conditional_block_op.cc
+++ b/paddle/fluid/operators/conditional_block_op.cc
@@ -108,8 +108,7 @@ class ConditionalBlockOp : public ConditionalOp {
 
 class ConditionalBlockOpProtoMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  ConditionalBlockOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X",
              "The conditional variable of this operator. If X is empty, the "
              "whole sub-block will not be executed.")
diff --git a/paddle/fluid/operators/conv_cudnn_op.cu.cc b/paddle/fluid/operators/conv_cudnn_op.cu.cc
index 7a7b8b76e43b1f91a3ba2767c217993cc39f26b6..1828be57b5a54005a0066b18ebebdb740726f67a 100644
--- a/paddle/fluid/operators/conv_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/conv_cudnn_op.cu.cc
@@ -20,7 +20,7 @@ limitations under the License. */
 #include "paddle/fluid/platform/cudnn_helper.h"
 #include "paddle/fluid/platform/float16.h"
 
-DEFINE_bool(cudnn_algo_use_autotune, true,
+DEFINE_bool(cudnn_deterministic, true,
             "Whether allow using an autotuning algorithm for convolution "
             "operator. The autotuning algorithm may be non-deterministic. If "
             "false, the algorithm is deterministic.");
@@ -272,7 +272,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
     auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
     auto handle = dev_ctx.cudnn_handle();
     if (input_grad) {
-      if (FLAGS_cudnn_algo_use_autotune) {
+      if (FLAGS_cudnn_deterministic) {
         PADDLE_ENFORCE(
             platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
                 handle, cudnn_filter_desc,
@@ -297,7 +297,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
     }
 
     if (filter_grad) {
-      if (FLAGS_cudnn_algo_use_autotune) {
+      if (FLAGS_cudnn_deterministic) {
         PADDLE_ENFORCE(
             platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
                 handle, cudnn_input_desc, cudnn_output_grad_desc,
diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc
index 92748993c32ffb93ae25db8d9916798e657cc804..850297a2327f33a4a765f64f201e217fce5db89b 100644
--- a/paddle/fluid/operators/conv_op.cc
+++ b/paddle/fluid/operators/conv_op.cc
@@ -75,6 +75,11 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const {
 framework::OpKernelType ConvOp::GetExpectedKernelType(
     const framework::ExecutionContext& ctx) const {
   framework::LibraryType library{framework::LibraryType::kPlain};
+
+  std::string data_format = ctx.Attr<std::string>("data_format");
+  // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
+  framework::DataLayout layout = framework::StringToDataLayout(data_format);
+
 #ifdef PADDLE_WITH_CUDA
   if (platform::CanCUDNNBeUsed(ctx)) {
     library = framework::LibraryType::kCUDNN;
@@ -84,6 +89,7 @@ framework::OpKernelType ConvOp::GetExpectedKernelType(
   if (library == framework::LibraryType::kPlain &&
       platform::CanMKLDNNBeUsed(ctx)) {
     library = framework::LibraryType::kMKLDNN;
+    layout = framework::DataLayout::kMKLDNN;
   }
 #endif
 
@@ -99,15 +105,11 @@ framework::OpKernelType ConvOp::GetExpectedKernelType(
                       "float16 can only be used when CUDNN is used");
   }
 
-  std::string data_format = ctx.Attr<std::string>("data_format");
-  // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
-  framework::DataLayout layout = framework::StringToDataLayout(data_format);
   return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout,
                                  library);
 }
 
-Conv2DOpMaker::Conv2DOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-    : OpProtoAndCheckerMaker(proto, op_checker) {
+void Conv2DOpMaker::Make() {
   AddInput(
       "Input",
       "(Tensor) The input tensor of convolution operator. "
@@ -200,8 +202,7 @@ $$
 )DOC");
 }
 
-Conv3DOpMaker::Conv3DOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-    : OpProtoAndCheckerMaker(proto, op_checker) {
+void Conv3DOpMaker::Make() {
   AddInput(
       "Input",
       "(Tensor) The input tensor of convolution operator. "
@@ -311,6 +312,10 @@ void ConvOpGrad::InferShape(framework::InferShapeContext* ctx) const {
 framework::OpKernelType ConvOpGrad::GetExpectedKernelType(
     const framework::ExecutionContext& ctx) const {
   framework::LibraryType library_{framework::LibraryType::kPlain};
+  // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
+  std::string data_format = ctx.Attr<std::string>("data_format");
+  framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
+
 #ifdef PADDLE_WITH_CUDA
   if (platform::CanCUDNNBeUsed(ctx)) {
     library_ = framework::LibraryType::kCUDNN;
@@ -320,12 +325,10 @@ framework::OpKernelType ConvOpGrad::GetExpectedKernelType(
   if (library_ == framework::LibraryType::kPlain &&
       platform::CanMKLDNNBeUsed(ctx)) {
     library_ = framework::LibraryType::kMKLDNN;
+    layout_ = framework::DataLayout::kMKLDNN;
   }
 #endif
 
-  std::string data_format = ctx.Attr<std::string>("data_format");
-  // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
-  framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
   return framework::OpKernelType(
       framework::ToDataType(ctx.Input<Tensor>("Input")->type()), ctx.GetPlace(),
       layout_, library_);
diff --git a/paddle/fluid/operators/conv_op.h b/paddle/fluid/operators/conv_op.h
index c51898abb422663a6731a17e0717c62ebf0701f8..b3140116dfe6a17a400bb88219ff43b249ecb32a 100644
--- a/paddle/fluid/operators/conv_op.h
+++ b/paddle/fluid/operators/conv_op.h
@@ -60,12 +60,12 @@ inline bool IsExpand(const std::vector<int64_t>& filter_dim,
 // operator implementations can reuse the code.
 class Conv2DOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  Conv2DOpMaker(OpProto* proto, OpAttrChecker* op_checker);
+  void Make() override;
 };
 
 class Conv3DOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  Conv3DOpMaker(OpProto* proto, OpAttrChecker* op_checker);
+  void Make() override;
 };
 
 class ConvOp : public framework::OperatorWithKernel {
@@ -187,7 +187,8 @@ class GemmConvKernel : public framework::OpKernel<T> {
         // gemm
         Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
         Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
-        blas.MatMul(filter_slice, col_matrix, &out_slice);
+        blas.MatMul(filter_slice, false, col_matrix, false, T(1.0), &out_slice,
+                    T(0.0));
       }
     }
   }
@@ -304,7 +305,8 @@ class GemmConvGradKernel : public framework::OpKernel<T> {
             col_matrix.ShareDataWith(in_grad_slice);
             col_matrix.Resize(col_matrix_shape);
           }
-          blas.MatMul(filter_slice, true, out_grad_slice, false, &col_matrix);
+          blas.MatMul(filter_slice, true, out_grad_slice, false, T(1.0),
+                      &col_matrix, T(0.0));
 
           if (is_expand && data_dim == 2U) {
             col2im(dev_ctx, col, dilations, strides,
@@ -351,8 +353,8 @@ class GemmConvGradKernel : public framework::OpKernel<T> {
           // gemm
           Tensor filter_grad_slice =
               filter_grad_.Slice(g * out_step, (g + 1) * out_step);
-          blas.MatMul(out_grad_slice, false, col_matrix, true,
-                      &filter_grad_slice);
+          blas.MatMul(out_grad_slice, false, col_matrix, true, T(1.0),
+                      &filter_grad_slice, T(1.0));
         }
       }
     }
diff --git a/paddle/fluid/operators/conv_shift_op.cc b/paddle/fluid/operators/conv_shift_op.cc
index 82fdd308207adb159632dbb9decd67fd2d1c4646..f2549e814d6f3b5674fe2eec1139f1c3dc6fa0b4 100644
--- a/paddle/fluid/operators/conv_shift_op.cc
+++ b/paddle/fluid/operators/conv_shift_op.cc
@@ -75,8 +75,7 @@ class ConvShiftGradOp : public framework::OperatorWithKernel {
 
 class ConvShiftOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  ConvShiftOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X",
              "(Tensor, default Tensor<float>), a 2-D tensor with shape B x M, "
              "where B is the batch size and M is the data dimension.");
diff --git a/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc b/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc
index 901682edbb01c563be6ea407228336b14f942778..038ea8999072f562104c5386ed18b6b275816345 100644
--- a/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc
@@ -44,6 +44,7 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
     std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
     // cudnn v5 does not support dilations
     std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
+    int groups = ctx.Attr<int>("groups");
     int user_workspace_size = ctx.Attr<int>("workspace_size_MB");
 
     const T* input_data = input->data<T>();
@@ -64,13 +65,13 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
 
     // (N, M, H, W) or (N, M, D, H, W)
     cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
-        layout, framework::vectorize2int(input->dims()));
+        layout, framework::vectorize2int(input->dims()), groups);
     // (N, C, O_h, O_w) or (N, C, O_d, O_h, O_w)
     cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor<T>(
-        layout, framework::vectorize2int(output->dims()));
+        layout, framework::vectorize2int(output->dims()), groups);
     // (M, C, K_h, K_w) or (M, C, K_d, K_h, K_w)
     cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor<T>(
-        layout, framework::vectorize2int(filter->dims()));
+        layout, framework::vectorize2int(filter->dims()), groups);
     cudnnConvolutionDescriptor_t cudnn_conv_desc =
         conv_desc.descriptor<T>(paddings, strides, dilations);
 
@@ -104,11 +105,17 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
     cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);
 
     // ------------------- cudnn conv transpose forward ---------------------
+    int input_offset = input->numel() / input->dims()[0] / groups;
+    int output_offset = output->numel() / output->dims()[0] / groups;
+    int filter_offset = filter->numel() / groups;
     T alpha = 1.0f, beta = 0.0f;
-    PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardData(
-        handle, &alpha, cudnn_filter_desc, filter_data, cudnn_input_desc,
-        input_data, cudnn_conv_desc, algo, cudnn_workspace,
-        workspace_size_in_bytes, &beta, cudnn_output_desc, output_data));
+    for (int g = 0; g < groups; g++) {
+      PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardData(
+          handle, &alpha, cudnn_filter_desc, filter_data + filter_offset * g,
+          cudnn_input_desc, input_data + input_offset * g, cudnn_conv_desc,
+          algo, cudnn_workspace, workspace_size_in_bytes, &beta,
+          cudnn_output_desc, output_data + output_offset * g));
+    }
 
     // Release the cudnn workspace
     paddle::memory::Free(gpu, cudnn_workspace);
@@ -134,6 +141,7 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
     std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
     // cudnn v5 does not support dilations
     std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
+    int groups = ctx.Attr<int>("groups");
     int user_workspace_size = ctx.Attr<int>("workspace_size_MB");
 
     // ------------------- cudnn descriptors ---------------------
@@ -145,13 +153,13 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
 
     // Input: (N, M, H, W) or (N, M, D, H, W)
     cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
-        layout, framework::vectorize2int(input->dims()));
+        layout, framework::vectorize2int(input->dims()), groups);
     // Output: (N, C, O_h, O_w) or (N, C, O_d, O_h, O_w)
     cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor<T>(
-        layout, framework::vectorize2int(output_grad->dims()));
+        layout, framework::vectorize2int(output_grad->dims()), groups);
     // Filter (M, C, K_h, K_w) or (M, C, K_d K_h, K_w)
     cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor<T>(
-        layout, framework::vectorize2int(filter->dims()));
+        layout, framework::vectorize2int(filter->dims()), groups);
 
     cudnnConvolutionDescriptor_t cudnn_conv_desc =
         conv_desc.descriptor<T>(paddings, strides, dilations);
@@ -205,15 +213,22 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
     cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);
     // ------------------- cudnn conv backward data ---------------------
     // FIXME(typhoonzero): template type T may not be the same as cudnn call.
+    int input_offset = input->numel() / input->dims()[0] / groups;
+    int output_grad_offset =
+        output_grad->numel() / output_grad->dims()[0] / groups;
+    int filter_offset = filter->numel() / groups;
     T alpha = 1.0f, beta = 0.0f;
     if (input_grad) {
       T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
       // Because beta is zero, it is unnecessary to reset input_grad.
-      PADDLE_ENFORCE(platform::dynload::cudnnConvolutionForward(
-          handle, &alpha, cudnn_output_desc, output_grad_data,
-          cudnn_filter_desc, filter_data, cudnn_conv_desc, data_algo,
-          cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_input_desc,
-          input_grad_data));
+      for (int g = 0; g < groups; g++) {
+        PADDLE_ENFORCE(platform::dynload::cudnnConvolutionForward(
+            handle, &alpha, cudnn_output_desc,
+            output_grad_data + output_grad_offset * g, cudnn_filter_desc,
+            filter_data + filter_offset * g, cudnn_conv_desc, data_algo,
+            cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_input_desc,
+            input_grad_data + input_offset * g));
+      }
     }
 
     // ------------------- cudnn conv backward filter ---------------------
@@ -221,11 +236,16 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
       T* filter_grad_data = filter_grad->mutable_data<T>(ctx.GetPlace());
       // Because beta is zero, it is unnecessary to reset filter_grad.
       // Gradient with respect to the filter
-      PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
-          handle, &alpha, cudnn_output_desc, output_grad_data, cudnn_input_desc,
-          input_data, cudnn_conv_desc, filter_algo, cudnn_workspace,
-          workspace_size_in_bytes, &beta, cudnn_filter_desc, filter_grad_data));
+      for (int g = 0; g < groups; g++) {
+        PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
+            handle, &alpha, cudnn_output_desc,
+            output_grad_data + output_grad_offset * g, cudnn_input_desc,
+            input_data + input_offset * g, cudnn_conv_desc, filter_algo,
+            cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_filter_desc,
+            filter_grad_data + filter_offset * g));
+      }
     }
+
     // Release the cudnn workspace
     paddle::memory::Free(gpu, cudnn_workspace);
   }
diff --git a/paddle/fluid/operators/conv_transpose_op.cc b/paddle/fluid/operators/conv_transpose_op.cc
index d699dcafa4e2c7e0a3ffb62ec3985e4961fa2133..0b363f5c43f9fc191790e5cca629ffc46eb9388c 100644
--- a/paddle/fluid/operators/conv_transpose_op.cc
+++ b/paddle/fluid/operators/conv_transpose_op.cc
@@ -32,6 +32,7 @@ void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const {
   std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
   std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
   std::vector<int> dilations = ctx->Attrs().Get<std::vector<int>>("dilations");
+  int groups = ctx->Attrs().Get<int>("groups");
 
   PADDLE_ENFORCE(in_dims.size() == 4 || in_dims.size() == 5,
                  "ConvTransposeOp intput should be 4-D or 5-D tensor.");
@@ -48,10 +49,10 @@ void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const {
                     "ConvTransposeOp paddings dimension and dilations "
                     "dimension should be the same.");
   PADDLE_ENFORCE_EQ(in_dims[1], filter_dims[0],
-                    "In ConvTransposeOp, The input channel should be the same "
-                    "as the number of filters.");
+                    "In ConvTransposeOp, The number of input channels should "
+                    "be equal to the number of filter's channels.");
 
-  std::vector<int64_t> output_shape({in_dims[0], filter_dims[1]});
+  std::vector<int64_t> output_shape({in_dims[0], filter_dims[1] * groups});
   for (size_t i = 0; i < strides.size(); ++i) {
     auto filter_extent = dilations[i] * (filter_dims[i + 2] - 1) + 1;
     output_shape.push_back((in_dims[i + 2] - 1) * strides[i] - 2 * paddings[i] +
@@ -84,9 +85,7 @@ framework::OpKernelType ConvTransposeOp::GetExpectedKernelType(
       layout_, library_);
 }
 
-Conv2DTransposeOpMaker::Conv2DTransposeOpMaker(OpProto* proto,
-                                               OpAttrChecker* op_checker)
-    : OpProtoAndCheckerMaker(proto, op_checker) {
+void Conv2DTransposeOpMaker::Make() {
   AddInput(
       "Input",
       "(Tensor) The input tensor of convolution transpose operator. "
@@ -104,7 +103,10 @@ Conv2DTransposeOpMaker::Conv2DTransposeOpMaker(OpProto* proto,
   AddOutput("Output",
             "(Tensor) The output tensor of convolution transpose operator. "
             "The format of output tensor is also NCHW.");
-
+  AddAttr<int>("groups",
+               "(int default:1), the groups number of the convolution "
+               "transpose operator. ")
+      .SetDefault(1);
   AddAttr<std::vector<int>>("dilations",
                             "(vector<int> default:{1, 1}), the "
                             "dilations(h_dilation, w_dilation) of convolution "
@@ -168,9 +170,7 @@ Example:
 )DOC");
 }
 
-Conv3DTransposeOpMaker::Conv3DTransposeOpMaker(OpProto* proto,
-                                               OpAttrChecker* op_checker)
-    : OpProtoAndCheckerMaker(proto, op_checker) {
+void Conv3DTransposeOpMaker::Make() {
   AddInput("Input",
            "(Tensor) The input tensor of convolution transpose operator."
            "The format of input tensor is NCDHW. Where N is batch size, C is "
@@ -208,6 +208,10 @@ Conv3DTransposeOpMaker::Conv3DTransposeOpMaker(OpProto* proto,
                             "(vector<int> default:{0, 0, 0}), paddings(d_pad, "
                             "h_pad, w_pad) of convolution transpose operator.")
       .SetDefault({0, 0, 0});
+  AddAttr<int>("groups",
+               "(int default:1), the groups number of the convolution3d "
+               "transpose operator. ")
+      .SetDefault(1);
   AddAttr<bool>(
       "use_cudnn",
       "(bool, default false) Only used in cudnn kernel, need install cudnn")
diff --git a/paddle/fluid/operators/conv_transpose_op.h b/paddle/fluid/operators/conv_transpose_op.h
index 9276e5bfef71a58741c2dfa25b31c2bd07c309b8..1dcfc651fdd79aed50736d05d38ec8576b183d41 100644
--- a/paddle/fluid/operators/conv_transpose_op.h
+++ b/paddle/fluid/operators/conv_transpose_op.h
@@ -30,12 +30,12 @@ using DDim = framework::DDim;
 // operator implementations can reuse the code.
 class Conv2DTransposeOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  Conv2DTransposeOpMaker(OpProto* proto, OpAttrChecker* op_checker);
+  void Make() override;
 };
 
 class Conv3DTransposeOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  Conv3DTransposeOpMaker(OpProto* proto, OpAttrChecker* op_checker);
+  void Make() override;
 };
 
 class ConvTransposeOp : public framework::OperatorWithKernel {
@@ -70,7 +70,7 @@ class GemmConvTransposeKernel : public framework::OpKernel<T> {
     std::vector<int> strides = context.Attr<std::vector<int>>("strides");
     std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
     std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
-    // groups will alway be disabled in conv2dtranspose.
+    int groups = context.Attr<int>("groups");
 
     const int batch_size = static_cast<int>(input->dims()[0]);
 
@@ -81,10 +81,10 @@ class GemmConvTransposeKernel : public framework::OpKernel<T> {
 
     // use col_shape in the im2col and col2im (or vol2col and col2vol)
     // calculation
-    // col_shape_vec: {c, k_h, k_w, h, w} or {c, k_d, k_h, k_w, d, h, w}
+    // col_shape_vec: {c/g, k_h, k_w, h, w} or {c/g, k_d, k_h, k_w, d, h, w}
     size_t data_dim = filter_shape_vec.size() - 2;
     std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
-    col_shape_vec[0] = output->dims()[1];
+    col_shape_vec[0] = output->dims()[1] / groups;
     for (size_t j = 0; j < data_dim; ++j) {
       col_shape_vec[j + 1] = filter_shape_vec[j + 2];
       col_shape_vec[j + 1 + data_dim] = input_shape_vec[j + 2];
@@ -92,7 +92,7 @@ class GemmConvTransposeKernel : public framework::OpKernel<T> {
     DDim col_shape(framework::make_ddim(col_shape_vec));
 
     // use col_matrix_shape in the gemm calculation
-    // size: (c * k_h * k_w, h * w) or (c * k_d * k_h * k_w, d * h * w)
+    // size: (c/g * k_h * k_w, h * w) or (c/g * k_d * k_h * k_w, d * h * w)
     DDim col_matrix_shape = framework::flatten_to_2d(col_shape, data_dim + 1);
 
     Tensor col;
@@ -111,7 +111,7 @@ class GemmConvTransposeKernel : public framework::OpKernel<T> {
     // input matrix size: (m, h * w) or (m, d * h * w)
     DDim input_matrix_shape = {input->dims()[1], col_matrix_shape[1]};
 
-    // filter size: (m, c * k_h * k_w) or (m, c * k_d * k_h * k_w)
+    // filter size: (m, c/g * k_h * k_w) or (m, c/g * k_d * k_h * k_w)
     DDim filter_matrix_shape = {input->dims()[1], col_matrix_shape[0]};
     filter.Resize(filter_matrix_shape);
 
@@ -121,6 +121,8 @@ class GemmConvTransposeKernel : public framework::OpKernel<T> {
     auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
     set_zero(dev_ctx, output, static_cast<T>(0));
 
+    int in_step = static_cast<int>(input->dims()[1]) / groups;
+    int out_step = static_cast<int>(output->dims()[1]) / groups;
     math::Col2ImFunctor<math::ColFormat::kCFO, DeviceContext, T> col2im;
     math::Col2VolFunctor<DeviceContext, T> col2vol;
 
@@ -133,21 +135,29 @@ class GemmConvTransposeKernel : public framework::OpKernel<T> {
       // output size: (c, o_h, o_w) or (c, o_d, o_h, o_w)
       Tensor output_batch = output->Slice(i, i + 1).Resize(output_shape);
 
-      // col_matrix = filter * input_batch
-      // of shape (c * k_h * k_w, h * w) or (c * k_d * k_h * k_w, d * h * w)
-      blas.MatMul(filter, true, input_batch, false, &col_matrix);
-
-      if (data_dim == 2U) {
-        // col2im: col_matrix -> dy
-        // from (c * k_h * k_w, h * w) to (c, o_h, o_w)
-        col2im(dev_ctx, col, dilations, strides,
-               std::vector<int>{paddings[0], paddings[1], paddings[0],
-                                paddings[1]},
-               &output_batch);
-      } else if (data_dim == 3U) {
-        // col2vol: col_matrix -> dy
-        // from (c * k_d * k_h * k_w, d * h * w) to (c, o_d, o_h, o_w)
-        col2vol(dev_ctx, col, dilations, strides, paddings, &output_batch);
+      for (int g = 0; g < groups; g++) {
+        Tensor in_slice = input_batch.Slice(g * in_step, (g + 1) * in_step);
+        Tensor filter_slice = filter.Slice(g * in_step, (g + 1) * in_step);
+        Tensor out_slice = output_batch.Slice(g * out_step, (g + 1) * out_step);
+
+        // col_matrix = filter_slice * input_slice
+        // of shape (c/g * k_h * k_w, h * w)
+        // or (c/g * k_d * k_h * k_w, d * h * w)
+        blas.MatMul(filter_slice, true, in_slice, false, static_cast<T>(1.0),
+                    &col_matrix, static_cast<T>(0.0));
+
+        if (data_dim == 2U) {
+          // col2im: col_matrix -> dy
+          // from (c/g * k_h * k_w, h * w) to (c/g, o_h, o_w)
+          col2im(dev_ctx, col, dilations, strides,
+                 std::vector<int>{paddings[0], paddings[1], paddings[0],
+                                  paddings[1]},
+                 &out_slice);
+        } else if (data_dim == 3U) {
+          // col2vol: col_matrix -> dy
+          // from (c/g * k_d * k_h * k_w, d * h * w) to (c/g, o_d, o_h, o_w)
+          col2vol(dev_ctx, col, dilations, strides, paddings, &out_slice);
+        }
       }
     }
   }
@@ -173,6 +183,7 @@ class GemmConvTransposeGradKernel : public framework::OpKernel<T> {
     std::vector<int> strides = context.Attr<std::vector<int>>("strides");
     std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
     std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
+    int groups = context.Attr<int>("groups");
 
     const int batch_size = static_cast<int>(input->dims()[0]);
 
@@ -204,9 +215,11 @@ class GemmConvTransposeGradKernel : public framework::OpKernel<T> {
     // input matrix size: (m, h * w) or (m, d * h * w)
     DDim input_matrix_shape = {input->dims()[1], col_matrix_shape[1]};
 
-    // filter size: (m, c * k_h * k_w) or (m, c * k_d * k_h * k_w)
-    DDim filter_matrix_shape = {input->dims()[1], col_matrix_shape[0]};
+    // filter size: (m, c/g * k_h * k_w) or (m, c/g * k_d * k_h * k_w)
+    DDim filter_matrix_shape = {input->dims()[1], col_matrix_shape[0] / groups};
     filter.Resize(filter_matrix_shape);
+    int in_step = static_cast<int>(input->dims()[1]) / groups;
+    int col_step = static_cast<int>(col_matrix_shape[0]) / groups;
 
     // convolution transpose grad on input:
     // im2col + gemm (similar to conv-forward)
@@ -232,7 +245,7 @@ class GemmConvTransposeGradKernel : public framework::OpKernel<T> {
       if (input_grad) {
         input_grad->mutable_data<T>(context.GetPlace());
       }
-      if (filter_grad) {  // filter size (m, c, k_h, k_w)
+      if (filter_grad) {  // filter size (m, c/g, k_h, k_w)
         filter_grad->mutable_data<T>(context.GetPlace());
         set_zero(dev_ctx, filter_grad, static_cast<T>(0));
         filter_grad_ = *filter_grad;
@@ -267,7 +280,17 @@ class GemmConvTransposeGradKernel : public framework::OpKernel<T> {
           // or
           // (m, c * k_d * k_h * k_w) * (c * k_d * k_h * k_w, d * h * w) -> (m,
           // d, h, w)
-          blas.MatMul(filter, false, col_matrix, false, &input_grad_batch);
+          for (int g = 0; g < groups; g++) {
+            Tensor input_grad_slice =
+                input_grad_batch.Slice(g * in_step, (g + 1) * in_step);
+            Tensor filter_slice = filter.Slice(g * in_step, (g + 1) * in_step);
+            Tensor col_matrix_slice =
+                col_matrix.Slice(g * col_step, (g + 1) * col_step);
+
+            blas.MatMul(filter_slice, false, col_matrix_slice, false,
+                        static_cast<T>(1.0), &input_grad_slice,
+                        static_cast<T>(0.0));
+          }
         }
         if (filter_grad) {
           // input batch
@@ -277,7 +300,17 @@ class GemmConvTransposeGradKernel : public framework::OpKernel<T> {
           // or
           // (m, d * h * w) * (d * h * w, c * k_d * k_h * k_w) -> (m, c * k_d *
           // k_h * k_w)
-          blas.MatMul(in_batch, false, col_matrix, true, &filter_grad_);
+          for (int g = 0; g < groups; g++) {
+            Tensor in_batch_slice =
+                in_batch.Slice(g * in_step, (g + 1) * in_step);
+            Tensor filter_grad_slice =
+                filter_grad_.Slice(g * in_step, (g + 1) * in_step);
+            Tensor col_matrix_slice =
+                col_matrix.Slice(g * col_step, (g + 1) * col_step);
+            blas.MatMul(in_batch_slice, false, col_matrix_slice, true,
+                        static_cast<T>(1.0), &filter_grad_slice,
+                        static_cast<T>(1.0));
+          }
         }
       }
     }
diff --git a/paddle/fluid/operators/cos_sim_op.cc b/paddle/fluid/operators/cos_sim_op.cc
index 04ca878e687f9b8e5239d8c4aad7e5f262fda0fa..046dd11910bb0ff46b567c3b89883582782205d3 100644
--- a/paddle/fluid/operators/cos_sim_op.cc
+++ b/paddle/fluid/operators/cos_sim_op.cc
@@ -62,8 +62,7 @@ class CosSimOp : public framework::OperatorWithKernel {
 
 class CosSimOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  CosSimOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X", "The 1st input of cos_sim op.");
     AddInput("Y", "The 2nd input of cos_sim op.");
     AddOutput("Out", "The output of cos_sim op.");
diff --git a/paddle/fluid/operators/crf_decoding_op.cc b/paddle/fluid/operators/crf_decoding_op.cc
index a83013c428a77a0ead545d87852e1017bc927edf..40f43936db662f2b18ffa540da4794755b5d6fc7 100644
--- a/paddle/fluid/operators/crf_decoding_op.cc
+++ b/paddle/fluid/operators/crf_decoding_op.cc
@@ -18,8 +18,7 @@ namespace paddle {
 namespace operators {
 class CRFDecodingOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  CRFDecodingOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("Emission",
              "(LoDTensor, default: LoDTensor<float>). A LoDTensor with shape "
              "[N x D] where N is the size of the mini-batch and D is the total "
diff --git a/paddle/fluid/operators/crop_op.cc b/paddle/fluid/operators/crop_op.cc
index a8f1fbd529c71d1915c75fa90b7e4e8239d2fa3f..5b5a220cf90e7813f914ae35733e7a4103391b2d 100644
--- a/paddle/fluid/operators/crop_op.cc
+++ b/paddle/fluid/operators/crop_op.cc
@@ -48,12 +48,18 @@ class CropOp : public framework::OperatorWithKernel {
       ctx->SetOutputDim("Out", y_dim);
     }
   }
+
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
+        ctx.device_context());
+  }
 };
 
 class CropOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  CropOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X",
              "The input of pad op. "
              "The input should be a k-D tensor(k > 0 and k < 7).");
@@ -61,13 +67,19 @@ class CropOpMaker : public framework::OpProtoAndCheckerMaker {
              "The input used as reference for cropping, "
              "which is of the same dimensions as X.")
         .AsDispensable();
+    AddInput("Offsets",
+             "The input used to describe offsets in runtime, which is a "
+             "1-D vector whose size equals to the rank of input 'X'. The "
+             "elements data type must be int.")
+        .AsDispensable();
     AddOutput("Out",
               "The output of crop op, "
               "which is of the same dimensions as X.");
     AddAttr<std::vector<int>>("offsets",
                               "A list<int> describing offsets to be cropped. "
                               "The size of offsets list should be the same as "
-                              "the dimension size of input X.");
+                              "the dimension size of input X.")
+        .SetDefault(std::vector<int>());
     AddAttr<std::vector<int>>("shape",
                               "A list<int> describing the shape of output. "
                               "The size of shape list should be the same as "
@@ -78,6 +90,17 @@ Crop Operator.
 
 Crop input into output, as specified by offsets and shape.
 
+There are two ways to set the offsets:
+1. In runtime: Using the input 'Offsets', which is a Vairbale and can be 
+               output of other operators. This way is suitable for 
+               dynamic offsets.
+2. In network configuration: Using the attribute 'offsets', which will be 
+                             set in Python configure script. This way is 
+                             suitable for fixed offsets.
+You CANNOT use these two ways at the same time. An exception will be raised 
+if input 'Offset' is configured and meanwhile the attribute 'offsets' is 
+not empty.
+
 There are two ways to set shape:
 1. reference input: crop input X into the same shape as reference input.
                     The dimension of reference input should
@@ -147,6 +170,15 @@ class CropOpGrad : public framework::OperatorWithKernel {
       ctx->SetOutputDim(x_grad_name, x_dims);
     }
   }
+
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(
+            ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"))
+                ->type()),
+        ctx.device_context());
+  }
 };
 
 }  // namespace operators
diff --git a/paddle/fluid/operators/crop_op.h b/paddle/fluid/operators/crop_op.h
index f05c2e23284e3a24cf48442996f671ec6084c391..91cfbbda7352c9b1676aae99e2bd57ccc9e10069 100644
--- a/paddle/fluid/operators/crop_op.h
+++ b/paddle/fluid/operators/crop_op.h
@@ -27,6 +27,37 @@ template <typename T, size_t D, int MajorType = Eigen::RowMajor,
 using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
 using framework::Tensor;
 
+static std::vector<int> GetOffsets(const framework::ExecutionContext& ctx) {
+  std::vector<int> res;
+  int rank = ctx.Input<Tensor>("X")->dims().size();
+  if (ctx.HasInput("Offsets")) {
+    PADDLE_ENFORCE(ctx.Attr<std::vector<int>>("offsets").empty(),
+                   "Input 'Offsets' and attribute 'offsets' should not be used "
+                   "at the same time.");
+    const auto* offsets_tensor = ctx.Input<Tensor>("Offsets");
+    PADDLE_ENFORCE_EQ(offsets_tensor->dims().size(), 1);
+    PADDLE_ENFORCE_EQ(
+        rank, offsets_tensor->dims()[0],
+        "Offsets size should be equal to dimension size of input tensor.");
+    const int* offsets_data;
+    framework::Tensor cpu_tmp_tensor;
+    if (platform::is_cpu_place(offsets_tensor->place())) {
+      offsets_data = offsets_tensor->data<int>();
+    } else {
+      framework::TensorCopySync(*offsets_tensor, platform::CPUPlace(),
+                                &cpu_tmp_tensor);
+      offsets_data = cpu_tmp_tensor.data<int>();
+    }
+    res = std::vector<int>(offsets_data, offsets_data + rank);
+  } else {
+    res = ctx.Attr<std::vector<int>>("offsets");
+    PADDLE_ENFORCE_EQ(
+        rank, res.size(),
+        "Offsets size should be equal to dimension size of input tensor.");
+  }
+  return res;
+}
+
 template <typename T>
 class CropKernel : public framework::OpKernel<T> {
  public:
@@ -37,10 +68,7 @@ class CropKernel : public framework::OpKernel<T> {
     T* out_data = out->mutable_data<T>(context.GetPlace());
     auto x_stride = framework::stride(x->dims());
     auto out_stride = framework::stride(out->dims());
-    auto offsets = context.Attr<std::vector<int>>("offsets");
-    PADDLE_ENFORCE_EQ(
-        x->dims().size(), static_cast<int64_t>(offsets.size()),
-        "Offsets size should be equal to dimension size of input tensor.");
+    auto offsets = GetOffsets(context);
     int64_t offset = 0;
     for (size_t i = 0; i < offsets.size(); ++i) {
       offset += (x_stride[i] * offsets[i]);
@@ -56,7 +84,7 @@ void CropGradFunction(const framework::ExecutionContext& context) {
   if (d_x != nullptr) {
     auto* d_out = context.Input<Tensor>(framework::GradVarName("Out"));
     d_x->mutable_data<T>(context.GetPlace());
-    auto offsets = context.Attr<std::vector<int>>("offsets");
+    auto offsets = GetOffsets(context);
     Eigen::array<std::pair<int, int>, D> paddings;
     for (size_t i = 0; i < D; ++i) {
       paddings[i].first = offsets[i];
diff --git a/paddle/fluid/operators/cross_entropy_op.cc b/paddle/fluid/operators/cross_entropy_op.cc
index 2b2a9dc8319f964875371214168ce04cb67fc818..a3bec3da45136bca5cb2763e7ffd6b67703a1813 100644
--- a/paddle/fluid/operators/cross_entropy_op.cc
+++ b/paddle/fluid/operators/cross_entropy_op.cc
@@ -111,8 +111,7 @@ class CrossEntropyGradientOp : public framework::OperatorWithKernel {
 
 class CrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  CrossEntropyOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X",
              "(Tensor, default Tensor<float>), a 2-D tensor with shape [N x D],"
              " where N is the batch size and D is the number of classes. "
diff --git a/paddle/fluid/operators/ctc_align_op.cc b/paddle/fluid/operators/ctc_align_op.cc
index 19e7649660edd0bc90bc6a9537b1cdbb2e7e8ebc..d2b440d9d2e50340af7a7bb4e76e55beea1bcb46 100644
--- a/paddle/fluid/operators/ctc_align_op.cc
+++ b/paddle/fluid/operators/ctc_align_op.cc
@@ -44,8 +44,7 @@ class CTCAlignOp : public framework::OperatorWithKernel {
 
 class CTCAlignOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  CTCAlignOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("Input",
              "(LodTensor, default: LoDTensor<int>), Its shape is "
              "[Lp, 1], where Lp is the sum of all input sequences' length.");
diff --git a/paddle/fluid/operators/cumsum_op.cc b/paddle/fluid/operators/cumsum_op.cc
index f7c516a0ba375a68e3adeb44c99f2808dc0418bb..92bb835e8f18e17ae1355fdec29f43b8ffb70460 100644
--- a/paddle/fluid/operators/cumsum_op.cc
+++ b/paddle/fluid/operators/cumsum_op.cc
@@ -29,8 +29,7 @@ class CumOp : public framework::OperatorWithKernel {
 
 class CumsumOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  CumsumOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X", "Input of Cumsum operator");
     AddOutput("Out", "Output of Cumsum operator");
     AddAttr<int>("axis",
diff --git a/paddle/fluid/operators/decayed_adagrad_op.cc b/paddle/fluid/operators/decayed_adagrad_op.cc
index 5a1315fb2a80bf7f7f57388d0d6832686442c4ff..c0f2b49a04d9e88502c4b63bca493cd2b7ad1c5c 100644
--- a/paddle/fluid/operators/decayed_adagrad_op.cc
+++ b/paddle/fluid/operators/decayed_adagrad_op.cc
@@ -62,8 +62,7 @@ class DecayedAdagradOp : public framework::OperatorWithKernel {
 
 class DecayedAdagradOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  DecayedAdagradOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("Param", "(Tensor) Input parameter");
     AddInput("Grad", "(Tensor) Input gradient");
     AddInput("Moment", "(Tensor) Second moment");
diff --git a/paddle/fluid/operators/delete_var_op.cc b/paddle/fluid/operators/delete_var_op.cc
index 1fe9404c00335edbe3594486f8c403e69f2ab08f..d7a9bfbc437dbf4c723b9c87ff62ec6b62c38638 100644
--- a/paddle/fluid/operators/delete_var_op.cc
+++ b/paddle/fluid/operators/delete_var_op.cc
@@ -34,8 +34,7 @@ class DeleteVarOp : public framework::OperatorBase {
 
 class DeleteVarOpInfoMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  DeleteVarOpInfoMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X", "The input of delete op").AsDuplicable();
     AddComment(R"DOC(
 Delete Operator.
diff --git a/paddle/fluid/operators/detail/CMakeLists.txt b/paddle/fluid/operators/detail/CMakeLists.txt
index 719a7465b8d58ef8588ff1e83c2b971eb6fbb00f..c29dc5d7e077a73b7db6a0c9204c2029ec1392bc 100644
--- a/paddle/fluid/operators/detail/CMakeLists.txt
+++ b/paddle/fluid/operators/detail/CMakeLists.txt
@@ -1,9 +1,12 @@
 if(WITH_DISTRIBUTE)
   grpc_library(sendrecvop_grpc SRCS bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc
-      grpc_server.cc variable_response.cc PROTO send_recv.proto DEPS lod_tensor selected_rows)
+      request_handler_impl.cc rpc_client.cc rpc_server.cc grpc_server.cc variable_response.cc PROTO send_recv.proto DEPS lod_tensor
+      selected_rows memory)
   set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
   set_source_files_properties(serde_test.cc grpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
   cc_test(serde_test SRCS serde_test.cc variable_response.cc DEPS grpc++_unsecure grpc_unsecure gpr
-      cares zlib protobuf sendrecvop_grpc)
-  cc_test(grpc_server_test SRCS grpc_server_test.cc DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor proto_desc lookup_table_op)
+          cares zlib protobuf sendrecvop_grpc SERIAL)
+  cc_test(grpc_server_test SRCS grpc_server_test.cc DEPS sendrecvop_grpc
+          grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor
+          proto_desc lookup_table_op SERIAL)
 endif()
diff --git a/paddle/fluid/operators/detail/grpc_client.cc b/paddle/fluid/operators/detail/grpc_client.cc
index 661dfa69fe1580ff3890f12defcd124225be0c06..fae39418b4166c9110a40b550e0b9801075edc7e 100644
--- a/paddle/fluid/operators/detail/grpc_client.cc
+++ b/paddle/fluid/operators/detail/grpc_client.cc
@@ -19,16 +19,36 @@ limitations under the License. */
 #include <limits>
 
 #include "paddle/fluid/framework/threadpool.h"
+#include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
 namespace operators {
 namespace detail {
 
-bool RPCClient::AsyncSendVariable(const std::string& ep,
-                                  const platform::DeviceContext& ctx,
-                                  const framework::Scope& scope,
-                                  const std::string& var_name,
-                                  int64_t time_out) {
+void GRPCClient::InitImpl() { InitEventLoop(); }
+
+void GRPCClient::InitEventLoop() {
+  // start the client process thread
+  // TODO(wuyi): can make this in a threadpool
+  client_thread_.reset(new std::thread(std::bind(&GRPCClient::Proceed, this)));
+}
+
+GRPCClient::~GRPCClient() {
+  Wait();
+  cq_.Shutdown();
+  {
+    std::lock_guard<std::mutex> guard(chan_mutex_);
+    for (auto& it : channels_) {
+      it.second.reset();
+    }
+  }
+  client_thread_->join();
+}
+
+bool GRPCClient::AsyncSendVar(const std::string& ep,
+                              const platform::DeviceContext& ctx,
+                              const framework::Scope& scope,
+                              const std::string& var_name, int64_t time_out) {
   const platform::DeviceContext* p_ctx = &ctx;
   const std::string ep_val = ep;
   const std::string var_name_val = var_name;
@@ -52,14 +72,13 @@ bool RPCClient::AsyncSendVariable(const std::string& ep,
     // stub context
     SendProcessor* s = new SendProcessor(ch);
     s->Prepare(var_h, time_out);
-    s->response_call_back_ = NULL;
+    s->response_call_back_ = nullptr;
 
     auto call = s->stub_g_.PrepareUnaryCall(
         s->context_.get(), "/sendrecv.SendRecvService/SendVariable", req, &cq_);
     call->StartCall();
     call->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
   });
-
   req_count_++;
 
   return true;
@@ -79,11 +98,10 @@ void RequestToByteBuffer(const T& proto, ::grpc::ByteBuffer* result) {
   result->Swap(&tmp);
 }
 
-bool RPCClient::AsyncGetVariable(const std::string& ep,
-                                 const platform::DeviceContext& ctx,
-                                 const framework::Scope& scope,
-                                 const std::string& var_name,
-                                 int64_t time_out) {
+bool GRPCClient::AsyncGetVar(const std::string& ep,
+                             const platform::DeviceContext& ctx,
+                             const framework::Scope& scope,
+                             const std::string& var_name, int64_t time_out) {
   const platform::DeviceContext* p_ctx = &ctx;
   const std::string ep_val = ep;
   const std::string var_name_val = var_name;
@@ -121,12 +139,12 @@ bool RPCClient::AsyncGetVariable(const std::string& ep,
   return true;
 }
 
-bool RPCClient::AsyncPrefetchVariable(const std::string& ep,
-                                      const platform::DeviceContext& ctx,
-                                      const framework::Scope& scope,
-                                      const std::string& in_var_name,
-                                      const std::string& out_var_name,
-                                      int64_t time_out) {
+bool GRPCClient::AsyncPrefetchVar(const std::string& ep,
+                                  const platform::DeviceContext& ctx,
+                                  const framework::Scope& scope,
+                                  const std::string& in_var_name,
+                                  const std::string& out_var_name,
+                                  int64_t time_out) {
   const platform::DeviceContext* p_ctx = &ctx;
   const std::string ep_val = ep;
   const std::string in_var_name_val = in_var_name;
@@ -164,7 +182,8 @@ bool RPCClient::AsyncPrefetchVariable(const std::string& ep,
   return true;
 }
 
-void RPCClient::AsyncSendBatchBarrier(const std::string& ep, int64_t time_out) {
+void GRPCClient::AsyncSendBatchBarrier(const std::string& ep,
+                                       int64_t time_out) {
   const auto ch = GetChannel(ep);
 
   BatchBarrierProcessor* s = new BatchBarrierProcessor(ch);
@@ -177,7 +196,8 @@ void RPCClient::AsyncSendBatchBarrier(const std::string& ep, int64_t time_out) {
   req_count_++;
 }
 
-void RPCClient::AsyncSendFetchBarrier(const std::string& ep, int64_t time_out) {
+void GRPCClient::AsyncSendFetchBarrier(const std::string& ep,
+                                       int64_t time_out) {
   const auto ch = GetChannel(ep);
   FetchBarrierProcessor* s = new FetchBarrierProcessor(ch);
   s->Prepare(time_out);
@@ -189,62 +209,37 @@ void RPCClient::AsyncSendFetchBarrier(const std::string& ep, int64_t time_out) {
   req_count_++;
 }
 
-bool RPCClient::Wait() {
-  if (req_count_ <= 0) {
-    return true;
-  }
-  const size_t kReqCnt = req_count_;
-  bool a[kReqCnt];
-  std::vector<std::future<void>> waits(req_count_);
-
-  for (int i = 0; i < req_count_; i++) {
-    waits[i] = framework::AsyncIO([i, &a, this] { a[i] = Proceed(); });
-  }
-
-  for (int i = 0; i < req_count_; i++) {
-    waits[i].wait();
-  }
-
-  int last_req_count = req_count_;
-  req_count_ = 0;
-
-  for (int i = 0; i < last_req_count; i++) {
-    if (!a[i]) {
-      return false;
-    }
-  }
-
-  return true;
+void GRPCClient::Wait() {
+  std::unique_lock<std::mutex> lk(sync_mutex_);
+  sync_cond_.wait(lk, [this] { return req_count_ == 0; });
 }
 
-bool RPCClient::Proceed() {
-  void* tag = NULL;
+void GRPCClient::Proceed() {
+  void* tag = nullptr;
   bool ok = false;
 
-  // request counts.
-  if (!cq_.Next(&tag, &ok)) {
-    LOG(ERROR) << "Get meets CompletionQueue error";
-    return false;
-  }
-
-  GPR_ASSERT(ok);
-  PADDLE_ENFORCE(tag);
-
-  // TODO(gongwb): add more retries.
-  BaseProcessor* c = static_cast<BaseProcessor*>(tag);
-  if (!c->status_.ok()) {
-    LOG(ERROR) << "proc param error:" << c->var_h_.String()
-               << " grpc error:" << c->status_.error_message();
+  while (cq_.Next(&tag, &ok)) {
+    BaseProcessor* c = static_cast<BaseProcessor*>(tag);
+    GPR_ASSERT(ok);
+    PADDLE_ENFORCE(c);
+    if (c->status_.ok()) {
+      c->Process();
+    } else {
+      LOG(ERROR) << "var: " << c->var_h_.String()
+                 << " grpc error:" << c->status_.error_message();
+    }
     delete c;
-    return false;
+    {
+      std::lock_guard<std::mutex> lk(sync_mutex_);
+      req_count_--;
+    }
+    sync_cond_.notify_all();
   }
-
-  c->Process();
-  delete c;
-  return true;
 }
 
-std::shared_ptr<grpc::Channel> RPCClient::GetChannel(const std::string& ep) {
+std::shared_ptr<grpc::Channel> GRPCClient::GetChannel(const std::string& ep) {
+  // TODO(Yancey1989): make grpc client completely thread-safe
+  std::lock_guard<std::mutex> guard(chan_mutex_);
   auto it = channels_.find(ep);
   if (it != channels_.end()) {
     return it->second;
@@ -257,7 +252,6 @@ std::shared_ptr<grpc::Channel> RPCClient::GetChannel(const std::string& ep) {
 
   auto ch =
       grpc::CreateCustomChannel(ep, grpc::InsecureChannelCredentials(), args);
-
   channels_[ep] = ch;
   return ch;
 }
diff --git a/paddle/fluid/operators/detail/grpc_client.h b/paddle/fluid/operators/detail/grpc_client.h
index f6229b71bc01a6de51f50f5fe880ada6e15e74dd..8db73f875e3e2048386e91f6b5efb29b4ee7e193 100644
--- a/paddle/fluid/operators/detail/grpc_client.h
+++ b/paddle/fluid/operators/detail/grpc_client.h
@@ -16,14 +16,18 @@ limitations under the License. */
 
 #include <time.h>
 
-#include <chrono>  // NOLINT
+#include <chrono>              // NOLINT
+#include <condition_variable>  // NOLINT
 #include <ctime>
 #include <functional>
 #include <iostream>
 #include <map>
+#include <mutex>  // NOLINT
 #include <string>
+#include <thread>  // NOLINT
 #include <vector>
 
+#include "grpc++/channel.h"
 #include "grpc++/generic/generic_stub.h"
 #include "grpc++/grpc++.h"
 #include "grpc++/support/byte_buffer.h"
@@ -34,7 +38,9 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/operators/detail/rpc_client.h"
 #include "paddle/fluid/operators/detail/sendrecvop_utils.h"
+#include "paddle/fluid/platform/macros.h"  // for DISABLE_COPY_AND_ASSIGN
 
 namespace paddle {
 namespace operators {
@@ -57,7 +63,9 @@ void ProcGetResponse(const VarHandle& var_h, const grpc::ByteBuffer& msg);
 
 class BaseProcessor {
  public:
-  explicit BaseProcessor(std::shared_ptr<grpc::Channel> ch) { context_ = NULL; }
+  explicit BaseProcessor(std::shared_ptr<grpc::Channel> ch) {
+    context_ = nullptr;
+  }
 
   virtual ~BaseProcessor() {}
 
@@ -105,7 +113,7 @@ class SendProcessor : public BaseProcessor {
 
   ::grpc::GenericStub stub_g_;
   ::grpc::ByteBuffer reply_;
-  RequestSendCallBack response_call_back_ = NULL;
+  RequestSendCallBack response_call_back_ = nullptr;
 };
 
 typedef std::function<void(const VarHandle&, const ::grpc::ByteBuffer&)>
@@ -157,43 +165,60 @@ class FetchBarrierProcessor : public BaseProcessor {
   std::unique_ptr<sendrecv::SendRecvService::Stub> stub_;
 };
 
-class RPCClient {
+class GRPCClient : public RPCClient {
  public:
-  bool AsyncSendVariable(const std::string& ep,
-                         const platform::DeviceContext& ctx,
-                         const framework::Scope& scope,
-                         const std::string& var_name,
-                         int64_t time_out = 600 * 1000);
+  GRPCClient() {}
+  virtual ~GRPCClient();
+
+  bool AsyncSendVar(const std::string& ep, const platform::DeviceContext& ctx,
+                    const framework::Scope& scope, const std::string& var_name,
+                    int64_t time_out = RPCClient::rpc_time_out) override;
 
-  bool AsyncGetVariable(const std::string& ep,
+  bool AsyncGetVar(const std::string& ep, const platform::DeviceContext& ctx,
+                   const framework::Scope& scope, const std::string& var_name,
+                   int64_t time_out = RPCClient::rpc_time_out) override;
+
+  bool AsyncPrefetchVar(const std::string& ep,
                         const platform::DeviceContext& ctx,
                         const framework::Scope& scope,
-                        const std::string& var_name,
-                        int64_t time_out = 600 * 1000);
+                        const std::string& in_var_name,
+                        const std::string& out_var_name,
+                        int64_t time_out = RPCClient::rpc_time_out) override;
 
-  bool AsyncPrefetchVariable(const std::string& ep,
-                             const platform::DeviceContext& ctx,
-                             const framework::Scope& scope,
-                             const std::string& in_var_name,
-                             const std::string& out_var_name,
-                             int64_t time_out = 600 * 1000);
+  void AsyncSendBatchBarrier(
+      const std::string& ep,
+      int64_t time_out = RPCClient::rpc_time_out) override;
 
-  void AsyncSendBatchBarrier(const std::string& ep,
-                             int64_t time_out = 600 * 1000);
+  void AsyncSendFetchBarrier(
+      const std::string& ep,
+      int64_t time_out = RPCClient::rpc_time_out) override;
 
-  void AsyncSendFetchBarrier(const std::string& ep,
-                             int64_t time_out = 600 * 1000);
+  void Wait() override;
 
-  bool Wait();
+ protected:
+  void InitImpl() override;
 
  private:
-  bool Proceed();
+  // InitEventLoop should only be called by Init()
+  void InitEventLoop();
+
+  void Proceed();
+
   std::shared_ptr<grpc::Channel> GetChannel(const std::string& ep);
 
  private:
   grpc::CompletionQueue cq_;
-  std::map<std::string, std::shared_ptr<grpc::Channel>> channels_;
-  int64_t req_count_ = 0;
+  std::unordered_map<std::string, std::shared_ptr<grpc::Channel>> channels_;
+  std::unique_ptr<std::thread> client_thread_;
+
+  // mutex for Wait client sync
+  std::mutex sync_mutex_;
+  std::condition_variable sync_cond_;
+  std::atomic<int64_t> req_count_{0};
+
+  // mutex for GetChannel thread safety
+  std::mutex chan_mutex_;
+  DISABLE_COPY_AND_ASSIGN(GRPCClient);
 };
 
 }  // namespace detail
diff --git a/paddle/fluid/operators/detail/grpc_server.cc b/paddle/fluid/operators/detail/grpc_server.cc
index e6ee28ea8d920ef80fead258a9bd0d5f6762c879..57867aad4d679f75ea790b65b5773a73586fd96e 100644
--- a/paddle/fluid/operators/detail/grpc_server.cc
+++ b/paddle/fluid/operators/detail/grpc_server.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/*Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,17 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/detail/grpc_server.h"
-
 #include <limits>
 #include <string>
 
+#include "paddle/fluid/operators/detail/grpc_server.h"
+
 using ::grpc::ServerAsyncResponseWriter;
 
 namespace paddle {
 namespace operators {
 namespace detail {
-
 enum CallStatus { PROCESS = 0, FINISH };
 
 // reference:
@@ -30,335 +29,315 @@ enum CallStatus { PROCESS = 0, FINISH };
 class RequestBase {
  public:
   explicit RequestBase(GrpcService::AsyncService* service,
-                       ::grpc::ServerCompletionQueue* cq, bool sync_mode,
-                       const platform::DeviceContext* dev_ctx)
+                       ::grpc::ServerCompletionQueue* cq,
+                       RequestHandler* request_handler, int req_id)
       : service_(service),
         cq_(cq),
-        sync_mode_(sync_mode),
         status_(PROCESS),
-        dev_ctx_(dev_ctx) {
+        request_handler_(request_handler),
+        req_id_(req_id) {
     PADDLE_ENFORCE(cq_);
   }
   virtual ~RequestBase() {}
-  virtual void Process() { assert(false); }
+  virtual void Process() = 0;
 
   CallStatus Status() { return status_; }
   void SetStatus(CallStatus status) { status_ = status; }
-  virtual std::string GetReqName() {
-    assert(false);
-    return "";
-  }
+  virtual std::string GetReqName() = 0;
 
  protected:
   ::grpc::ServerContext ctx_;
   GrpcService::AsyncService* service_;
   ::grpc::ServerCompletionQueue* cq_;
-  const bool sync_mode_;
   CallStatus status_;
-  const platform::DeviceContext* dev_ctx_;
+  RequestHandler* request_handler_;
+  int req_id_;
 };
 
 class RequestSend final : public RequestBase {
  public:
   explicit RequestSend(GrpcService::AsyncService* service,
-                       ::grpc::ServerCompletionQueue* cq, bool sync_mode,
-                       framework::Scope* scope, ReceivedQueue* queue,
-                       const platform::DeviceContext* dev_ctx)
-      : RequestBase(service, cq, sync_mode, dev_ctx),
-        queue_(queue),
-        responder_(&ctx_) {
-    if (sync_mode_) {
-      request_.reset(new VariableResponse(scope, dev_ctx_, false));
-    } else {
-      request_.reset(new VariableResponse(scope, dev_ctx_, true));
-    }
+                       ::grpc::ServerCompletionQueue* cq,
+                       RequestHandler* request_handler, int req_id)
+      : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) {
+    request_.reset(new VariableResponse(request_handler->scope(),
+                                        request_handler->dev_ctx(),
+                                        !request_handler->sync_mode()));
     int method_id = static_cast<int>(detail::GrpcMethod::kSendVariable);
-    service_->RequestAsyncUnary(method_id, &ctx_, request_.get(), &responder_,
-                                cq_, cq_, this);
+    service_->RequestAsyncUnary(
+        method_id, &ctx_, request_.get(), &responder_, cq_, cq_,
+        reinterpret_cast<void*>(static_cast<intptr_t>(req_id)));
   }
-
   virtual ~RequestSend() {}
+  std::string GetReqName() override { return request_->Varname(); }
 
-  virtual std::string GetReqName() { return request_->Varname(); }
+  void Process() override {
+    std::string varname = GetReqName();
+    VLOG(3) << "RequestSend var_name:" << varname;
 
-  virtual void Process() {
-    std::string var_name = GetReqName();
-    VLOG(3) << "RequestSend " << var_name;
-    queue_->Push(std::make_pair(var_name, request_));
+    auto scope = request_->GetMutableLocalScope();
+    auto invar = request_->GetVar();
+    framework::Variable* outvar = nullptr;
 
-    sendrecv::VoidMessage reply;
-    responder_.Finish(reply, ::grpc::Status::OK, this);
+    request_handler_->Handle(varname, scope, invar, &outvar);
     status_ = FINISH;
+    responder_.Finish(reply_, ::grpc::Status::OK,
+                      reinterpret_cast<void*>(static_cast<intptr_t>(req_id_)));
   }
 
  protected:
+  sendrecv::VoidMessage reply_;
   std::shared_ptr<VariableResponse> request_;
-  ReceivedQueue* queue_;
   ServerAsyncResponseWriter<sendrecv::VoidMessage> responder_;
 };
 
 class RequestGet final : public RequestBase {
  public:
   explicit RequestGet(GrpcService::AsyncService* service,
-                      ::grpc::ServerCompletionQueue* cq, bool sync_mode,
-                      framework::Scope* scope,
-                      const platform::DeviceContext* dev_ctx,
-                      framework::BlockingQueue<MessageWithName>* queue)
-      : RequestBase(service, cq, sync_mode, dev_ctx),
-        responder_(&ctx_),
-        scope_(scope),
-        queue_(queue) {
+                      ::grpc::ServerCompletionQueue* cq,
+                      RequestHandler* request_handler, int req_id)
+      : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) {
     auto method_id = static_cast<int>(detail::GrpcMethod::kGetVariable);
-    service_->RequestAsyncUnary(method_id, &ctx_, &request_, &responder_, cq_,
-                                cq_, this);
+    service_->RequestAsyncUnary(
+        method_id, &ctx_, &request_, &responder_, cq_, cq_,
+        reinterpret_cast<void*>(static_cast<intptr_t>(req_id)));
   }
 
   virtual ~RequestGet() {}
 
-  virtual std::string GetReqName() { return request_.varname(); }
+  std::string GetReqName() override { return request_.varname(); }
 
-  virtual void Process() {
+  void Process() override {
     // proc request.
-    std::string var_name = request_.varname();
-    VLOG(3) << "RequestGet " << var_name;
-    auto* var = scope_->FindVar(var_name);
+    std::string varname = request_.varname();
+    VLOG(3) << "RequestGet " << varname;
 
-    ::grpc::ByteBuffer reply;
-    if (var_name != FETCH_BARRIER_MESSAGE) {
-      SerializeToByteBuffer(var_name, var, *dev_ctx_, &reply);
-    }
+    auto scope = request_handler_->scope();
+    auto invar = scope->FindVar(varname);
+    framework::Variable* outvar = nullptr;
 
-    responder_.Finish(reply, ::grpc::Status::OK, this);
-    status_ = FINISH;
+    request_handler_->Handle(varname, scope, invar, &outvar);
 
-    if (var_name == FETCH_BARRIER_MESSAGE) {
-      sendrecv::VariableMessage msg;
-      MessageWithName msg_with_name = std::make_pair(var_name, msg);
-      queue_->Push(msg_with_name);
+    if (outvar) {
+      SerializeToByteBuffer(varname, outvar, *request_handler_->dev_ctx(),
+                            &reply_);
     }
+    status_ = FINISH;
+    responder_.Finish(reply_, ::grpc::Status::OK,
+                      reinterpret_cast<void*>(static_cast<intptr_t>(req_id_)));
   }
 
  protected:
   sendrecv::VariableMessage request_;
+  ::grpc::ByteBuffer reply_;
   ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_;
-  framework::Scope* scope_;
-  framework::BlockingQueue<MessageWithName>* queue_;
 };
 
 class RequestPrefetch final : public RequestBase {
  public:
   explicit RequestPrefetch(GrpcService::AsyncService* service,
-                           ::grpc::ServerCompletionQueue* cq, bool sync_mode,
-                           framework::Scope* scope,
-                           const platform::DeviceContext* dev_ctx,
-                           framework::Executor* executor,
-                           framework::ProgramDesc* program,
-                           framework::ExecutorPrepareContext* prefetch_ctx)
-      : RequestBase(service, cq, sync_mode, dev_ctx),
+                           ::grpc::ServerCompletionQueue* cq,
+                           RequestHandler* request_handler, int req_id)
+      : RequestBase(service, cq, request_handler, req_id),
         responder_(&ctx_),
-        scope_(scope),
-        executor_(executor),
-        program_(program),
-        prefetch_ctx_(prefetch_ctx) {
-    if (sync_mode_) {
-      request_.reset(new VariableResponse(scope, dev_ctx_, false));
-    } else {
-      request_.reset(new VariableResponse(scope, dev_ctx_, true));
-    }
+        local_scope_(nullptr) {
+    request_.reset(new VariableResponse(request_handler->scope(),
+                                        request_handler->dev_ctx(), true));
     int method_id = static_cast<int>(detail::GrpcMethod::kPrefetchVariable);
-    service_->RequestAsyncUnary(method_id, &ctx_, request_.get(), &responder_,
-                                cq_, cq_, this);
+    service_->RequestAsyncUnary(
+        method_id, &ctx_, request_.get(), &responder_, cq_, cq_,
+        reinterpret_cast<void*>(static_cast<intptr_t>(req_id)));
   }
 
   virtual ~RequestPrefetch() {}
 
-  virtual std::string GetReqName() { return request_->Varname(); }
+  std::string GetReqName() override { return request_->Varname(); }
 
-  virtual void Process() {
+  void Process() override {
     // prefetch process...
-    ::grpc::ByteBuffer reply;
+    std::string varname = request_->OutVarname();
+    VLOG(3) << "RequestPrefetch " << varname;
 
-    std::string var_name = request_->OutVarname();
-    VLOG(3) << "RequestPrefetch " << var_name;
-    auto var_desc = program_->Block(0).FindVar(var_name);
-    framework::Scope* local_scope = &scope_->NewScope();
-    auto* var = local_scope->FindVar(var_name);
-    InitializeVariable(var, var_desc->GetType());
-    executor_->RunPreparedContext(prefetch_ctx_, scope_, false, false);
+    auto scope = request_->GetMutableLocalScope();
+    auto invar = scope->FindVar(varname);
+    framework::Variable* outvar = nullptr;
 
-    SerializeToByteBuffer(var_name, var, *dev_ctx_, &reply);
+    request_handler_->Handle(varname, scope, invar, &outvar);
 
-    responder_.Finish(reply, ::grpc::Status::OK, this);
+    SerializeToByteBuffer(varname, outvar, *request_handler_->dev_ctx(),
+                          &reply_);
+    responder_.Finish(reply_, ::grpc::Status::OK,
+                      reinterpret_cast<void*>(static_cast<intptr_t>(req_id_)));
     status_ = FINISH;
   }
 
  protected:
   std::shared_ptr<VariableResponse> request_;
+  ::grpc::ByteBuffer reply_;
   ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_;
-  framework::Scope* scope_;
-  framework::Executor* executor_;
-  framework::ProgramDesc* program_;
-  framework::ExecutorPrepareContext* prefetch_ctx_;
+  framework::Scope* local_scope_;
 };
 
-void AsyncGRPCServer::WaitClientGet(int count) {
-  int fetch_barriers = 0;
-  while (fetch_barriers < count) {
-    auto msg = var_get_queue_.Pop();
-    if (msg.first == FETCH_BARRIER_MESSAGE) {
-      fetch_barriers++;
-    }
-  }
-}
-
 void AsyncGRPCServer::WaitServerReady() {
+  VLOG(3) << "AsyncGRPCServer is wait server ready";
   std::unique_lock<std::mutex> lock(this->mutex_ready_);
   condition_ready_.wait(lock, [=] { return this->ready_ == 1; });
+  VLOG(3) << "AsyncGRPCServer WaitSeverReady";
 }
 
-void AsyncGRPCServer::RunSyncUpdate() {
+void AsyncGRPCServer::StartServer() {
   ::grpc::ServerBuilder builder;
-  builder.AddListeningPort(address_, ::grpc::InsecureServerCredentials(),
+  builder.AddListeningPort(bind_address_, ::grpc::InsecureServerCredentials(),
                            &selected_port_);
+
   builder.SetMaxSendMessageSize(std::numeric_limits<int>::max());
   builder.SetMaxReceiveMessageSize(std::numeric_limits<int>::max());
   builder.RegisterService(&service_);
 
-  cq_send_ = builder.AddCompletionQueue();
-  cq_get_ = builder.AddCompletionQueue();
-  cq_prefetch_ = builder.AddCompletionQueue();
+  for (auto t : rpc_call_map_) {
+    rpc_cq_[t.first].reset(builder.AddCompletionQueue().release());
+  }
 
   server_ = builder.BuildAndStart();
-  LOG(INFO) << "Server listening on " << address_
+  LOG(INFO) << "Server listening on " << bind_address_
             << " selected port: " << selected_port_;
 
-  std::function<void()> send_register =
-      std::bind(&AsyncGRPCServer::TryToRegisterNewSendOne, this);
-  std::function<void()> get_register =
-      std::bind(&AsyncGRPCServer::TryToRegisterNewGetOne, this);
-  std::function<void()> prefetch_register =
-      std::bind(&AsyncGRPCServer::TryToRegisterNewPrefetchOne, this);
-
-  // TODO(wuyi): Run these "HandleRequest" in thread pool
-  t_send_.reset(
-      new std::thread(std::bind(&AsyncGRPCServer::HandleRequest, this,
-                                cq_send_.get(), "cq_send", send_register)));
-  t_get_.reset(
-      new std::thread(std::bind(&AsyncGRPCServer::HandleRequest, this,
-                                cq_get_.get(), "cq_get", get_register)));
-  t_prefetch_.reset(new std::thread(
-      std::bind(&AsyncGRPCServer::HandleRequest, this, cq_prefetch_.get(),
-                "cq_prefetch", prefetch_register)));
+  std::function<void(const std::string&, int)> f =
+      std::bind(&AsyncGRPCServer::TryToRegisterNewOne, this,
+                std::placeholders::_1, std::placeholders::_2);
+
+  for (auto& t : rpc_call_map_) {
+    auto& rpc_name = t.first;
+    auto& cq = rpc_cq_[rpc_name];
+    auto threadnum = rpc_thread_num_[rpc_name];
+    auto& reqs = rpc_reqs_[rpc_name];
+
+    reqs.reserve(kRequestBufSize);
+
+    for (int i = 0; i < kRequestBufSize; i++) {
+      TryToRegisterNewOne(rpc_name, i);
+    }
+
+    for (int i = 0; i < threadnum; i++) {
+      rpc_threads_[rpc_name].emplace_back(new std::thread(std::bind(
+          &AsyncGRPCServer::HandleRequest, this, cq.get(), rpc_name, f)));
+      VLOG(3) << t.first << " creates threads!";
+    }
+  }
 
   {
     std::lock_guard<std::mutex> lock(this->mutex_ready_);
     ready_ = 1;
   }
   condition_ready_.notify_all();
+
   // wait server
   server_->Wait();
-  t_send_->join();
-  t_get_->join();
-  t_prefetch_->join();
+
+  for (auto& t : rpc_threads_) {
+    auto& threads = t.second;
+    for (size_t i = 0; i < threads.size(); ++i) {
+      threads[i]->join();
+      VLOG(3) << t.first << " threads ends!";
+    }
+  }
 }
 
 void AsyncGRPCServer::ShutdownQueue() {
-  std::unique_lock<std::mutex> lock(cq_mutex_);
-  cq_send_->Shutdown();
-  cq_get_->Shutdown();
-  cq_prefetch_->Shutdown();
+  for (auto& t : rpc_cq_) {
+    t.second->Shutdown();
+    VLOG(3) << t.first << " shutdown!";
+  }
 }
 
-// This URL explains why shutdown is complicate:
-void AsyncGRPCServer::ShutDown() {
+void AsyncGRPCServer::ShutDownImpl() {
+  std::unique_lock<std::mutex> lock(cq_mutex_);
   is_shut_down_ = true;
   ShutdownQueue();
+
+  VLOG(3) << "server_ shutdown!";
   server_->Shutdown();
 }
 
-void AsyncGRPCServer::TryToRegisterNewSendOne() {
+void AsyncGRPCServer::TryToRegisterNewOne(const std::string& rpc_name,
+                                          int req_id) {
   std::unique_lock<std::mutex> lock(cq_mutex_);
   if (is_shut_down_) {
     VLOG(3) << "shutdown, do not TryToRegisterNewSendOne";
     return;
   }
-  RequestSend* send = new RequestSend(&service_, cq_send_.get(), sync_mode_,
-                                      scope_, &var_recv_queue_, dev_ctx_);
-  VLOG(4) << "Create RequestSend status:" << send->Status();
-}
 
-void AsyncGRPCServer::TryToRegisterNewGetOne() {
-  std::unique_lock<std::mutex> lock(cq_mutex_);
-  if (is_shut_down_) {
-    VLOG(3) << "shutdown, do not TryToRegisterNewGetOne";
-    return;
+  VLOG(4) << "register send rpc_name:" << rpc_name
+          << ", handler:" << rpc_call_map_[kRequestSend];
+
+  auto& reqs = rpc_reqs_[rpc_name];
+  auto& handler = rpc_call_map_[rpc_name];
+  auto& cq = rpc_cq_[rpc_name];
+
+  RequestBase* b = nullptr;
+  if (rpc_name == kRequestSend) {
+    b = new RequestSend(&service_, cq.get(), handler, req_id);
+  } else if (rpc_name == kRequestGet) {
+    b = new RequestGet(&service_, cq.get(), handler, req_id);
+  } else if (rpc_name == kRequestPrefetch) {
+    b = new RequestPrefetch(&service_, cq.get(), handler, req_id);
+  } else {
+    PADDLE_ENFORCE(false, "not surpported rpc");
   }
-  RequestGet* get = new RequestGet(&service_, cq_get_.get(), sync_mode_, scope_,
-                                   dev_ctx_, &var_get_queue_);
-  VLOG(4) << "Create RequestGet status:" << get->Status();
-}
 
-void AsyncGRPCServer::TryToRegisterNewPrefetchOne() {
-  std::unique_lock<std::mutex> lock(cq_mutex_);
-  if (is_shut_down_) {
-    VLOG(3) << "shutdown, do not TryToRegisterNewPrefetchOne";
-    return;
-  }
-  RequestPrefetch* prefetch =
-      new RequestPrefetch(&service_, cq_prefetch_.get(), sync_mode_, scope_,
-                          dev_ctx_, executor_, program_, prefetch_ctx_);
+  reqs[req_id] = b;
 
-  VLOG(4) << "Create RequestPrefetch status:" << prefetch->Status();
+  VLOG(4) << "Create RequestSend status:" << b->Status();
 }
 
-// FIXME(typhoonzero): change cq_name to enum.
-void AsyncGRPCServer::HandleRequest(::grpc::ServerCompletionQueue* cq,
-                                    const std::string& cq_name,
-                                    std::function<void()> TryToRegisterNewOne) {
-  TryToRegisterNewOne();
-
+void AsyncGRPCServer::HandleRequest(
+    ::grpc::ServerCompletionQueue* cq, const std::string& rpc_name,
+    std::function<void(const std::string&, int)> TryToRegisterNewOne) {
   void* tag = NULL;
   bool ok = false;
 
   while (true) {
-    VLOG(3) << "HandleRequest for " << cq_name << " wait Next";
+    VLOG(3) << "HandleRequest " << rpc_name << " wait next";
     if (!cq->Next(&tag, &ok)) {
-      LOG(INFO) << cq_name << " CompletionQueue shutdown!";
+      LOG(INFO) << "CompletionQueue " << rpc_name << " shutdown!";
       break;
     }
-    VLOG(3) << "HandleRequest for " << cq_name << " get Next";
 
-    PADDLE_ENFORCE(tag);
+    int req_id = static_cast<int>(reinterpret_cast<intptr_t>(tag));
+    VLOG(3) << "HandleRequest " << rpc_name << ", req_id:" << req_id
+            << " get next";
 
-    if (sync_mode_) {
-      // FIXME(typhoonzero): de-couple the barriers with recv_op
-      if (!is_shut_down_ && cq_name == "cq_get") WaitCond(1);
-      if (!is_shut_down_ && cq_name == "cq_send") WaitCond(0);
-      VLOG(3) << "HandleRequest for " << cq_name << " after WaitCond";
+    auto& reqs = rpc_reqs_[rpc_name];
+    RequestBase* base = nullptr;
+    {
+      PADDLE_ENFORCE(req_id >= 0 && req_id < kRequestBufSize);
+      std::unique_lock<std::mutex> lock(cq_mutex_);
+      base = reqs[req_id];
     }
 
-    RequestBase* base = reinterpret_cast<RequestBase*>(tag);
     // reference:
     // https://github.com/tensorflow/tensorflow/issues/5596
     // https://groups.google.com/forum/#!topic/grpc-io/xftlRy-IQwM
     // https://groups.google.com/forum/#!topic/grpc-io/ywATt88Ef_I
     if (!ok) {
-      LOG(WARNING) << cq_name << " recv no regular event:argument name["
+      LOG(WARNING) << "completion queue:" << rpc_name
+                   << " recv no regular event:argument name["
                    << base->GetReqName() << "]";
-      TryToRegisterNewOne();
+      TryToRegisterNewOne(rpc_name, req_id);
       delete base;
       continue;
     }
 
+    VLOG(3) << "queue id:" << rpc_name << ", req_id:" << req_id
+            << ", status:" << base->Status();
+
     switch (base->Status()) {
       case PROCESS: {
-        TryToRegisterNewOne();
         base->Process();
-        VLOG(4) << cq_name << " PROCESS status:" << base->Status();
         break;
       }
       case FINISH: {
-        VLOG(4) << cq_name << " FINISH status:" << base->Status();
+        TryToRegisterNewOne(rpc_name, req_id);
         delete base;
         break;
       }
@@ -367,20 +346,6 @@ void AsyncGRPCServer::HandleRequest(::grpc::ServerCompletionQueue* cq,
   }
 }
 
-void AsyncGRPCServer::WaitCond(int cond) {
-  std::unique_lock<std::mutex> lock(this->barrier_mutex_);
-  barrier_condition_.wait(lock,
-                          [=] { return this->barrier_cond_step_ == cond; });
-}
-
-void AsyncGRPCServer::SetCond(int cond) {
-  {
-    std::lock_guard<std::mutex> lock(this->barrier_mutex_);
-    barrier_cond_step_ = cond;
-  }
-  barrier_condition_.notify_all();
-}
-
 }  // namespace detail
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/detail/grpc_server.h b/paddle/fluid/operators/detail/grpc_server.h
index 7f9cae21ccca8dd51f9fbe98148d01a51ac6eb84..e6ffc7066f24d5088a95801ed1c0670b24d5771f 100644
--- a/paddle/fluid/operators/detail/grpc_server.h
+++ b/paddle/fluid/operators/detail/grpc_server.h
@@ -14,9 +14,12 @@ limitations under the License. */
 
 #pragma once
 
+#include <map>
+#include <set>
 #include <string>
 #include <thread>  // NOLINT
 #include <utility>
+#include <vector>
 
 #include "grpc++/grpc++.h"
 #include "paddle/fluid/framework/blocking_queue.h"
@@ -27,102 +30,57 @@ limitations under the License. */
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/framework/var_type.h"
 #include "paddle/fluid/operators/detail/grpc_service.h"
+#include "paddle/fluid/operators/detail/request_handler.h"
+#include "paddle/fluid/operators/detail/rpc_server.h"
 #include "paddle/fluid/operators/detail/send_recv.grpc.pb.h"
 #include "paddle/fluid/operators/detail/send_recv.pb.h"
 #include "paddle/fluid/operators/detail/sendrecvop_utils.h"
+#include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
 namespace operators {
 namespace detail {
 
-typedef std::pair<std::string, std::shared_ptr<VariableResponse>>
-    ReceivedMessage;
-typedef framework::BlockingQueue<ReceivedMessage> ReceivedQueue;
-
-typedef std::pair<std::string, sendrecv::VariableMessage> MessageWithName;
 class RequestBase;
 
-class AsyncGRPCServer final {
+class AsyncGRPCServer final : public RPCServer {
  public:
-  explicit AsyncGRPCServer(const std::string &address, bool sync_mode)
-      : address_(address), sync_mode_(sync_mode), ready_(0) {}
-
-  void WaitServerReady();
-  void RunSyncUpdate();
-
-  // functions to sync server barrier status.
-  void WaitCond(int cond);
-  void SetCond(int cond);
-  void WaitClientGet(int count);
-
-  void SetScope(framework::Scope *scope) { scope_ = scope; }
-
-  void SetDevCtx(const platform::DeviceContext *dev_ctx) { dev_ctx_ = dev_ctx; }
-
-  void SetProgram(framework::ProgramDesc *program) { program_ = program; }
-
-  void SetExecutor(framework::Executor *executor) { executor_ = executor; }
-
-  void SetPrefetchPreparedCtx(framework::ExecutorPrepareContext *prepared) {
-    prefetch_ctx_ = prepared;
-  }
-
-  int GetSelectedPort() const { return selected_port_; }
+  explicit AsyncGRPCServer(const std::string& address, int client_num)
+      : RPCServer(address, client_num), ready_(0) {}
 
-  const ReceivedMessage Get() { return this->var_recv_queue_.Pop(); }
+  virtual ~AsyncGRPCServer() {}
+  void WaitServerReady() override;
+  void StartServer() override;
 
-  void Push(const std::string &msg_name) {
-    this->var_recv_queue_.Push(std::make_pair(msg_name, nullptr));
-  }
-
-  void ShutDown();
+ private:
+  void HandleRequest(
+      ::grpc::ServerCompletionQueue* cq, const std::string& rpc_name,
+      std::function<void(const std::string&, int)> TryToRegisterNewOne);
 
- protected:
-  void HandleRequest(::grpc::ServerCompletionQueue *cq,
-                     const std::string &cq_name,
-                     std::function<void()> TryToRegisterNewOne);
-  void TryToRegisterNewSendOne();
-  void TryToRegisterNewGetOne();
-  void TryToRegisterNewPrefetchOne();
+  void TryToRegisterNewOne(const std::string& rpc_name, int req_id);
   void ShutdownQueue();
+  void ShutDownImpl() override;
 
  private:
+  static const int kRequestBufSize = 100;
+
   std::mutex cq_mutex_;
   volatile bool is_shut_down_ = false;
-  std::unique_ptr<::grpc::ServerCompletionQueue> cq_send_;
-  std::unique_ptr<::grpc::ServerCompletionQueue> cq_get_;
-  std::unique_ptr<::grpc::ServerCompletionQueue> cq_prefetch_;
 
   GrpcService::AsyncService service_;
   std::unique_ptr<::grpc::Server> server_;
 
-  std::string address_;
-  const bool sync_mode_;
-  framework::Scope *scope_;
-  const platform::DeviceContext *dev_ctx_;
-
-  // received variable from RPC, operators fetch variable from this queue.
-  framework::BlockingQueue<MessageWithName> var_get_queue_;
-  // client send variable to this queue.
-  ReceivedQueue var_recv_queue_;
-
   // condition of the sub program
-  std::mutex barrier_mutex_;
-  mutable int barrier_cond_step_;
   std::condition_variable barrier_condition_;
 
-  std::unique_ptr<std::thread> t_send_;
-  std::unique_ptr<std::thread> t_get_;
-  std::unique_ptr<std::thread> t_prefetch_;
-
-  framework::ExecutorPrepareContext *prefetch_ctx_;
-  framework::ProgramDesc *program_;
-  framework::Executor *executor_;
-  int selected_port_;
-
   std::mutex mutex_ready_;
   std::condition_variable condition_ready_;
+
   int ready_;
+
+  std::map<std::string, std::unique_ptr<::grpc::ServerCompletionQueue>> rpc_cq_;
+  std::map<std::string, std::vector<std::unique_ptr<std::thread>>> rpc_threads_;
+  std::map<std::string, std::vector<RequestBase*>> rpc_reqs_;
 };
 
 };  // namespace detail
diff --git a/paddle/fluid/operators/detail/grpc_server_test.cc b/paddle/fluid/operators/detail/grpc_server_test.cc
index 25b95d608d10d6e456d5f563ce9fbe35d812cb0f..a1f9ba15e656a686c4bb0d81cf00dea120b8c0ad 100644
--- a/paddle/fluid/operators/detail/grpc_server_test.cc
+++ b/paddle/fluid/operators/detail/grpc_server_test.cc
@@ -19,18 +19,22 @@ limitations under the License. */
 #include "gtest/gtest.h"
 #include "paddle/fluid/operators/detail/grpc_client.h"
 #include "paddle/fluid/operators/detail/grpc_server.h"
+#include "paddle/fluid/operators/detail/rpc_client.h"
 
 #include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 
+#include "paddle/fluid/operators/detail/request_handler_impl.h"
+
 namespace framework = paddle::framework;
 namespace platform = paddle::platform;
 namespace detail = paddle::operators::detail;
 
 USE_OP(lookup_table);
 
-std::unique_ptr<detail::AsyncGRPCServer> rpc_service_;
+std::unique_ptr<detail::AsyncGRPCServer> g_rpc_service;
+std::unique_ptr<detail::RequestHandler> g_req_handler;
 
 framework::BlockDesc* AppendPrefetchBlcok(framework::ProgramDesc* program) {
   auto root_block = program->MutableBlock(0);
@@ -88,8 +92,7 @@ void InitTensorsOnServer(framework::Scope* scope, platform::CPUPlace* place,
   }
 }
 
-void StartServer(const std::string& endpoint) {
-  rpc_service_.reset(new detail::AsyncGRPCServer(endpoint, true));
+void StartServer() {
   framework::ProgramDesc program;
   framework::Scope scope;
   platform::CPUPlace place;
@@ -99,42 +102,57 @@ void StartServer(const std::string& endpoint) {
   auto prepared = exe.Prepare(program, block->ID());
   InitTensorsOnServer(&scope, &place, 10);
 
-  rpc_service_->SetProgram(&program);
-  rpc_service_->SetPrefetchPreparedCtx(prepared.get());
-  rpc_service_->SetDevCtx(&ctx);
-  rpc_service_->SetScope(&scope);
-  rpc_service_->SetExecutor(&exe);
+  g_req_handler->SetProgram(&program);
+  g_req_handler->SetPrefetchPreparedCtx(std::move(prepared));
+  g_req_handler->SetDevCtx(&ctx);
+  g_req_handler->SetScope(&scope);
+  g_req_handler->SetExecutor(&exe);
+
+  g_rpc_service->RegisterRPC(detail::kRequestPrefetch, g_req_handler.get());
+  g_req_handler->SetRPCServer(g_rpc_service.get());
+
+  std::thread server_thread(
+      std::bind(&detail::AsyncGRPCServer::StartServer, g_rpc_service.get()));
 
-  rpc_service_->RunSyncUpdate();
+  server_thread.join();
 }
 
 TEST(PREFETCH, CPU) {
-  // start up a server instance backend
-  std::thread server_thread(StartServer, "127.0.0.1:8889");
-  sleep(2);
+  g_req_handler.reset(new detail::RequestPrefetchHandler(true));
+  g_rpc_service.reset(new detail::AsyncGRPCServer("127.0.0.1:0", 1));
+
+  std::thread server_thread(StartServer);
+  g_rpc_service->WaitServerReady();
+
+  detail::RPCClient* client =
+      detail::RPCClient::GetInstance<detail::GRPCClient>();
+  int port = g_rpc_service->GetSelectedPort();
+  std::string ep = paddle::string::Sprintf("127.0.0.1:%d", port);
+
   framework::Scope scope;
   platform::CPUPlace place;
   platform::CPUDeviceContext ctx(place);
-  // create var on local scope
-  int64_t rows_numel = 5;
-  InitTensorsOnClient(&scope, &place, rows_numel);
-  std::string in_var_name("ids");
-  std::string out_var_name("out");
-
-  detail::RPCClient client;
-  client.AsyncPrefetchVariable("127.0.0.1:8889", ctx, scope, in_var_name,
-                               out_var_name);
-  client.Wait();
-
-  auto var = scope.Var(out_var_name);
-  auto value = var->GetMutable<framework::SelectedRows>()->value();
-  auto ptr = value.mutable_data<float>(place);
-
-  rpc_service_->ShutDown();
-  server_thread.join();
-  rpc_service_.reset(nullptr);
-
-  for (int64_t i = 0; i < rows_numel; ++i) {
-    EXPECT_EQ(ptr[0 + i * value.dims()[1]], static_cast<float>(i * 2));
+  {
+    // create var on local scope
+    int64_t rows_numel = 5;
+    InitTensorsOnClient(&scope, &place, rows_numel);
+    std::string in_var_name("ids");
+    std::string out_var_name("out");
+
+    client->AsyncPrefetchVar(ep, ctx, scope, in_var_name, out_var_name);
+    client->Wait();
+    auto var = scope.Var(out_var_name);
+    auto value = var->GetMutable<framework::SelectedRows>()->value();
+    auto ptr = value.mutable_data<float>(place);
+
+    for (int64_t i = 0; i < rows_numel; ++i) {
+      EXPECT_EQ(ptr[0 + i * value.dims()[1]], static_cast<float>(i * 2));
+    }
   }
+
+  g_rpc_service->ShutDown();
+  server_thread.join();
+  LOG(INFO) << "begin reset";
+  g_rpc_service.reset(nullptr);
+  g_req_handler.reset(nullptr);
 }
diff --git a/paddle/fluid/operators/detail/grpc_service.h b/paddle/fluid/operators/detail/grpc_service.h
index e6dab2f5a3a4280f3979417c3ca2d884a0b8ff2f..e0505c2b9d0903837713d7e0032b01ab091c2e04 100644
--- a/paddle/fluid/operators/detail/grpc_service.h
+++ b/paddle/fluid/operators/detail/grpc_service.h
@@ -25,6 +25,8 @@
 #include <grpc++/support/byte_buffer.h>
 #include "paddle/fluid/operators/detail/variable_response.h"
 
+#include "paddle/fluid/platform/profiler.h"
+
 // NOTE: This method was originally created by tensorflow
 //       (https://github.com/tensorflow/tensorflow/) we borrow this
 //       method and did some modifications so that we can parse gRPC
diff --git a/paddle/fluid/operators/detail/request_handler.h b/paddle/fluid/operators/detail/request_handler.h
new file mode 100644
index 0000000000000000000000000000000000000000..d74206aaba6a79ee06475985e642221bd84d9382
--- /dev/null
+++ b/paddle/fluid/operators/detail/request_handler.h
@@ -0,0 +1,120 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <time.h>
+
+#include <functional>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/executor.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/framework/var_type.h"
+#include "paddle/fluid/operators/detail/sendrecvop_utils.h"
+
+namespace paddle {
+namespace operators {
+namespace detail {
+
+constexpr char kRequestSend[] = "RequestSend";
+constexpr char kRequestGet[] = "RequestGet";
+constexpr char kRequestPrefetch[] = "RequestPrefetch";
+
+class RPCServer;
+
+class RequestHandler {
+ public:
+  explicit RequestHandler(bool sync_mode)
+      : sync_mode_(sync_mode),
+        dev_ctx_(nullptr),
+        executor_(nullptr),
+        scope_(nullptr),
+        program_(nullptr),
+        rpc_server_(nullptr) {}
+
+  virtual ~RequestHandler() {}
+
+  // Set attributes.
+  void SetScope(framework::Scope* scope) { scope_ = scope; }
+  void SetDevCtx(const platform::DeviceContext* dev_ctx) { dev_ctx_ = dev_ctx; }
+  void SetProgram(framework::ProgramDesc* program) { program_ = program; }
+  void SetExecutor(framework::Executor* executor) { executor_ = executor; }
+  void SetPrefetchPreparedCtx(
+      std::unique_ptr<framework::ExecutorPrepareContext> prepared) {
+    prefetch_ctx_.reset(prepared.release());
+  }
+
+  // Used for async.
+  void SetGradToPreparedCtx(
+      std::unordered_map<
+          std::string, std::shared_ptr<framework::ExecutorPrepareContext>>* g) {
+    grad_to_prepared_ctx_ = g;
+  }
+
+  void SetRPCServer(RPCServer* rpc_server) { rpc_server_ = rpc_server; }
+
+  // Get attributes.
+  bool sync_mode() { return sync_mode_; }
+  framework::Scope* scope() { return scope_; }
+  const platform::DeviceContext* dev_ctx() { return dev_ctx_; }
+  framework::ExecutorPrepareContext* prefetch_ctx() {
+    return prefetch_ctx_.get();
+  }
+  framework::ProgramDesc* program() { return program_; }
+  framework::Executor* executor() { return executor_; }
+
+  // This function processes user's rpc request.
+  // The implemention is in request_handler_impl.
+  // example:
+  //    std::string varname = request_.varname();
+  //
+  //    auto scope = request_handler_->scope();
+  //    auto invar = scope->FindVar(varname);
+  //    framework::Variable* outvar = nullptr;
+  //
+  //    request_handler_->Handle(varname, scope, invar, &outvar);
+  //    if (outvar) {
+  //        SerializeToByteBuffer(varname, outvar,
+  //           *request_handler_->dev_ctx(), &reply_);
+  //    }
+  virtual bool Handle(const std::string& varname, framework::Scope* scope,
+                      framework::Variable* var,
+                      framework::Variable** outvar) = 0;
+
+ protected:
+  const bool sync_mode_;
+
+  const platform::DeviceContext* dev_ctx_;
+  framework::Executor* executor_;
+  framework::Scope* scope_;
+  framework::ProgramDesc* program_;
+  std::unique_ptr<framework::ExecutorPrepareContext> prefetch_ctx_;
+
+  // Used for async.
+  std::unordered_map<std::string,
+                     std::shared_ptr<framework::ExecutorPrepareContext>>*
+      grad_to_prepared_ctx_;
+  RPCServer* rpc_server_;
+};
+
+}  // namespace detail
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/detail/request_handler_impl.cc b/paddle/fluid/operators/detail/request_handler_impl.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b5ee3ab51ec5e685b41057ba60d6701d61cbc09c
--- /dev/null
+++ b/paddle/fluid/operators/detail/request_handler_impl.cc
@@ -0,0 +1,113 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/framework/blocking_queue.h"
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/operators/detail/request_handler_impl.h"
+#include "paddle/fluid/operators/detail/rpc_server.h"
+#include "paddle/fluid/operators/detail/sendrecvop_utils.h"
+#include "paddle/fluid/operators/detail/variable_response.h"
+
+namespace paddle {
+namespace operators {
+namespace detail {
+
+bool RequestSendHandler::Handle(const std::string& varname,
+                                framework::Scope* scope,
+                                framework::Variable* invar,
+                                framework::Variable** outvar) {
+  VLOG(4) << "RequestSendHandler:" << varname;
+
+  // Async
+  if (!sync_mode_) {
+    try {
+      executor_->RunPreparedContext((*grad_to_prepared_ctx_)[varname].get(),
+                                    scope);
+    } catch (std::exception& e) {
+      LOG(ERROR) << "async: run sub program error " << e.what();
+      return false;
+    }
+    return true;
+  }
+
+  // Sync
+  if (varname == BATCH_BARRIER_MESSAGE) {
+    VLOG(3) << "sync: recv batch barrier message";
+    rpc_server_->IncreaseBatchBarrier(kRequestSend);
+  } else {
+    VLOG(3) << "sync: received var_name: " << varname;
+    if (sync_mode_) {
+      rpc_server_->WaitCond(kRequestSend);
+    }
+
+    if (invar == nullptr) {
+      LOG(ERROR) << "sync: Can not find server side var: " << varname;
+      PADDLE_THROW("sync: Can not find server side var");
+      return false;
+    }
+    if (invar->IsType<framework::SelectedRows>()) {
+      rpc_server_->RecordSparseVar(invar);
+    }
+  }
+
+  return true;
+}
+
+bool RequestGetHandler::Handle(const std::string& varname,
+                               framework::Scope* scope,
+                               framework::Variable* invar,
+                               framework::Variable** outvar) {
+  VLOG(4) << "RequestGetHandler:" << varname;
+
+  if (varname != FETCH_BARRIER_MESSAGE) {
+    if (sync_mode_) {
+      rpc_server_->WaitCond(kRequestGet);
+    }
+    *outvar = scope_->FindVar(varname);
+    return true;
+  }
+
+  // FETCH_BARRIER_MESSAGE
+  if (sync_mode_) {
+    VLOG(3) << "sync: recv fetch barrier message";
+    rpc_server_->IncreaseBatchBarrier(kRequestGet);
+  }
+
+  return true;
+}
+
+bool RequestPrefetchHandler::Handle(const std::string& varname,
+                                    framework::Scope* scope,
+                                    framework::Variable* invar,
+                                    framework::Variable** outvar) {
+  VLOG(4) << "RequestPrefetchHandler " << varname;
+
+  auto var_desc = program_->Block(0).FindVar(varname);
+  *outvar = scope->FindVar(varname);
+  InitializeVariable(*outvar, var_desc->GetType());
+  executor_->RunPreparedContext(prefetch_ctx_.get(), scope);
+
+  return true;
+}
+
+}  // namespace detail
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/detail/request_handler_impl.h b/paddle/fluid/operators/detail/request_handler_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..8d0c62232b68ad6c05e751c25103802ee12db57e
--- /dev/null
+++ b/paddle/fluid/operators/detail/request_handler_impl.h
@@ -0,0 +1,64 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <time.h>
+
+#include <functional>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/executor.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/framework/var_type.h"
+#include "paddle/fluid/operators/detail/request_handler.h"
+#include "paddle/fluid/operators/detail/sendrecvop_utils.h"
+
+namespace paddle {
+namespace operators {
+namespace detail {
+
+class RequestSendHandler final : public RequestHandler {
+ public:
+  explicit RequestSendHandler(bool sync_mode) : RequestHandler(sync_mode) {}
+  virtual ~RequestSendHandler() {}
+  bool Handle(const std::string& varname, framework::Scope* scope,
+              framework::Variable* var, framework::Variable** outvar) override;
+};
+
+class RequestGetHandler final : public RequestHandler {
+ public:
+  explicit RequestGetHandler(bool sync_mode) : RequestHandler(sync_mode) {}
+  virtual ~RequestGetHandler() {}
+  bool Handle(const std::string& varname, framework::Scope* scope,
+              framework::Variable* var, framework::Variable** outvar) override;
+};
+
+class RequestPrefetchHandler final : public RequestHandler {
+ public:
+  explicit RequestPrefetchHandler(bool sync_mode) : RequestHandler(sync_mode) {}
+  virtual ~RequestPrefetchHandler() {}
+  bool Handle(const std::string& varname, framework::Scope* scope,
+              framework::Variable* var, framework::Variable** outvar) override;
+};
+
+}  // namespace detail
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/detail/rpc_client.cc b/paddle/fluid/operators/detail/rpc_client.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9a791403e3d6b99c5d4de5183e83e1af655d7d4c
--- /dev/null
+++ b/paddle/fluid/operators/detail/rpc_client.cc
@@ -0,0 +1,26 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/detail/rpc_client.h"
+
+namespace paddle {
+namespace operators {
+namespace detail {
+
+std::once_flag RPCClient::init_flag_;
+std::unique_ptr<RPCClient> RPCClient::rpc_client_(nullptr);
+
+}  // namespace detail
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/detail/rpc_client.h b/paddle/fluid/operators/detail/rpc_client.h
new file mode 100644
index 0000000000000000000000000000000000000000..8e3717f076db6a52916d0e15813f9f61148c6553
--- /dev/null
+++ b/paddle/fluid/operators/detail/rpc_client.h
@@ -0,0 +1,82 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/scope.h"
+
+namespace paddle {
+namespace operators {
+namespace detail {
+
+class RPCClient {
+ public:
+  virtual bool AsyncSendVar(const std::string& ep,
+                            const platform::DeviceContext& ctx,
+                            const framework::Scope& scope,
+                            const std::string& var_name,
+                            int64_t time_out = rpc_time_out) = 0;
+
+  virtual bool AsyncGetVar(const std::string& ep,
+                           const platform::DeviceContext& ctx,
+                           const framework::Scope& scope,
+                           const std::string& var_name,
+                           int64_t time_out = rpc_time_out) = 0;
+
+  virtual bool AsyncPrefetchVar(const std::string& ep,
+                                const platform::DeviceContext& ctx,
+                                const framework::Scope& scope,
+                                const std::string& in_var_name,
+                                const std::string& out_var_name,
+                                int64_t time_out = rpc_time_out) = 0;
+
+  virtual void AsyncSendBatchBarrier(const std::string& ep,
+                                     int64_t time_out = rpc_time_out) = 0;
+
+  virtual void AsyncSendFetchBarrier(const std::string& ep,
+                                     int64_t time_out = rpc_time_out) = 0;
+
+  virtual void Wait() = 0;
+
+  static constexpr int64_t rpc_time_out = 120 * 1000;
+
+  template <typename T>
+  static RPCClient* GetInstance() {
+    std::call_once(init_flag_, &RPCClient::Init<T>);
+    return rpc_client_.get();
+  }
+
+  // Init is called by GetInstance.
+  template <typename T>
+  static void Init() {
+    if (rpc_client_.get() == nullptr) {
+      rpc_client_.reset(new T());
+      rpc_client_->InitImpl();
+    }
+  }
+
+ protected:
+  virtual void InitImpl() {}
+
+ private:
+  static std::once_flag init_flag_;
+  static std::unique_ptr<RPCClient> rpc_client_;
+};
+}  // namespace detail
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/detail/rpc_server.cc b/paddle/fluid/operators/detail/rpc_server.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7feddbeca89ee97769f5598b0111e50a3f572ce8
--- /dev/null
+++ b/paddle/fluid/operators/detail/rpc_server.cc
@@ -0,0 +1,126 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <fstream>
+#include <iostream>
+#include <limits>
+#include <string>
+
+#include "paddle/fluid/operators/detail/rpc_server.h"
+
+namespace paddle {
+namespace operators {
+namespace detail {
+
+void RPCServer::ShutDown() {
+  LOG(INFO) << "RPCServer ShutDown ";
+  ShutDownImpl();
+
+  exit_flag_ = true;
+  barrier_cond_.notify_all();
+  rpc_cond_.notify_all();
+}
+
+void RPCServer::SavePort() const {
+  auto file_path = string::Sprintf("/tmp/paddle.%d.port", ::getpid());
+  std::ofstream port_file;
+  port_file.open(file_path);
+  port_file << selected_port_;
+  port_file.close();
+  VLOG(4) << "selected port written to " << file_path;
+}
+
+void RPCServer::WaitBarrier(const std::string& rpc_name) {
+  std::unique_lock<std::mutex> lock(this->mutex_);
+  barrier_cond_.wait(lock, [=] {
+    return (barrier_counter_[rpc_name] >= client_num_ || exit_flag_.load());
+  });
+
+  VLOG(3) << "batch_barrier_:" << barrier_counter_[rpc_name];
+}
+
+void RPCServer::IncreaseBatchBarrier(const std::string rpc_name) {
+  VLOG(3) << "RPCServer begin IncreaseBatchBarrier " << rpc_name;
+  int b = 0;
+  {
+    std::unique_lock<std::mutex> lock(mutex_);
+    b = ++barrier_counter_[rpc_name];
+  }
+
+  VLOG(3) << "RPCServer IncreaseBatchBarrier " << rpc_name
+          << ", barrier_count:" << b << ", fan_in" << client_num_;
+
+  if (b >= client_num_) {
+    barrier_cond_.notify_all();
+  }
+}
+
+void RPCServer::ResetBarrierCounter() {
+  VLOG(3) << "RPCServer ResetBarrierCounter ";
+  std::unique_lock<std::mutex> lock(mutex_);
+  for (auto& t : barrier_counter_) {
+    t.second = 0;
+  }
+}
+void RPCServer::RecordSparseVar(framework::Variable* sparse_var) {
+  std::unique_lock<std::mutex> lock(mutex_sparse_var_recorder_);
+  sparse_vars_.push_back(sparse_var);
+}
+
+void RPCServer::ResetSparseVarsRecorder() {
+  VLOG(3) << "RPCServer reset sparse vars recorder.";
+  std::unique_lock<std::mutex> lock(mutex_sparse_var_recorder_);
+  for (auto* var : sparse_vars_) {
+    var->GetMutable<framework::SelectedRows>()->mutable_rows()->clear();
+  }
+  sparse_vars_.clear();
+}
+
+void RPCServer::RegisterRPC(const std::string& rpc_name,
+                            RequestHandler* handler, int thread_num) {
+  rpc_call_map_[rpc_name] = handler;
+  rpc_thread_num_[rpc_name] = thread_num;
+
+  static int cond = -1;
+  rpc_cond_map_[rpc_name] = ++cond;
+  VLOG(4) << "RegisterRPC rpc_name:" << rpc_name << ", handler:" << handler
+          << ", cond:" << rpc_cond_map_[rpc_name];
+}
+
+void RPCServer::SetCond(const std::string& rpc_name) {
+  VLOG(3) << "RPCServer SetCond " << rpc_name;
+  {
+    std::unique_lock<std::mutex> lock(mutex_);
+    cur_cond_ = rpc_cond_map_[rpc_name];
+  }
+
+  rpc_cond_.notify_all();
+}
+
+void RPCServer::WaitCond(const std::string& rpc_name) {
+  VLOG(3) << "RPCServer WaitCond " << rpc_name;
+  int cond = 0;
+  {
+    std::unique_lock<std::mutex> lock(mutex_);
+    cond = rpc_cond_map_[rpc_name];
+  }
+
+  std::unique_lock<std::mutex> lock(mutex_);
+  rpc_cond_.wait(
+      lock, [=] { return (cur_cond_.load() == cond || exit_flag_.load()); });
+}
+
+}  // namespace detail
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/detail/rpc_server.h b/paddle/fluid/operators/detail/rpc_server.h
new file mode 100644
index 0000000000000000000000000000000000000000..94a21ef8d04e0fc8f4e22bf00cf89dc5f41f294b
--- /dev/null
+++ b/paddle/fluid/operators/detail/rpc_server.h
@@ -0,0 +1,97 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <set>
+#include <string>
+#include <thread>  // NOLINT
+#include <utility>
+#include <vector>
+#include "paddle/fluid/operators/detail/request_handler.h"
+
+namespace paddle {
+namespace operators {
+namespace detail {
+
+class RPCServer {
+ public:
+  explicit RPCServer(const std::string& address, int client_num)
+      : cur_cond_(0),
+        bind_address_(address),
+        exit_flag_(false),
+        selected_port_(0),
+        client_num_(client_num) {}
+
+  virtual ~RPCServer() {}
+  virtual void StartServer() = 0;
+  virtual void WaitServerReady() = 0;
+
+  void ShutDown();
+
+  bool IsExit() { return exit_flag_.load(); }
+
+  int GetSelectedPort() const { return selected_port_; }
+  void SavePort() const;
+
+  // RegisterRPC, register the rpc method name to a handler
+  // class, and auto generate a condition id for this call
+  // to be used for the barrier.
+  void RegisterRPC(const std::string& rpc_name, RequestHandler* handler,
+                   int thread_num = 5);
+
+  // Wait util all the clients have reached the barrier for one
+  // rpc method. This function should be called in the
+  // RequestHandler if you want to run the server/client in a
+  // synchronous mode.
+  void WaitBarrier(const std::string& rpc_name);
+
+  void SetCond(const std::string& rpc_name);
+  void WaitCond(const std::string& rpc_name);
+  void IncreaseBatchBarrier(const std::string rpc_name);
+
+  void ResetBarrierCounter();
+  void RecordSparseVar(framework::Variable* sparse_var);
+  void ResetSparseVarsRecorder();
+
+ protected:
+  virtual void ShutDownImpl() = 0;
+
+ private:
+  std::mutex mutex_;
+  std::unordered_map<std::string, int> barrier_counter_;
+  std::condition_variable barrier_cond_;
+
+  std::unordered_map<std::string, int> rpc_cond_map_;
+  std::atomic<int> cur_cond_;
+  std::condition_variable rpc_cond_;
+
+  std::vector<framework::Variable*> sparse_vars_;
+  std::mutex mutex_sparse_var_recorder_;
+
+ protected:
+  std::string bind_address_;
+  std::atomic<int> exit_flag_;
+  int selected_port_;
+
+  const int client_num_;
+
+  std::unordered_map<std::string, RequestHandler*> rpc_call_map_;
+  std::unordered_map<std::string, int> rpc_thread_num_;
+  friend class RequestHandler;
+};
+
+};  // namespace detail
+};  // namespace operators
+};  // namespace paddle
diff --git a/paddle/fluid/operators/detail/send_recv.proto b/paddle/fluid/operators/detail/send_recv.proto
index fffa9ae7a43ea5cd7b2bda6fbbf6ef9f7d23009d..a244afc46f3247c7e6e8481b09b5c729a2a569f7 100644
--- a/paddle/fluid/operators/detail/send_recv.proto
+++ b/paddle/fluid/operators/detail/send_recv.proto
@@ -32,6 +32,7 @@ service SendRecvService {
 enum VarType {
   LOD_TENSOR = 0;
   SELECTED_ROWS = 1;
+  NCCL_ID = 2;
 }
 
 // NOTICE(gongwb):don't modify this proto if you are not
@@ -69,10 +70,10 @@ message VariableMessage {
   bytes rows = 9;
   // Look up table block execution output variable name.
   string out_varname = 10;
-  // If true, the ps server will start profiling, the ps
+  // If 1, the ps server will start profiling, the ps
   // server stops profiling and generates a profile to /tmp/profile_ps_*
-  // when profile switches from true to false.
-  bool profile = 11;
+  // when profile switches from 1 to 2.
+  int64 profile = 11;
 }
 
 message VoidMessage {}
diff --git a/paddle/fluid/operators/detail/sendrecvop_utils.cc b/paddle/fluid/operators/detail/sendrecvop_utils.cc
index d68cf467f7b0c6157dc1f69571e5d0c0b3c70348..507b465435609a91ebca97dd70b176c3b79bee02 100644
--- a/paddle/fluid/operators/detail/sendrecvop_utils.cc
+++ b/paddle/fluid/operators/detail/sendrecvop_utils.cc
@@ -14,6 +14,9 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/detail/sendrecvop_utils.h"
 
+#ifdef PADDLE_WITH_CUDA
+#include <nccl.h>
+#endif
 #include <sys/time.h>
 #include <thread>  // NOLINT
 
@@ -29,129 +32,158 @@ namespace paddle {
 namespace operators {
 namespace detail {
 
+using VarMsg = sendrecv::VariableMessage;
+
+void GetTensorPayload(framework::Variable* var,
+                      const platform::DeviceContext& ctx, VarMsg* request,
+                      void** payload, size_t* payload_size) {
+  auto tensor = var->Get<framework::LoDTensor>();
+  // FIXME(wuyi): data types in send_recv.proto is copied from
+  // framework.proto
+  request->set_data_type(
+      static_cast<VarMsg::Type>(framework::ToDataType(tensor.type())));
+  for (auto& dim : framework::vectorize(tensor.dims())) {
+    request->add_dims(dim);
+  }
+  const framework::LoD lod = tensor.lod();
+  if (lod.size() > 0) {
+    request->set_lod_level(lod.size());
+    for (auto& each : lod) {
+      VarMsg::LodData* lod_inner = request->add_lod();
+      for (auto& d : each) {
+        lod_inner->add_lod_data(d);
+      }
+    }
+  }
+  if (platform::is_gpu_place(ctx.GetPlace())) {
+#ifdef PADDLE_WITH_CUDA
+    PADDLE_ENFORCE(platform::is_gpu_place(tensor.place()));
+    platform::CUDAPinnedPlace cuda_pinned;
+    auto& gpu_dev_ctx = static_cast<const platform::CUDADeviceContext&>(ctx);
+    auto copy_size = tensor.numel() * framework::SizeOfType(tensor.type());
+    *payload = memory::Alloc(cuda_pinned, copy_size);
+
+    memory::Copy(cuda_pinned, *payload,
+                 boost::get<platform::CUDAPlace>(tensor.place()),
+                 reinterpret_cast<const void*>(tensor.data<void>()), copy_size,
+                 gpu_dev_ctx.stream());
+    ctx.Wait();
+#endif
+  } else {
+    *payload = tensor.data<void>();
+  }
+  *payload_size = tensor.numel() * framework::SizeOfType(tensor.type());
+}
+
+void GetSelectedRowsPayload(framework::Variable* var,
+                            const platform::DeviceContext& ctx, VarMsg* request,
+                            void** payload, size_t* payload_size) {
+  auto* slr = var->GetMutable<framework::SelectedRows>();
+  request->set_data_type(
+      static_cast<VarMsg::Type>(framework::ToDataType(slr->value().type())));
+  request->set_lod_level(0);
+  request->set_slr_height(slr->height());
+
+  for (auto& dim : framework::vectorize(slr->value().dims())) {
+    request->add_dims(dim);
+  }
+
+  auto* tensor = slr->mutable_value();
+  if (platform::is_gpu_place(ctx.GetPlace())) {
+#ifdef PADDLE_WITH_CUDA
+    platform::CUDAPinnedPlace cuda_pinned;
+    auto& gpu_dev_ctx = static_cast<const platform::CUDADeviceContext&>(ctx);
+    auto copy_size = tensor->numel() * framework::SizeOfType(tensor->type());
+    *payload = memory::Alloc(cuda_pinned, copy_size);
+    memory::Copy(cuda_pinned, *payload,
+                 boost::get<platform::CUDAPlace>(tensor->place()),
+                 reinterpret_cast<const void*>(tensor->data<void>()), copy_size,
+                 gpu_dev_ctx.stream());
+    ctx.Wait();
+#endif
+  } else {
+    *payload = slr->mutable_value()->data<void>();
+  }
+  *payload_size = tensor->numel() * framework::SizeOfType(tensor->type());
+}
+
 void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
                            const platform::DeviceContext& ctx,
                            ::grpc::ByteBuffer* msg,
                            const std::string& out_name) {
-  using VarMsg = sendrecv::VariableMessage;
-  // When using GPU, need to free the copied CPU buffer
-  // when the ByteBuffer destroies
-  // TODO(typhoonzero): add unref here, if we have dependent
-  // parallelism execution, need to know when to free the tensor.
+  // Default DestroyCallback does nothing, When using GPU
+  // the CPU buffer need to be freed.
   DestroyCallback destroy_callback = [](void* backing) {};
-
-  auto buffer = std::unique_ptr<char[]>(new char[1024]);
-  void* buf = buffer.get();
-
+  VarMsg request;
   void* payload = nullptr;
   size_t payload_size;
-  ProtoEncodeHelper e(static_cast<char*>(buf), 1024);
+
+  request.set_varname(name);
   // Note: normally the profiler is enabled in 1 trainer, hence only
   // 1 trainer returns true for ShouldSendProfileState(). It tells PS
   // servers the trainer's profiling state so that PS can follow the
   // trainer.
   if (platform::ShouldSendProfileState()) {
-    e.WriteBool(VarMsg::kProfileFieldNumber, platform::IsProfileEnabled());
+    if (platform::IsProfileEnabled()) {
+      request.set_profile(platform::kEnableProfiler);
+    } else {
+      request.set_profile(platform::kDisableProfiler);
+    }
+  }
+  if (!out_name.empty()) {
+    request.set_out_varname(out_name);
   }
-  e.WriteString(VarMsg::kVarnameFieldNumber, name);
   if (var->IsType<framework::LoDTensor>()) {
-    e.WriteUint64(VarMsg::kTypeFieldNumber, 0);
+    request.set_type(::sendrecv::LOD_TENSOR);
+    GetTensorPayload(var, ctx, &request, &payload, &payload_size);
   } else if (var->IsType<framework::SelectedRows>()) {
-    e.WriteUint64(VarMsg::kTypeFieldNumber, 1);
+    request.set_type(::sendrecv::SELECTED_ROWS);
+    GetSelectedRowsPayload(var, ctx, &request, &payload, &payload_size);
+#ifdef PADDLE_WITH_CUDA
+  } else if (var->IsType<ncclUniqueId>()) {
+    request.set_type(::sendrecv::NCCL_ID);
+#endif
+  } else {
+    PADDLE_THROW("Serialize does not support type: %s",
+                 typeid(var->Type()).name());
   }
 
-  if (!out_name.empty()) {
-    e.WriteString(VarMsg::kOutVarnameFieldNumber, out_name);
-  }
-  switch (framework::ToVarType(var->Type())) {
-    case framework::proto::VarType_Type_LOD_TENSOR: {
-      auto tensor = var->Get<framework::LoDTensor>();
-      e.WriteUint64(VarMsg::kDataTypeFieldNumber,
-                    framework::ToDataType(tensor.type()));
-      for (auto& dim : framework::vectorize(tensor.dims())) {
-        e.WriteUint64(VarMsg::kDimsFieldNumber, dim);
-      }
-      auto lod = tensor.lod();  // std::vector<Vector<size_t>>
-      if (lod.size() > 0) {
-        e.WriteUint64(VarMsg::kLodLevelFieldNumber, lod.size());
-
-        for (auto& each : lod) {
-          e.WriteVarlengthBeginning(VarMsg::kLodFieldNumber,
-                                    2 +      // tag + varintlength of submessage
-                                        1 +  // kLodDataFieldNumber
-                                        each.size());
-          // auto copied from GPU
-          for (auto& d : each) {
-            e.WriteUint64(VarMsg::LodData::kLodDataFieldNumber, d);
-          }
-        }
-      }
-      if (platform::is_gpu_place(ctx.GetPlace())) {
+  if (platform::is_gpu_place(ctx.GetPlace())) {
 #ifdef PADDLE_WITH_CUDA
-        PADDLE_ENFORCE(platform::is_gpu_place(tensor.place()));
-        platform::CPUPlace cpu;
-        auto& gpu_dev_ctx =
-            static_cast<const platform::CUDADeviceContext&>(ctx);
-        auto copy_size = tensor.numel() * framework::SizeOfType(tensor.type());
-        payload = memory::Alloc(cpu, copy_size);
-
-        memory::Copy(cpu, payload,
-                     boost::get<platform::CUDAPlace>(tensor.place()),
-                     reinterpret_cast<const void*>(tensor.data<void>()),
-                     copy_size, gpu_dev_ctx.stream());
-        ctx.Wait();
-        destroy_callback = [](void* backing) {
-          platform::CPUPlace cpu;
-          memory::Free(cpu, backing);
-        };
-
+    // GPU data is copied to CPU buffer when sending,
+    // free the buffer when possible.
+    destroy_callback = [](void* backing) {
+      platform::CUDAPinnedPlace cuda_pinned;
+      memory::Free(cuda_pinned, backing);
+    };
 #endif
-      } else {
-        payload = tensor.data<void>();
-      }
-      payload_size = tensor.numel() * framework::SizeOfType(tensor.type());
-      e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber, payload_size);
-    } break;
-    case framework::proto::VarType_Type_SELECTED_ROWS: {
-      // TODO(typhoonzero): selectedrows implement should not use unique_ptr
-      auto* slr = var->GetMutable<framework::SelectedRows>();
-      e.WriteUint64(VarMsg::kDataTypeFieldNumber,
-                    framework::ToDataType(slr->value().type()));
-      for (auto& dim : framework::vectorize(slr->value().dims())) {
-        e.WriteUint64(VarMsg::kDimsFieldNumber, dim);
-      }
-      e.WriteUint64(VarMsg::kLodLevelFieldNumber, 0);
-      e.WriteUint64(VarMsg::kSlrHeightFieldNumber, slr->height());
-      auto* tensor = slr->mutable_value();
-      if (platform::is_gpu_place(ctx.GetPlace())) {
+  }
+
+  std::string header;
+  request.AppendToString(&header);
+  auto buffer = std::unique_ptr<char[]>(new char[1024]);
+  void* buf = buffer.get();
+  ProtoEncodeHelper e(static_cast<char*>(buf), 1024);
+  e.WriteRawBytes(std::string(header.data(), header.size()));
+// NCCLID is copied directly to the message, return bytebuffer
+// with only one slice if serializing NCCLID.
 #ifdef PADDLE_WITH_CUDA
-        platform::CPUPlace cpu;
-        auto& gpu_dev_ctx =
-            static_cast<const platform::CUDADeviceContext&>(ctx);
-        auto copy_size =
-            tensor->numel() * framework::SizeOfType(tensor->type());
-        payload = memory::Alloc(cpu, copy_size);
-        memory::Copy(cpu, payload,
-                     boost::get<platform::CUDAPlace>(tensor->place()),
-                     reinterpret_cast<const void*>(tensor->data<void>()),
-                     copy_size, gpu_dev_ctx.stream());
-        ctx.Wait();
-        destroy_callback = [](void* backing) {
-          platform::CPUPlace cpu;
-          memory::Free(cpu, backing);
-        };
-#endif
-      } else {
-        payload = slr->mutable_value()->data<void>();
-      }
-      payload_size = tensor->numel() * framework::SizeOfType(tensor->type());
-      e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber, payload_size);
-    } break;
-    default:
-      PADDLE_THROW("Serialize does not support type: %s",
-                   typeid(var->Type()).name());
-      break;
+  if (var->IsType<ncclUniqueId>()) {
+    e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber,
+                              NCCL_UNIQUE_ID_BYTES);
+    const ncclUniqueId& uid = var->Get<ncclUniqueId>();
+    e.WriteRawBytes(std::string(uid.internal, NCCL_UNIQUE_ID_BYTES));
+
+    // for serialize NCCL_ID
+    ::grpc::Slice slices(e.size());
+    memcpy(const_cast<uint8_t*>(slices.begin()), e.data(), e.size());
+    ::grpc::ByteBuffer tmp(&slices, 1);
+    msg->Swap(&tmp);
+    return;
   }
+#endif
+
+  e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber, payload_size);
   // steal reference of tensor data
   ::grpc::Slice slices[4];  // metadata, tensor, rows meta, rows
   int num_slices = 2;       // only SelectedRows have rows buffer
@@ -162,12 +194,9 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
                                     static_cast<char*>(payload)),
       ::grpc::Slice::STEAL_REF);
 
-  if (framework::ToVarType(var->Type()) ==
-      framework::proto::VarType_Type_SELECTED_ROWS) {
+  if (var->IsType<framework::SelectedRows>()) {
     auto* slr = var->GetMutable<framework::SelectedRows>();
-
     ProtoEncodeHelper e2(static_cast<char*>(buf), 128);
-    // NOTE: rows is of type int64_t
     size_t rows_memory_size =
         slr->rows().size() * framework::SizeOfType(typeid(int64_t));
     e2.WriteVarlengthBeginning(VarMsg::kRowsFieldNumber, rows_memory_size);
@@ -178,10 +207,7 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
         grpc_slice_new_with_user_data(
             const_cast<void*>(
                 reinterpret_cast<const void*>(slr->rows().data())),
-            rows_memory_size,
-            [](void* backing) {
-              // TODO(typhoonzero): add unref here, same as above.
-            },
+            rows_memory_size, [](void* backing) {},
             const_cast<char*>(
                 reinterpret_cast<const char*>(slr->rows().data()))),
         ::grpc::Slice::STEAL_REF);
diff --git a/paddle/fluid/operators/detail/serde_test.cc b/paddle/fluid/operators/detail/serde_test.cc
index e9eaaf1cbcd07ed1c8d6fb0b025bc1f1500718fd..15892295e6901fe649788c9e34604008fc8cbdfa 100644
--- a/paddle/fluid/operators/detail/serde_test.cc
+++ b/paddle/fluid/operators/detail/serde_test.cc
@@ -117,11 +117,11 @@ void RunTestLodTensor(platform::Place place, int from_type = 0) {
   // serialize var to ByteBuffer
   framework::Variable var;
   auto* tensor = var.GetMutable<framework::LoDTensor>();
-  tensor->Resize(framework::make_ddim({4, 8, 4, 2}));
+  tensor->Resize(framework::make_ddim({512, 8, 4, 2}));
   framework::LoD lod;
   lod.push_back(framework::Vector<size_t>({1, 3, 8}));
   tensor->set_lod(lod);
-  int tensor_numel = 4 * 8 * 4 * 2;
+  int tensor_numel = 512 * 8 * 4 * 2;
   platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
   auto& ctx = *pool.Get(place);
   tensor->mutable_data<float>(place);
@@ -142,7 +142,7 @@ void RunTestLodTensor(platform::Place place, int from_type = 0) {
   EXPECT_TRUE(varmsg.ParseFromString(tmp));
   EXPECT_EQ(varmsg.varname(), "myvar");
   EXPECT_EQ(varmsg.type(), 0);
-  EXPECT_EQ(varmsg.dims()[0], 4);
+  EXPECT_EQ(varmsg.dims()[0], 512);
   EXPECT_EQ(varmsg.dims()[1], 8);
   EXPECT_EQ(varmsg.dims()[2], 4);
   EXPECT_EQ(varmsg.dims()[3], 2);
diff --git a/paddle/fluid/operators/detail/variable_response.cc b/paddle/fluid/operators/detail/variable_response.cc
index f4a374d56d28a30201f0d482e97e1a40e7a8bf41..24cb91a3bb820a0e5d51aaa49154434919080f69 100644
--- a/paddle/fluid/operators/detail/variable_response.cc
+++ b/paddle/fluid/operators/detail/variable_response.cc
@@ -17,6 +17,9 @@
 #include <string>
 #include <utility>
 #include <vector>
+#ifdef PADDLE_WITH_CUDA
+#include <nccl.h>
+#endif
 #include "paddle/fluid/platform/profiler.h"
 
 #include "paddle/fluid/operators/detail/send_recv.pb.h"
@@ -210,15 +213,15 @@ bool ParseLodData(::google::protobuf::io::CodedInputStream* input,
         }
 
         if (wt == WIRETYPE_LENGTH_DELIMITED) {
-          int length = 0;
-          if (!input->ReadVarintSizeAsInt(&length)) {
+          int num_bytes = 0;
+          if (!input->ReadVarintSizeAsInt(&num_bytes)) {
             return tag;
           }
-
-          for (int i = 0; i < length; i++) {
+          int start_pos = input->CurrentPosition();
+          while (input->CurrentPosition() - start_pos < num_bytes) {
             uint64_t v;
             if (!input->ReadVarint64(&v)) {
-              return false;
+              return tag;
             }
             lod->push_back(v);
           }
@@ -275,8 +278,8 @@ int VariableResponse::Parse(Source* source) {
         break;
       }
       case sendrecv::VariableMessage::kTypeFieldNumber: {
-        uint64_t v;
-        if ((wt != WIRETYPE_VARINT) || !input.ReadVarint64(&v)) {
+        uint32_t v;
+        if ((wt != WIRETYPE_VARINT) || !input.ReadVarint32(&v)) {
           return tag;
         }
 
@@ -284,8 +287,8 @@ int VariableResponse::Parse(Source* source) {
         break;
       }
       case sendrecv::VariableMessage::kDataTypeFieldNumber: {
-        uint64_t v = 0;
-        if ((wt != WIRETYPE_VARINT) || !input.ReadVarint64(&v)) {
+        uint32_t v = 0;
+        if ((wt != WIRETYPE_VARINT) || !input.ReadVarint32(&v)) {
           return tag;
         }
 
@@ -305,11 +308,12 @@ int VariableResponse::Parse(Source* source) {
 
         // packed
         if (wt == WIRETYPE_LENGTH_DELIMITED) {
-          int length = 0;
-          if (!input.ReadVarintSizeAsInt(&length)) {
+          int num_bytes = 0;
+          if (!input.ReadVarintSizeAsInt(&num_bytes)) {
             return tag;
           }
-          for (int i = 0; i < length; i++) {
+          int start_pos = input.CurrentPosition();
+          while (input.CurrentPosition() - start_pos < num_bytes) {
             uint64_t v;
             if (!input.ReadVarint64(&v)) {
               return tag;
@@ -318,7 +322,6 @@ int VariableResponse::Parse(Source* source) {
           }
           break;
         }
-
         return tag;
       }
       case sendrecv::VariableMessage::kLodLevelFieldNumber: {
@@ -368,28 +371,45 @@ int VariableResponse::Parse(Source* source) {
       }
       case sendrecv::VariableMessage::kSerializedFieldNumber: {
         PADDLE_ENFORCE((meta_.type() == sendrecv::SELECTED_ROWS ||
-                        meta_.type() == sendrecv::LOD_TENSOR) &&
+                        meta_.type() == sendrecv::LOD_TENSOR ||
+                        meta_.type() == sendrecv::NCCL_ID) &&
                            meta_.varname() != "",
                        "meta info should be got first!");
 
-        int length = 0;
+        int num_bytes = 0;
         if (wt != WIRETYPE_LENGTH_DELIMITED ||
-            !ReadVarintSizeAsInt(&input, &length)) {
+            !ReadVarintSizeAsInt(&input, &num_bytes)) {
           return tag;
         }
 
+        if (meta_.type() == sendrecv::NCCL_ID) {
+#ifdef PADDLE_WITH_CUDA
+          auto* var = scope_->FindVar(meta_.varname());
+          if (var != nullptr) {
+            ncclUniqueId* id = var->GetMutable<ncclUniqueId>();
+            if (!ReadRaw(&input, *dev_ctx_, platform::CPUPlace(), id->internal,
+                         num_bytes)) {
+              return tag;
+            }
+          }
+          break;
+#else
+          PADDLE_THROW("Not compiled with CUDA!");
+#endif
+        }
+
         framework::DDim dims = GetDims(meta_.dims());
         if (meta_.type() == sendrecv::LOD_TENSOR) {
           PADDLE_ENFORCE(meta_.lod_size() >= 0,
                          "lod info should be got first!");
-          if (!CopyLodTensorData(&input, *dev_ctx_, dims, length)) {
+          if (!CopyLodTensorData(&input, *dev_ctx_, dims, num_bytes)) {
             return tag;
           }
           break;
         }
 
         if (meta_.type() == sendrecv::SELECTED_ROWS) {
-          if (!CopySelectRowsTensorData(&input, *dev_ctx_, dims, length)) {
+          if (!CopySelectRowsTensorData(&input, *dev_ctx_, dims, num_bytes)) {
             return tag;
           }
           break;
@@ -403,13 +423,13 @@ int VariableResponse::Parse(Source* source) {
                            meta_.varname() != "",
                        "meta info should be got first!");
 
-        int length = 0;
+        int num_bytes = 0;
         if (wt != WIRETYPE_LENGTH_DELIMITED ||
-            !ReadVarintSizeAsInt(&input, &length)) {
+            !ReadVarintSizeAsInt(&input, &num_bytes)) {
           return tag;
         }
 
-        if (!CopySelectRowsData(&input, *dev_ctx_, length)) {
+        if (!CopySelectRowsData(&input, *dev_ctx_, num_bytes)) {
           return tag;
         }
         break;
@@ -429,8 +449,8 @@ int VariableResponse::Parse(Source* source) {
         break;
       }
       case sendrecv::VariableMessage::kProfileFieldNumber: {
-        bool profiling;
-        if (!input.ReadRaw(reinterpret_cast<void*>(&profiling), 1)) {
+        uint64_t profiling = 0;
+        if (!input.ReadVarint64(&profiling)) {
           return tag;
         }
         meta_.set_profile(profiling);
@@ -438,9 +458,11 @@ int VariableResponse::Parse(Source* source) {
         if (listener_id <= 0) {
           break;
         }
-        if (profiling && !platform::IsProfileEnabled()) {
+        if (profiling == platform::kEnableProfiler &&
+            !platform::IsProfileEnabled()) {
           platform::EnableProfiler(platform::ProfilerState::kCPU);
-        } else if (!profiling && platform::IsProfileEnabled()) {
+        } else if (profiling == platform::kDisableProfiler &&
+                   platform::IsProfileEnabled()) {
           // TODO(panyx0718): Should we allow to customize file dir.
           platform::DisableProfiler(
               platform::EventSortingKey::kDefault,
diff --git a/paddle/fluid/operators/detail/variable_response.h b/paddle/fluid/operators/detail/variable_response.h
index bf624da2a6c26472e47711b3c6409f78afba0a64..69cfd784f8dd4f129f50c6882061e53e8535b949 100644
--- a/paddle/fluid/operators/detail/variable_response.h
+++ b/paddle/fluid/operators/detail/variable_response.h
@@ -67,8 +67,8 @@ class VariableResponse {
 
   framework::Scope* GetMutableLocalScope() const { return local_scope_; }
 
-  inline std::string Varname() { return meta_.varname(); }
-  inline std::string OutVarname() { return meta_.out_varname(); }
+  inline std::string Varname() const { return meta_.varname(); }
+  inline std::string OutVarname() const { return meta_.out_varname(); }
 
   // should call parse first.
   framework::Variable* GetVar() {
diff --git a/paddle/fluid/operators/detection/CMakeLists.txt b/paddle/fluid/operators/detection/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..20d960f9fee1eae42b2241fb96c163e15db5e24d
--- /dev/null
+++ b/paddle/fluid/operators/detection/CMakeLists.txt
@@ -0,0 +1,31 @@
+set(LOCAL_DETECTION_LIBS)
+
+function(detection_library TARGET_NAME)
+    set(oneValueArgs "")
+    set(multiValueArgs SRCS DEPS)
+    set(options "")
+    set(common_deps op_registry)
+    set(pybind_flag 0)
+    cmake_parse_arguments(detection_library "${options}" "${oneValueArgs}"
+            "${multiValueArgs}" ${ARGN})
+    op_library(${TARGET_NAME} SRCS ${detection_library_SRCS} DEPS ${common_deps} ${detection_library_DEPS})
+    set(LOCAL_DETECTION_LIBS
+            ${TARGET_NAME}
+            ${LOCAL_DETECTION_LIBS}
+        PARENT_SCOPE)
+endfunction()
+
+detection_library(bipartite_match_op SRCS bipartite_match_op.cc)
+detection_library(box_coder_op SRCS box_coder_op.cc box_coder_op.cu)
+detection_library(iou_similarity_op SRCS iou_similarity_op.cc
+iou_similarity_op.cu)
+detection_library(mine_hard_examples_op SRCS mine_hard_examples_op.cc)
+detection_library(multiclass_nms_op SRCS multiclass_nms_op.cc)
+detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op.cu)
+detection_library(target_assign_op SRCS target_assign_op.cc
+target_assign_op.cu)
+detection_library(polygon_box_transform_op SRCS polygon_box_transform_op.cc
+    polygon_box_transform_op.cu)
+
+# Export local libraries to parent
+set(DETECTION_LIBRARY ${LOCAL_DETECTION_LIBS} PARENT_SCOPE)
diff --git a/paddle/fluid/operators/bipartite_match_op.cc b/paddle/fluid/operators/detection/bipartite_match_op.cc
similarity index 98%
rename from paddle/fluid/operators/bipartite_match_op.cc
rename to paddle/fluid/operators/detection/bipartite_match_op.cc
index 1218d9fdc1e6101d17bc09a4ae769f5fbf8e7b15..d437ad5c19828331c749244404ba80d0f3acda2a 100644
--- a/paddle/fluid/operators/bipartite_match_op.cc
+++ b/paddle/fluid/operators/detection/bipartite_match_op.cc
@@ -182,8 +182,7 @@ class BipartiteMatchKernel : public framework::OpKernel<T> {
 
 class BipartiteMatchOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  BipartiteMatchOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput(
         "DistMat",
         "(LoDTensor or Tensor) this input is a 2-D LoDTensor with shape "
diff --git a/paddle/fluid/operators/box_coder_op.cc b/paddle/fluid/operators/detection/box_coder_op.cc
similarity index 87%
rename from paddle/fluid/operators/box_coder_op.cc
rename to paddle/fluid/operators/detection/box_coder_op.cc
index ec416f725e75fae57484751ee8a066c0b9da8a70..8c4b4321b7582a5cfad89f23e3d298ed16162d99 100644
--- a/paddle/fluid/operators/box_coder_op.cc
+++ b/paddle/fluid/operators/detection/box_coder_op.cc
@@ -9,7 +9,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/box_coder_op.h"
+#include "paddle/fluid/operators/detection/box_coder_op.h"
 
 namespace paddle {
 namespace operators {
@@ -22,21 +22,21 @@ class BoxCoderOp : public framework::OperatorWithKernel {
   void InferShape(framework::InferShapeContext *ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("PriorBox"),
                    "Input(PriorBox) of BoxCoderOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("PriorBoxVar"),
-                   "Input(PriorBoxVar) of BoxCoderOp should not be null.");
     PADDLE_ENFORCE(ctx->HasInput("TargetBox"),
                    "Input(TargetBox) of BoxCoderOp should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput("OutputBox"),
                    "Output(OutputBox) of BoxCoderOp should not be null.");
 
     auto prior_box_dims = ctx->GetInputDim("PriorBox");
-    auto prior_box_var_dims = ctx->GetInputDim("PriorBoxVar");
     auto target_box_dims = ctx->GetInputDim("TargetBox");
 
     PADDLE_ENFORCE_EQ(prior_box_dims.size(), 2,
                       "The rank of Input of PriorBoxVar must be 2");
     PADDLE_ENFORCE_EQ(prior_box_dims[1], 4, "The shape of PriorBox is [N, 4]");
-    PADDLE_ENFORCE_EQ(prior_box_dims, prior_box_var_dims);
+    if (ctx->HasInput("PriorBoxVar")) {
+      auto prior_box_var_dims = ctx->GetInputDim("PriorBoxVar");
+      PADDLE_ENFORCE_EQ(prior_box_dims, prior_box_var_dims);
+    }
 
     auto code_type = GetBoxCodeType(ctx->Attrs().Get<std::string>("code_type"));
     if (code_type == BoxCodeType::kEncodeCenterSize) {
@@ -60,8 +60,7 @@ class BoxCoderOp : public framework::OperatorWithKernel {
 
 class BoxCoderOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  BoxCoderOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput(
         "PriorBox",
         "(Tensor, default Tensor<float>) "
@@ -72,9 +71,11 @@ class BoxCoderOpMaker : public framework::OpProtoAndCheckerMaker {
         "of the coordinate system. [xmax, ymax] is the right bottom "
         "coordinate of the anchor box.");
     AddInput("PriorBoxVar",
-             "(Tensor, default Tensor<float>) "
+             "(Tensor, default Tensor<float>, optional) "
              "PriorBoxVar is a 2-D Tensor with shape [M, 4] holds M group "
-             "of variance.");
+             "of variance. PriorBoxVar will set all elements to 1 by "
+             "default.")
+        .AsDispensable();
     AddInput(
         "TargetBox",
         "(LoDTensor or Tensor) This input can be a 2-D LoDTensor with shape "
@@ -92,6 +93,10 @@ class BoxCoderOpMaker : public framework::OpProtoAndCheckerMaker {
                          "the code type used with the target box")
         .SetDefault("encode_center_size")
         .InEnum({"encode_center_size", "decode_center_size"});
+    AddAttr<bool>("box_normalized",
+                  "(bool, default true) "
+                  "whether treat the priorbox as a noramlized box")
+        .SetDefault(true);
     AddOutput("OutputBox",
               "(LoDTensor or Tensor) "
               "When code_type is 'encode_center_size', the output tensor of "
@@ -128,5 +133,6 @@ width and height.
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(box_coder, ops::BoxCoderOp, ops::BoxCoderOpMaker,
                   paddle::framework::EmptyGradOpMaker);
-REGISTER_OP_CPU_KERNEL(box_coder, ops::BoxCoderKernel<float>,
-                       ops::BoxCoderKernel<double>);
+REGISTER_OP_CPU_KERNEL(
+    box_coder, ops::BoxCoderKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::BoxCoderKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/box_coder_op.cu b/paddle/fluid/operators/detection/box_coder_op.cu
similarity index 61%
rename from paddle/fluid/operators/box_coder_op.cu
rename to paddle/fluid/operators/detection/box_coder_op.cu
index 708c7a5fa96c2f9ce6a2d913ca5f30126bb6192f..a7af111f63d654319dd1d90d2032956951dfe49e 100644
--- a/paddle/fluid/operators/box_coder_op.cu
+++ b/paddle/fluid/operators/detection/box_coder_op.cu
@@ -9,7 +9,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/box_coder_op.h"
+#include "paddle/fluid/operators/detection/box_coder_op.h"
 #include "paddle/fluid/platform/cuda_primitives.h"
 
 namespace paddle {
@@ -20,15 +20,16 @@ __global__ void EncodeCenterSizeKernel(const T* prior_box_data,
                                        const T* prior_box_var_data,
                                        const T* target_box_data, const int row,
                                        const int col, const int len,
-                                       T* output) {
+                                       const bool normalized, T* output) {
   const int idx = threadIdx.x + blockIdx.x * blockDim.x;
   if (idx < row * col) {
     const int row_idx = idx / col;
     const int col_idx = idx % col;
-    T prior_box_width =
-        prior_box_data[col_idx * len + 2] - prior_box_data[col_idx * len];
-    T prior_box_height =
-        prior_box_data[col_idx * len + 3] - prior_box_data[col_idx * len + 1];
+    T prior_box_width = prior_box_data[col_idx * len + 2] -
+                        prior_box_data[col_idx * len] + (normalized == false);
+    T prior_box_height = prior_box_data[col_idx * len + 3] -
+                         prior_box_data[col_idx * len + 1] +
+                         (normalized == false);
     T prior_box_center_x =
         (prior_box_data[col_idx * len + 2] + prior_box_data[col_idx * len]) / 2;
     T prior_box_center_y = (prior_box_data[col_idx * len + 3] +
@@ -41,20 +42,24 @@ __global__ void EncodeCenterSizeKernel(const T* prior_box_data,
     T target_box_center_y = (target_box_data[row_idx * len + 3] +
                              target_box_data[row_idx * len + 1]) /
                             2;
-    T target_box_width =
-        target_box_data[row_idx * len + 2] - target_box_data[row_idx * len];
-    T target_box_height =
-        target_box_data[row_idx * len + 3] - target_box_data[row_idx * len + 1];
+    T target_box_width = target_box_data[row_idx * len + 2] -
+                         target_box_data[row_idx * len] + (normalized == false);
+    T target_box_height = target_box_data[row_idx * len + 3] -
+                          target_box_data[row_idx * len + 1] +
+                          (normalized == false);
 
-    output[idx * len] = (target_box_center_x - prior_box_center_x) /
-                        prior_box_width / prior_box_var_data[col_idx * len];
-    output[idx * len + 1] = (target_box_center_y - prior_box_center_y) /
-                            prior_box_height /
-                            prior_box_var_data[col_idx * len + 1];
-    output[idx * len + 2] = log(fabs(target_box_width / prior_box_width)) /
-                            prior_box_var_data[col_idx * len + 2];
-    output[idx * len + 3] = log(fabs(target_box_height / prior_box_height)) /
-                            prior_box_var_data[col_idx * len + 3];
+    output[idx * len] =
+        (target_box_center_x - prior_box_center_x) / prior_box_width;
+    output[idx * len + 1] =
+        (target_box_center_y - prior_box_center_y) / prior_box_height;
+    output[idx * len + 2] = log(fabs(target_box_width / prior_box_width));
+    output[idx * len + 3] = log(fabs(target_box_height / prior_box_height));
+    if (prior_box_var_data) {
+      output[idx * len] /= prior_box_var_data[col_idx * len];
+      output[idx * len + 1] /= prior_box_var_data[col_idx * len + 1];
+      output[idx * len + 2] /= prior_box_var_data[col_idx * len + 2];
+      output[idx * len + 3] /= prior_box_var_data[col_idx * len + 3];
+    }
   }
 }
 
@@ -63,42 +68,56 @@ __global__ void DecodeCenterSizeKernel(const T* prior_box_data,
                                        const T* prior_box_var_data,
                                        const T* target_box_data, const int row,
                                        const int col, const int len,
-                                       T* output) {
+                                       const bool normalized, T* output) {
   const int idx = threadIdx.x + blockIdx.x * blockDim.x;
   if (idx < row * col) {
     const int col_idx = idx % col;
-    T prior_box_width =
-        prior_box_data[col_idx * len + 2] - prior_box_data[col_idx * len];
-    T prior_box_height =
-        prior_box_data[col_idx * len + 3] - prior_box_data[col_idx * len + 1];
+    T prior_box_width = prior_box_data[col_idx * len + 2] -
+                        prior_box_data[col_idx * len] + (normalized == false);
+    T prior_box_height = prior_box_data[col_idx * len + 3] -
+                         prior_box_data[col_idx * len + 1] +
+                         (normalized == false);
     T prior_box_center_x =
         (prior_box_data[col_idx * len + 2] + prior_box_data[col_idx * len]) / 2;
     T prior_box_center_y = (prior_box_data[col_idx * len + 3] +
                             prior_box_data[col_idx * len + 1]) /
                            2;
-
-    T target_box_width = exp(prior_box_var_data[col_idx * len + 2] *
+    T target_box_width, target_box_height;
+    T target_box_center_x, target_box_center_y;
+    if (prior_box_var_data) {
+      target_box_width = exp(prior_box_var_data[col_idx * len + 2] *
                              target_box_data[idx * len + 2]) *
                          prior_box_width;
-    T target_box_height = exp(prior_box_var_data[col_idx * len + 3] *
+      target_box_height = exp(prior_box_var_data[col_idx * len + 3] *
                               target_box_data[idx * len + 3]) *
                           prior_box_height;
-    T target_box_center_x = prior_box_var_data[col_idx * len] *
+      target_box_center_x = prior_box_var_data[col_idx * len] *
                                 target_box_data[idx * len] * prior_box_width +
                             prior_box_center_x;
-    T target_box_center_y = prior_box_var_data[col_idx * len + 1] *
+      target_box_center_y = prior_box_var_data[col_idx * len + 1] *
                                 target_box_data[idx * len + 1] *
                                 prior_box_height +
                             prior_box_center_y;
+    } else {
+      target_box_width = exp(target_box_data[idx * len + 2]) * prior_box_width;
+      target_box_height =
+          exp(target_box_data[idx * len + 3]) * prior_box_height;
+      target_box_center_x =
+          target_box_data[idx * len] * prior_box_width + prior_box_center_x;
+      target_box_center_y = target_box_data[idx * len + 1] * prior_box_height +
+                            prior_box_center_y;
+    }
 
     output[idx * len] = target_box_center_x - target_box_width / 2;
     output[idx * len + 1] = target_box_center_y - target_box_height / 2;
-    output[idx * len + 2] = target_box_center_x + target_box_width / 2;
-    output[idx * len + 3] = target_box_center_y + target_box_height / 2;
+    output[idx * len + 2] =
+        target_box_center_x + target_box_width / 2 - (normalized == false);
+    output[idx * len + 3] =
+        target_box_center_y + target_box_height / 2 - (normalized == false);
   }
 }
 
-template <typename T>
+template <typename DeviceContext, typename T>
 class BoxCoderCUDAKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
@@ -109,6 +128,11 @@ class BoxCoderCUDAKernel : public framework::OpKernel<T> {
     auto* target_box = context.Input<framework::LoDTensor>("TargetBox");
     auto* output_box = context.Output<framework::Tensor>("OutputBox");
 
+    const T* prior_box_data = prior_box->data<T>();
+    const T* target_box_data = target_box->data<T>();
+    const T* prior_box_var_data = nullptr;
+    if (prior_box_var) prior_box_var_data = prior_box_var->data<T>();
+
     if (target_box->lod().size()) {
       PADDLE_ENFORCE_EQ(target_box->lod().size(), 1,
                         "Only support 1 level of LoD.");
@@ -120,22 +144,19 @@ class BoxCoderCUDAKernel : public framework::OpKernel<T> {
     int grid = (row * col + block - 1) / block;
     auto& device_ctx = context.cuda_device_context();
 
-    const T* prior_box_data = prior_box->data<T>();
-    const T* prior_box_var_data = prior_box_var->data<T>();
-    const T* target_box_data = target_box->data<T>();
-
     output_box->mutable_data<T>({row, col, len}, context.GetPlace());
     T* output = output_box->data<T>();
 
     auto code_type = GetBoxCodeType(context.Attr<std::string>("code_type"));
+    bool normalized = context.Attr<bool>("box_normalized");
     if (code_type == BoxCodeType::kEncodeCenterSize) {
       EncodeCenterSizeKernel<T><<<grid, block, 0, device_ctx.stream()>>>(
           prior_box_data, prior_box_var_data, target_box_data, row, col, len,
-          output);
+          normalized, output);
     } else if (code_type == BoxCodeType::kDecodeCenterSize) {
       DecodeCenterSizeKernel<T><<<grid, block, 0, device_ctx.stream()>>>(
           prior_box_data, prior_box_var_data, target_box_data, row, col, len,
-          output);
+          normalized, output);
     }
   }
 };
@@ -144,5 +165,7 @@ class BoxCoderCUDAKernel : public framework::OpKernel<T> {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(box_coder, ops::BoxCoderCUDAKernel<float>,
-                        ops::BoxCoderCUDAKernel<double>);
+REGISTER_OP_CUDA_KERNEL(
+    box_coder,
+    ops::BoxCoderCUDAKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::BoxCoderCUDAKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/box_coder_op.h b/paddle/fluid/operators/detection/box_coder_op.h
similarity index 50%
rename from paddle/fluid/operators/box_coder_op.h
rename to paddle/fluid/operators/detection/box_coder_op.h
index 77fc6c2b62af42e6526b889aeef2d9bab795baec..5ed8520acddfa8fe2105a7c1615bcb3243cb130f 100644
--- a/paddle/fluid/operators/box_coder_op.h
+++ b/paddle/fluid/operators/detection/box_coder_op.h
@@ -28,26 +28,28 @@ inline BoxCodeType GetBoxCodeType(const std::string& type) {
   PADDLE_THROW("Not support type %s.", type);
 }
 
-template <typename T>
+template <typename DeviceContext, typename T>
 class BoxCoderKernel : public framework::OpKernel<T> {
  public:
-  void EncodeCenterSize(const framework::Tensor& target_box,
-                        const framework::Tensor& prior_box,
-                        const framework::Tensor& prior_box_var,
-                        T* output) const {
-    int64_t row = target_box.dims()[0];
-    int64_t col = prior_box.dims()[0];
-    int64_t len = prior_box.dims()[1];
-    auto* target_box_data = target_box.data<T>();
-    auto* prior_box_data = prior_box.data<T>();
-    auto* prior_box_var_data = prior_box_var.data<T>();
+  void EncodeCenterSize(const framework::Tensor* target_box,
+                        const framework::Tensor* prior_box,
+                        const framework::Tensor* prior_box_var,
+                        const bool normalized, T* output) const {
+    int64_t row = target_box->dims()[0];
+    int64_t col = prior_box->dims()[0];
+    int64_t len = prior_box->dims()[1];
+    auto* target_box_data = target_box->data<T>();
+    auto* prior_box_data = prior_box->data<T>();
+    const T* prior_box_var_data = nullptr;
+    if (prior_box_var) prior_box_var_data = prior_box_var->data<T>();
 
     for (int64_t i = 0; i < row; ++i) {
       for (int64_t j = 0; j < col; ++j) {
-        T prior_box_width =
-            prior_box_data[j * len + 2] - prior_box_data[j * len];
-        T prior_box_height =
-            prior_box_data[j * len + 3] - prior_box_data[j * len + 1];
+        T prior_box_width = prior_box_data[j * len + 2] -
+                            prior_box_data[j * len] + (normalized == false);
+        T prior_box_height = prior_box_data[j * len + 3] -
+                             prior_box_data[j * len + 1] +
+                             (normalized == false);
         T prior_box_center_x =
             (prior_box_data[j * len + 2] + prior_box_data[j * len]) / 2;
         T prior_box_center_y =
@@ -57,67 +59,89 @@ class BoxCoderKernel : public framework::OpKernel<T> {
             (target_box_data[i * len + 2] + target_box_data[i * len]) / 2;
         T target_box_center_y =
             (target_box_data[i * len + 3] + target_box_data[i * len + 1]) / 2;
-        T target_box_width =
-            target_box_data[i * len + 2] - target_box_data[i * len];
-        T target_box_height =
-            target_box_data[i * len + 3] - target_box_data[i * len + 1];
+        T target_box_width = target_box_data[i * len + 2] -
+                             target_box_data[i * len] + (normalized == false);
+        T target_box_height = target_box_data[i * len + 3] -
+                              target_box_data[i * len + 1] +
+                              (normalized == false);
 
         size_t offset = i * col * len + j * len;
-        output[offset] = (target_box_center_x - prior_box_center_x) /
-                         prior_box_width / prior_box_var_data[j * len];
-        output[offset + 1] = (target_box_center_y - prior_box_center_y) /
-                             prior_box_height / prior_box_var_data[j * len + 1];
+        output[offset] =
+            (target_box_center_x - prior_box_center_x) / prior_box_width;
+        output[offset + 1] =
+            (target_box_center_y - prior_box_center_y) / prior_box_height;
         output[offset + 2] =
-            std::log(std::fabs(target_box_width / prior_box_width)) /
-            prior_box_var_data[j * len + 2];
+            std::log(std::fabs(target_box_width / prior_box_width));
         output[offset + 3] =
-            std::log(std::fabs(target_box_height / prior_box_height)) /
-            prior_box_var_data[j * len + 3];
+            std::log(std::fabs(target_box_height / prior_box_height));
+        if (prior_box_var) {
+          output[offset] /= prior_box_var_data[j * len];
+          output[offset + 1] /= prior_box_var_data[j * len + 1];
+          output[offset + 2] /= prior_box_var_data[j * len + 2];
+          output[offset + 3] /= prior_box_var_data[j * len + 3];
+        }
       }
     }
   }
-  void DecodeCenterSize(const framework::Tensor& target_box,
-                        const framework::Tensor& prior_box,
-                        const framework::Tensor& prior_box_var,
-                        T* output) const {
-    int64_t row = target_box.dims()[0];
-    int64_t col = prior_box.dims()[0];
-    int64_t len = prior_box.dims()[1];
-
-    auto* target_box_data = target_box.data<T>();
-    auto* prior_box_data = prior_box.data<T>();
-    auto* prior_box_var_data = prior_box_var.data<T>();
+  void DecodeCenterSize(const framework::Tensor* target_box,
+                        const framework::Tensor* prior_box,
+                        const framework::Tensor* prior_box_var,
+                        const bool normalized, T* output) const {
+    int64_t row = target_box->dims()[0];
+    int64_t col = prior_box->dims()[0];
+    int64_t len = prior_box->dims()[1];
+
+    auto* target_box_data = target_box->data<T>();
+    auto* prior_box_data = prior_box->data<T>();
+    const T* prior_box_var_data = nullptr;
+    if (prior_box_var) prior_box_var_data = prior_box_var->data<T>();
 
     for (int64_t i = 0; i < row; ++i) {
       for (int64_t j = 0; j < col; ++j) {
         size_t offset = i * col * len + j * len;
-        T prior_box_width =
-            prior_box_data[j * len + 2] - prior_box_data[j * len];
-        T prior_box_height =
-            prior_box_data[j * len + 3] - prior_box_data[j * len + 1];
+        T prior_box_width = prior_box_data[j * len + 2] -
+                            prior_box_data[j * len] + (normalized == false);
+        T prior_box_height = prior_box_data[j * len + 3] -
+                             prior_box_data[j * len + 1] +
+                             (normalized == false);
         T prior_box_center_x =
             (prior_box_data[j * len + 2] + prior_box_data[j * len]) / 2;
         T prior_box_center_y =
             (prior_box_data[j * len + 3] + prior_box_data[j * len + 1]) / 2;
 
-        T target_box_center_x = prior_box_var_data[j * len] *
+        T target_box_center_x = 0, target_box_center_y = 0;
+        T target_box_width = 0, target_box_height = 0;
+        if (prior_box_var) {
+          target_box_center_x = prior_box_var_data[j * len] *
                                     target_box_data[offset] * prior_box_width +
                                 prior_box_center_x;
-        T target_box_center_y = prior_box_var_data[j * len + 1] *
+          target_box_center_y = prior_box_var_data[j * len + 1] *
                                     target_box_data[offset + 1] *
                                     prior_box_height +
                                 prior_box_center_y;
-        T target_box_width = std::exp(prior_box_var_data[j * len + 2] *
+          target_box_width = std::exp(prior_box_var_data[j * len + 2] *
                                       target_box_data[offset + 2]) *
                              prior_box_width;
-        T target_box_height = std::exp(prior_box_var_data[j * len + 3] *
+          target_box_height = std::exp(prior_box_var_data[j * len + 3] *
                                        target_box_data[offset + 3]) *
                               prior_box_height;
+        } else {
+          target_box_center_x =
+              target_box_data[offset] * prior_box_width + prior_box_center_x;
+          target_box_center_y = target_box_data[offset + 1] * prior_box_height +
+                                prior_box_center_y;
+          target_box_width =
+              std::exp(target_box_data[offset + 2]) * prior_box_width;
+          target_box_height =
+              std::exp(target_box_data[offset + 3]) * prior_box_height;
+        }
 
         output[offset] = target_box_center_x - target_box_width / 2;
         output[offset + 1] = target_box_center_y - target_box_height / 2;
-        output[offset + 2] = target_box_center_x + target_box_width / 2;
-        output[offset + 3] = target_box_center_y + target_box_height / 2;
+        output[offset + 2] =
+            target_box_center_x + target_box_width / 2 - (normalized == false);
+        output[offset + 3] =
+            target_box_center_y + target_box_height / 2 - (normalized == false);
       }
     }
   }
@@ -139,11 +163,14 @@ class BoxCoderKernel : public framework::OpKernel<T> {
     output_box->mutable_data<T>({row, col, len}, context.GetPlace());
 
     auto code_type = GetBoxCodeType(context.Attr<std::string>("code_type"));
+    bool normalized = context.Attr<bool>("box_normalized");
     T* output = output_box->data<T>();
     if (code_type == BoxCodeType::kEncodeCenterSize) {
-      EncodeCenterSize(*target_box, *prior_box, *prior_box_var, output);
+      EncodeCenterSize(target_box, prior_box, prior_box_var, normalized,
+                       output);
     } else if (code_type == BoxCodeType::kDecodeCenterSize) {
-      DecodeCenterSize(*target_box, *prior_box, *prior_box_var, output);
+      DecodeCenterSize(target_box, prior_box, prior_box_var, normalized,
+                       output);
     }
   }
 };
diff --git a/paddle/fluid/operators/iou_similarity_op.cc b/paddle/fluid/operators/detection/iou_similarity_op.cc
similarity index 95%
rename from paddle/fluid/operators/iou_similarity_op.cc
rename to paddle/fluid/operators/detection/iou_similarity_op.cc
index 4b78ec510d1fb73592ee8af9a641622f4d713f8d..8e58605fcea04f9ffa97ce8cca53c073e7068aaf 100644
--- a/paddle/fluid/operators/iou_similarity_op.cc
+++ b/paddle/fluid/operators/detection/iou_similarity_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/iou_similarity_op.h"
+#include "paddle/fluid/operators/detection/iou_similarity_op.h"
 
 namespace paddle {
 namespace operators {
@@ -42,8 +42,7 @@ class IOUSimilarityOp : public framework::OperatorWithKernel {
 
 class IOUSimilarityOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  IOUSimilarityOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X",
              "(LoDTensor, default LoDTensor<float>) "
              "Box list X is a 2-D LoDTensor with shape [N, 4] holds N boxes, "
diff --git a/paddle/fluid/operators/iou_similarity_op.cu b/paddle/fluid/operators/detection/iou_similarity_op.cu
similarity index 92%
rename from paddle/fluid/operators/iou_similarity_op.cu
rename to paddle/fluid/operators/detection/iou_similarity_op.cu
index f40a388d62e66a110656ebb71094d46b5ac147eb..8342b4138c87e6ea1803146bac6d6954a569ef5f 100644
--- a/paddle/fluid/operators/iou_similarity_op.cu
+++ b/paddle/fluid/operators/detection/iou_similarity_op.cu
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/iou_similarity_op.h"
+#include "paddle/fluid/operators/detection/iou_similarity_op.h"
 
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
diff --git a/paddle/fluid/operators/iou_similarity_op.h b/paddle/fluid/operators/detection/iou_similarity_op.h
similarity index 100%
rename from paddle/fluid/operators/iou_similarity_op.h
rename to paddle/fluid/operators/detection/iou_similarity_op.h
diff --git a/paddle/fluid/operators/mine_hard_examples_op.cc b/paddle/fluid/operators/detection/mine_hard_examples_op.cc
similarity index 99%
rename from paddle/fluid/operators/mine_hard_examples_op.cc
rename to paddle/fluid/operators/detection/mine_hard_examples_op.cc
index 277901cff493445e1e85e92e22ea0ada0e1cba43..d4a09bae3a98e4518f9885c1e9182f7033a0d262 100644
--- a/paddle/fluid/operators/mine_hard_examples_op.cc
+++ b/paddle/fluid/operators/detection/mine_hard_examples_op.cc
@@ -253,8 +253,7 @@ class MineHardExamplesOp : public framework::OperatorWithKernel {
 
 class MineHardExamplesOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  MineHardExamplesOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput(
         "ClsLoss",
         "(Tensor, default Tensor<float>), The classification loss with shape "
diff --git a/paddle/fluid/operators/multiclass_nms_op.cc b/paddle/fluid/operators/detection/multiclass_nms_op.cc
similarity index 99%
rename from paddle/fluid/operators/multiclass_nms_op.cc
rename to paddle/fluid/operators/detection/multiclass_nms_op.cc
index a12b975326519c776c9f4a1d9f2894b4028c2440..60b93efdce810f8552374449fe5a6fc79b1a92c1 100644
--- a/paddle/fluid/operators/multiclass_nms_op.cc
+++ b/paddle/fluid/operators/detection/multiclass_nms_op.cc
@@ -309,8 +309,7 @@ class MultiClassNMSKernel : public framework::OpKernel<T> {
 
 class MultiClassNMSOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  MultiClassNMSOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("BBoxes",
              "(Tensor) A 3-D Tensor with shape [N, M, 4] represents the "
              "predicted locations of M bounding bboxes, N is the batch size. "
diff --git a/paddle/fluid/operators/detection/polygon_box_transform_op.cc b/paddle/fluid/operators/detection/polygon_box_transform_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..335e8dd470f851d8c5f6bdbc94cfc343da269034
--- /dev/null
+++ b/paddle/fluid/operators/detection/polygon_box_transform_op.cc
@@ -0,0 +1,105 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+template <typename DeviceContext, typename T>
+class PolygonBoxTransformCPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
+                   "It must use CUDAPlace.");
+    auto* in = ctx.Input<Tensor>("Input");
+    auto in_dims = in->dims();
+    const T* in_data = in->data<T>();
+    auto* out = ctx.Output<Tensor>("Output");
+    T* out_data = out->mutable_data<T>(ctx.GetPlace());
+
+    int batch_size = in_dims[0];
+    int geo_channel = in_dims[1];
+    int height = in_dims[2];
+    int width = in_dims[3];
+    int id = 0;
+    for (int id_n = 0; id_n < batch_size * geo_channel; ++id_n) {
+      for (int id_h = 0; id_h < height; ++id_h) {
+        for (int id_w = 0; id_w < width; ++id_w) {
+          id = id_n * height * width + width * id_h + id_w;
+          if (id_n % 2 == 0) {
+            out_data[id] = id_w - in_data[id];
+          } else {
+            out_data[id] = id_h - in_data[id];
+          }
+        }
+      }
+    }
+  }
+};
+
+class PolygonBoxTransformOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(
+        ctx->HasInput("Input"),
+        "Input (Input) of polygon_box transform op should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("Output"),
+        "Output (Output) of polygon_box transform op should not be null.");
+
+    auto in_dim = ctx->GetInputDim("Input");
+
+    PADDLE_ENFORCE_EQ(in_dim.size(), 4, "input's rank must be 4.");
+    PADDLE_ENFORCE_EQ(in_dim[1] % 2, 0,
+                      "input's second dimension must be even.");
+
+    ctx->SetOutputDim("Output", in_dim);
+  }
+};
+
+class PolygonBoxTransformOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput(
+        "Input",
+        "The input with shape [batch_size, geometry_channels, height, width]");
+    AddOutput("Output", "The output with the same shape as input");
+
+    AddComment(R"DOC(
+PolygonBoxTransform Operator.
+The input is the final geometry output in detection network.
+We use 2*n numbers to denote the coordinate shift from n corner vertices of
+the polygon_box to the pixel location. As each distance offset contains two numbers (xi, yi),
+the geometry output contains 2*n channels.
+PolygonBoxTransform Operator is used to transform the coordinate shift to the real coordinate.
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(polygon_box_transform, ops::PolygonBoxTransformOp,
+                  ops::PolygonBoxTransformOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
+REGISTER_OP_CPU_KERNEL(
+    polygon_box_transform,
+    ops::PolygonBoxTransformCPUKernel<paddle::platform::CPUPlace, float>,
+    ops::PolygonBoxTransformCPUKernel<paddle::platform::CPUPlace, double>);
diff --git a/paddle/fluid/operators/detection/polygon_box_transform_op.cu b/paddle/fluid/operators/detection/polygon_box_transform_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..6187ac6622c65d2bbc525c3fe2cb397cf74ac612
--- /dev/null
+++ b/paddle/fluid/operators/detection/polygon_box_transform_op.cu
@@ -0,0 +1,76 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/platform/cuda_primitives.h"
+#include "paddle/fluid/platform/gpu_info.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using platform::PADDLE_CUDA_NUM_THREADS;
+#define CUDA_BLOCK_SIZE 16
+
+template <typename T>
+__global__ void PolygonBoxTransformKernel(const int n, const int h, const int w,
+                                          const T* input, T* output) {
+  int id_n = threadIdx.x + blockDim.x * blockIdx.x;
+  int id_h = threadIdx.y + blockDim.y * blockIdx.y;
+  int id_w = threadIdx.z + blockDim.z * blockIdx.z;
+  if (id_n < n && id_h < h && id_w < w) {
+    int id = id_n * h * w + w * id_h + id_w;
+    if (id_n % 2 == 0) {
+      output[id] = id_w - input[id];
+    } else {
+      output[id] = id_h - input[id];
+    }
+  }
+}
+
+template <typename T>
+class PolygonBoxTransformOpCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
+                   "It must use CUDAPlace.");
+    auto* in = ctx.Input<Tensor>("Input");
+    auto in_dims = in->dims();
+    const T* in_data = in->data<T>();
+    auto* out = ctx.Output<Tensor>("Output");
+    T* out_data = out->mutable_data<T>(ctx.GetPlace());
+
+    int batch_size = in_dims[0];
+    int geo_channels = in_dims[1];
+    int height = in_dims[2];
+    int width = in_dims[3];
+    dim3 threadsPerBlock(
+        PADDLE_CUDA_NUM_THREADS / (CUDA_BLOCK_SIZE * CUDA_BLOCK_SIZE),
+        CUDA_BLOCK_SIZE, CUDA_BLOCK_SIZE);
+    dim3 numBlocks((batch_size * geo_channels) / threadsPerBlock.x,
+                   (height + threadsPerBlock.y - 1) / threadsPerBlock.y,
+                   (width + threadsPerBlock.z - 1) / threadsPerBlock.z);
+    auto stream = ctx.cuda_device_context().stream();
+    PolygonBoxTransformKernel<T><<<numBlocks, threadsPerBlock, 0, stream>>>(
+        batch_size * geo_channels, height, width, in_data, out_data);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OP_CUDA_KERNEL(
+    polygon_box_transform,
+    paddle::operators::PolygonBoxTransformOpCUDAKernel<float>,
+    paddle::operators::PolygonBoxTransformOpCUDAKernel<double>);
diff --git a/paddle/fluid/operators/prior_box_op.cc b/paddle/fluid/operators/detection/prior_box_op.cc
similarity index 97%
rename from paddle/fluid/operators/prior_box_op.cc
rename to paddle/fluid/operators/detection/prior_box_op.cc
index 058b13eeb872aaa77a88da37db64a6d59fbdd1cf..4e35c38e4e03d4d0f00601812fdc4803519b89ae 100644
--- a/paddle/fluid/operators/prior_box_op.cc
+++ b/paddle/fluid/operators/detection/prior_box_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/prior_box_op.h"
+#include "paddle/fluid/operators/detection/prior_box_op.h"
 
 namespace paddle {
 namespace operators {
@@ -79,8 +79,7 @@ class PriorBoxOp : public framework::OperatorWithKernel {
 
 class PriorBoxOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  PriorBoxOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("Input",
              "(Tensor, default Tensor<float>), "
              "the input feature data of PriorBoxOp, The layout is NCHW.");
diff --git a/paddle/fluid/operators/prior_box_op.cu b/paddle/fluid/operators/detection/prior_box_op.cu
similarity index 99%
rename from paddle/fluid/operators/prior_box_op.cu
rename to paddle/fluid/operators/detection/prior_box_op.cu
index 0ea8909296f8f52d252b0ec258666cf32d69a8bb..f67e6ca91c0852b5a3be35d23246884d1157caa4 100644
--- a/paddle/fluid/operators/prior_box_op.cu
+++ b/paddle/fluid/operators/detection/prior_box_op.cu
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/prior_box_op.h"
+#include "paddle/fluid/operators/detection/prior_box_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/prior_box_op.h b/paddle/fluid/operators/detection/prior_box_op.h
similarity index 100%
rename from paddle/fluid/operators/prior_box_op.h
rename to paddle/fluid/operators/detection/prior_box_op.h
diff --git a/paddle/fluid/operators/target_assign_op.cc b/paddle/fluid/operators/detection/target_assign_op.cc
similarity index 97%
rename from paddle/fluid/operators/target_assign_op.cc
rename to paddle/fluid/operators/detection/target_assign_op.cc
index 33ff967e5e8f5afbaa62ba39ce596687ae0a71cd..367001939251114a9cf442fd85c734958ccb2da8 100644
--- a/paddle/fluid/operators/target_assign_op.cc
+++ b/paddle/fluid/operators/detection/target_assign_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/target_assign_op.h"
+#include "paddle/fluid/operators/detection/target_assign_op.h"
 
 namespace paddle {
 namespace operators {
@@ -65,8 +65,7 @@ class TargetAssignOp : public framework::OperatorWithKernel {
 
 class TargetAssignOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  TargetAssignOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X",
              "(LoDTensor), This input is a 3D LoDTensor with shape [M, P, K]. "
              "Some elements in X will be assigned to Out based on the "
diff --git a/paddle/fluid/operators/target_assign_op.cu b/paddle/fluid/operators/detection/target_assign_op.cu
similarity index 97%
rename from paddle/fluid/operators/target_assign_op.cu
rename to paddle/fluid/operators/detection/target_assign_op.cu
index 24664f99b20f92108220d27ec58e8fdf3ba6193c..ddf6889942355457fb281b6c33430ab8337db3ed 100644
--- a/paddle/fluid/operators/target_assign_op.cu
+++ b/paddle/fluid/operators/detection/target_assign_op.cu
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/target_assign_op.h"
+#include "paddle/fluid/operators/detection/target_assign_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/target_assign_op.h b/paddle/fluid/operators/detection/target_assign_op.h
similarity index 100%
rename from paddle/fluid/operators/target_assign_op.h
rename to paddle/fluid/operators/detection/target_assign_op.h
diff --git a/paddle/fluid/operators/detection_map_op.cc b/paddle/fluid/operators/detection_map_op.cc
index 38f43b6d031372948bd82c686a2d9ce5f8ecd07c..716c8625d35308f98582e6802e90d99d643e188b 100644
--- a/paddle/fluid/operators/detection_map_op.cc
+++ b/paddle/fluid/operators/detection_map_op.cc
@@ -51,7 +51,8 @@ class DetectionMAPOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE_EQ(label_dims.size(), 2,
                       "The rank of Input(Label) must be 2, "
                       "the shape is [N, 6].");
-    PADDLE_ENFORCE_EQ(label_dims[1], 6, "The shape is of Input(Label) [N, 6].");
+    PADDLE_ENFORCE(label_dims[1] == 6 || label_dims[1] == 5,
+                   "The shape of Input(Label) is [N, 6] or [N, 5].");
 
     if (ctx->HasInput("PosCount")) {
       PADDLE_ENFORCE(ctx->HasInput("TruePos"),
@@ -78,8 +79,7 @@ class DetectionMAPOp : public framework::OperatorWithKernel {
 
 class DetectionMAPOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  DetectionMAPOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("DetectRes",
              "(LoDTensor) A 2-D LoDTensor with shape [M, 6] represents the "
              "detections. Each row has 6 values: "
@@ -89,9 +89,10 @@ class DetectionMAPOpMaker : public framework::OpProtoAndCheckerMaker {
              "offset is N + 1, if LoD[i + 1] - LoD[i] == 0, means there is "
              "no detected data.");
     AddInput("Label",
-             "(LoDTensor) A 2-D LoDTensor with shape[N, 6] represents the"
+             "(LoDTensor) A 2-D LoDTensor represents the"
              "Labeled ground-truth data. Each row has 6 values: "
-             "[label, is_difficult, xmin, ymin, xmax, ymax], N is the total "
+             "[label, xmin, ymin, xmax, ymax, is_difficult] or 5 values: "
+             "[label, xmin, ymin, xmax, ymax], where N is the total "
              "number of ground-truth data in this mini-batch. For each "
              "instance, the offsets in first dimension are called LoD, "
              "the number of offset is N + 1, if LoD[i + 1] - LoD[i] == 0, "
diff --git a/paddle/fluid/operators/detection_map_op.h b/paddle/fluid/operators/detection_map_op.h
index 431812e2bfcf926cadf8d7be6a7d1a79e78c7762..dd1ab85fd8d0c8170afcd9dd2a49ee55c41dc8be 100644
--- a/paddle/fluid/operators/detection_map_op.h
+++ b/paddle/fluid/operators/detection_map_op.h
@@ -72,7 +72,7 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
     auto* out_false_pos = ctx.Output<framework::LoDTensor>("AccumFalsePos");
 
     float overlap_threshold = ctx.Attr<float>("overlap_threshold");
-    float evaluate_difficult = ctx.Attr<bool>("evaluate_difficult");
+    bool evaluate_difficult = ctx.Attr<bool>("evaluate_difficult");
     auto ap_type = GetAPType(ctx.Attr<std::string>("ap_type"));
     int class_num = ctx.Attr<int>("class_num");
 
@@ -175,14 +175,20 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
     for (int n = 0; n < batch_size; ++n) {
       std::map<int, std::vector<Box>> boxes;
       for (size_t i = label_index[n]; i < label_index[n + 1]; ++i) {
-        Box box(labels(i, 2), labels(i, 3), labels(i, 4), labels(i, 5));
         int label = labels(i, 0);
-        auto is_difficult = labels(i, 1);
-        if (std::abs(is_difficult - 0.0) < 1e-6)
-          box.is_difficult = false;
-        else
-          box.is_difficult = true;
-        boxes[label].push_back(box);
+        if (input_label.dims()[1] == 6) {
+          Box box(labels(i, 2), labels(i, 3), labels(i, 4), labels(i, 5));
+          auto is_difficult = labels(i, 1);
+          if (std::abs(is_difficult - 0.0) < 1e-6)
+            box.is_difficult = false;
+          else
+            box.is_difficult = true;
+          boxes[label].push_back(box);
+        } else {
+          PADDLE_ENFORCE_EQ(input_label.dims()[1], 5);
+          Box box(labels(i, 1), labels(i, 2), labels(i, 3), labels(i, 4));
+          boxes[label].push_back(box);
+        }
       }
       gt_boxes->push_back(boxes);
     }
diff --git a/paddle/fluid/operators/dropout_op.cc b/paddle/fluid/operators/dropout_op.cc
index 4ed1b548840fabd2383632beb5f35fa6aa096443..07322e720f26213ea777be3cd22f2fead28507f0 100644
--- a/paddle/fluid/operators/dropout_op.cc
+++ b/paddle/fluid/operators/dropout_op.cc
@@ -37,8 +37,7 @@ class DropoutOp : public framework::OperatorWithKernel {
 
 class DropoutOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  DropoutOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X", "The input of dropout op.");
     AddOutput("Out", "The output of dropout op.");
     AddOutput("Mask", "The random sampled dropout mask.").AsIntermediate();
diff --git a/paddle/fluid/operators/edit_distance_op.cc b/paddle/fluid/operators/edit_distance_op.cc
index c7f037d2df4372d0c4e3a261c0dff1fd6704d182..de25a3dab53492e38a92fbcf07ccbe43f7546950 100644
--- a/paddle/fluid/operators/edit_distance_op.cc
+++ b/paddle/fluid/operators/edit_distance_op.cc
@@ -49,8 +49,7 @@ class EditDistanceOp : public framework::OperatorWithKernel {
 
 class EditDistanceOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  EditDistanceOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("Hyps",
              "(2-D LoDTensor<int64_t>, 2nd dim. equal to 1) "
              "The indices for hypothesis strings.");
diff --git a/paddle/fluid/operators/elementwise_add_op.cc b/paddle/fluid/operators/elementwise_add_op.cc
index 4aab54f60236ecc5fa7f70e22f1553c3bfe68198..d2c20537136fc3ac9d1bece24a2238f26215c922 100644
--- a/paddle/fluid/operators/elementwise_add_op.cc
+++ b/paddle/fluid/operators/elementwise_add_op.cc
@@ -14,26 +14,8 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/elementwise_add_op.h"
 #include "paddle/fluid/operators/elementwise_op.h"
-
-namespace paddle {
-namespace operators {
-class ElementwiseAddOpMaker : public ElementwiseOpMaker {
- public:
-  ElementwiseAddOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : ElementwiseOpMaker(proto, op_checker) {
-    SetComment("Add", "Out = X + Y");
-    AddComment(comment_);
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-
 namespace ops = paddle::operators;
-REGISTER_OPERATOR(elementwise_add, ops::ElementwiseOp,
-                  ops::ElementwiseAddOpMaker, ops::ElementwiseOpInferVarType,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
-REGISTER_OPERATOR(elementwise_add_grad, ops::ElementwiseOpGrad);
-
+REGISTER_ELEMWISE_OP(elementwise_add, "Add", "Out = X + Y");
 REGISTER_OP_CPU_KERNEL(
     elementwise_add,
     ops::ElementwiseAddKernel<paddle::platform::CPUDeviceContext, float>,
diff --git a/paddle/fluid/operators/elementwise_add_op.h b/paddle/fluid/operators/elementwise_add_op.h
index 253964562c8d34e0fda3b4760761206895f749aa..baf04c30b17cb333fc8a6544afd6c479442f835b 100644
--- a/paddle/fluid/operators/elementwise_add_op.h
+++ b/paddle/fluid/operators/elementwise_add_op.h
@@ -14,7 +14,9 @@ limitations under the License. */
 
 #pragma once
 
+#include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/operators/elementwise_op_function.h"
+#include "paddle/fluid/operators/math/blas.h"
 
 namespace paddle {
 namespace operators {
@@ -24,19 +26,57 @@ struct AddFunctor {
   inline HOSTDEVICE T operator()(T a, T b) const { return a + b; }
 };
 
+template <typename DeviceContext, typename T>
+void default_elementwise_add(const framework::ExecutionContext& ctx,
+                             const framework::Tensor* x,
+                             const framework::Tensor* y, framework::Tensor* z) {
+  int axis = ctx.Attr<int>("axis");
+  ElementwiseComputeEx<AddFunctor<T>, DeviceContext, T>(ctx, x, y, axis,
+                                                        AddFunctor<T>(), z);
+}
+
+template <typename DeviceContext, typename T>
+typename std::enable_if<
+    std::is_floating_point<T>::value &&
+    std::is_same<DeviceContext, platform::CPUDeviceContext>::value>::type
+elementwise_add(const framework::ExecutionContext& ctx,
+                const framework::Tensor* x, const framework::Tensor* y,
+                framework::Tensor* z) {
+  auto eigen_x = framework::EigenVector<T>::Flatten(*x);
+  auto eigen_y = framework::EigenVector<T>::Flatten(*y);
+  auto eigen_z = framework::EigenVector<T>::Flatten(*z);
+
+  auto blas = math::GetBlas<DeviceContext, T>(ctx);
+  blas.VADD(x->numel(), eigen_x.data(), eigen_y.data(), eigen_z.data());
+}
+
+template <typename DeviceContext, typename T>
+typename std::enable_if<
+    !std::is_floating_point<T>::value ||
+    !std::is_same<DeviceContext, platform::CPUDeviceContext>::value>::type
+elementwise_add(const framework::ExecutionContext& ctx,
+                const framework::Tensor* x, const framework::Tensor* y,
+                framework::Tensor* z) {
+  default_elementwise_add<DeviceContext, T>(ctx, x, y, z);
+}
+
 template <typename DeviceContext, typename T>
 class ElementwiseAddKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     using Tensor = framework::Tensor;
 
-    auto* x = ctx.Input<Tensor>("X");
-    auto* y = ctx.Input<Tensor>("Y");
-    auto* z = ctx.Output<Tensor>("Out");
+    const auto x = ctx.Input<Tensor>("X");
+    const auto y = ctx.Input<Tensor>("Y");
+    auto z = ctx.Output<Tensor>("Out");
     z->mutable_data<T>(ctx.GetPlace());
-    int axis = ctx.Attr<int>("axis");
-    ElementwiseComputeEx<AddFunctor<T>, DeviceContext, T>(ctx, x, y, axis,
-                                                          AddFunctor<T>(), z);
+
+    auto dims_equal = x->dims() == y->dims();
+    if (dims_equal) {
+      elementwise_add<DeviceContext, T>(ctx, x, y, z);
+    } else {
+      default_elementwise_add<DeviceContext, T>(ctx, x, y, z);
+    }
   }
 };
 
@@ -45,6 +85,55 @@ struct IdentityGrad {
   HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout; }
 };
 
+template <typename DeviceContext, typename T>
+void default_elementwise_add_grad(const framework::ExecutionContext& ctx,
+                                  const framework::Tensor* x,
+                                  const framework::Tensor* y,
+                                  const framework::Tensor* out,
+                                  const framework::Tensor* dout,
+                                  framework::Tensor* dx,
+                                  framework::Tensor* dy) {
+  int axis = ctx.Attr<int>("axis");
+
+  ElemwiseGradCompute<DeviceContext, T, IdentityGrad<T>, IdentityGrad<T>>(
+      ctx, *x, *y, *out, *dout, axis, dx, dy, IdentityGrad<T>(),
+      IdentityGrad<T>());
+}
+
+template <typename DeviceContext, typename T>
+typename std::enable_if<
+    std::is_floating_point<T>::value &&
+    std::is_same<DeviceContext, platform::CPUDeviceContext>::value>::type
+elementwise_add_grad(const framework::ExecutionContext& ctx,
+                     const framework::Tensor* x, const framework::Tensor* y,
+                     const framework::Tensor* out,
+                     const framework::Tensor* dout, framework::Tensor* dx,
+                     framework::Tensor* dy) {
+  auto blas = math::GetBlas<DeviceContext, T>(ctx);
+
+  if (dx) {
+    blas.VCOPY(dout->numel(), dout->data<T>(),
+               dx->mutable_data<T>(ctx.GetPlace()));
+  }
+
+  if (dy) {
+    blas.VCOPY(dout->numel(), dout->data<T>(),
+               dy->mutable_data<T>(ctx.GetPlace()));
+  }
+}
+
+template <typename DeviceContext, typename T>
+typename std::enable_if<
+    !std::is_floating_point<T>::value ||
+    !std::is_same<DeviceContext, platform::CPUDeviceContext>::value>::type
+elementwise_add_grad(const framework::ExecutionContext& ctx,
+                     const framework::Tensor* x, const framework::Tensor* y,
+                     const framework::Tensor* out,
+                     const framework::Tensor* dout, framework::Tensor* dx,
+                     framework::Tensor* dy) {
+  default_elementwise_add_grad<DeviceContext, T>(ctx, x, y, out, dout, dx, dy);
+}
+
 template <typename DeviceContext, typename T>
 class ElementwiseAddGradKernel : public framework::OpKernel<T> {
  public:
@@ -57,10 +146,13 @@ class ElementwiseAddGradKernel : public framework::OpKernel<T> {
     auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
     auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
     auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
-    int axis = ctx.Attr<int>("axis");
-    ElemwiseGradCompute<DeviceContext, T, IdentityGrad<T>, IdentityGrad<T>>(
-        ctx, *x, *y, *out, *dout, axis, dx, dy, IdentityGrad<T>(),
-        IdentityGrad<T>());
+
+    if (platform::is_cpu_place(ctx.GetPlace()) && (x->dims() == y->dims())) {
+      elementwise_add_grad<DeviceContext, T>(ctx, x, y, out, dout, dx, dy);
+    } else {
+      default_elementwise_add_grad<DeviceContext, T>(ctx, x, y, out, dout, dx,
+                                                     dy);
+    }
   }
 };
 
diff --git a/paddle/fluid/operators/elementwise_div_op.cc b/paddle/fluid/operators/elementwise_div_op.cc
index c7ddafcad1d1f6c14791fde665f43881d6b49836..824b1221e5a77c8799dc34820b7f0db180c2439e 100644
--- a/paddle/fluid/operators/elementwise_div_op.cc
+++ b/paddle/fluid/operators/elementwise_div_op.cc
@@ -14,26 +14,8 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/elementwise_div_op.h"
 #include "paddle/fluid/operators/elementwise_op.h"
-
-namespace paddle {
-namespace operators {
-class ElementwiseDivOpMaker : public ElementwiseOpMaker {
- public:
-  ElementwiseDivOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : ElementwiseOpMaker(proto, op_checker) {
-    SetComment("Div", "Out = X / Y");
-    AddComment(comment_);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
 namespace ops = paddle::operators;
-REGISTER_OPERATOR(elementwise_div, ops::ElementwiseOp,
-                  ops::ElementwiseDivOpMaker,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
-REGISTER_OPERATOR(elementwise_div_grad, ops::ElementwiseOpGrad);
+REGISTER_ELEMWISE_OP(elementwise_div, "Div", "Out = X / Y");
 REGISTER_OP_CPU_KERNEL(
     elementwise_div,
     ops::ElementwiseDivKernel<paddle::platform::CPUDeviceContext, float>,
diff --git a/paddle/fluid/operators/elementwise_max_op.cc b/paddle/fluid/operators/elementwise_max_op.cc
index a4fe386bb1907bf7c0099d2b1109077b21146948..411671335a19ae2283ca9db8b8f6bcbb6a6b630a 100644
--- a/paddle/fluid/operators/elementwise_max_op.cc
+++ b/paddle/fluid/operators/elementwise_max_op.cc
@@ -14,25 +14,8 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/elementwise_max_op.h"
 #include "paddle/fluid/operators/elementwise_op.h"
-
-namespace paddle {
-namespace operators {
-class ElementwiseMaxOpMaker : public ElementwiseOpMaker {
- public:
-  ElementwiseMaxOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : ElementwiseOpMaker(proto, op_checker) {
-    SetComment("Max", "Out = max(X, Y)");
-    AddComment(comment_);
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-
 namespace ops = paddle::operators;
-REGISTER_OPERATOR(elementwise_max, ops::ElementwiseOp,
-                  ops::ElementwiseMaxOpMaker,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
-REGISTER_OPERATOR(elementwise_max_grad, ops::ElementwiseOpGrad);
+REGISTER_ELEMWISE_OP(elementwise_max, "Max", "Out = max(X, Y)");
 REGISTER_OP_CPU_KERNEL(
     elementwise_max,
     ops::ElementwiseMaxKernel<paddle::platform::CPUDeviceContext, float>,
diff --git a/paddle/fluid/operators/elementwise_min_op.cc b/paddle/fluid/operators/elementwise_min_op.cc
index 68cd6ddb4a938b2b1c33e3f89c6d1151acb27f48..816192083d2275b26e6dd9afc76f2c021a01cf73 100644
--- a/paddle/fluid/operators/elementwise_min_op.cc
+++ b/paddle/fluid/operators/elementwise_min_op.cc
@@ -14,25 +14,8 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/elementwise_min_op.h"
 #include "paddle/fluid/operators/elementwise_op.h"
-
-namespace paddle {
-namespace operators {
-class ElementwiseMinOpMaker : public ElementwiseOpMaker {
- public:
-  ElementwiseMinOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : ElementwiseOpMaker(proto, op_checker) {
-    SetComment("Max", "Out = min(X, Y)");
-    AddComment(comment_);
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-
 namespace ops = paddle::operators;
-REGISTER_OPERATOR(elementwise_min, ops::ElementwiseOp,
-                  ops::ElementwiseMinOpMaker,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
-REGISTER_OPERATOR(elementwise_min_grad, ops::ElementwiseOpGrad);
+REGISTER_ELEMWISE_OP(elementwise_min, "Min", "Out = min(X, Y)");
 REGISTER_OP_CPU_KERNEL(
     elementwise_min,
     ops::ElementwiseMinKernel<paddle::platform::CPUDeviceContext, float>,
diff --git a/paddle/fluid/operators/elementwise_mul_op.cc b/paddle/fluid/operators/elementwise_mul_op.cc
index 2dec27136ad57ea032d5abb51799bd04ccc0b2e3..ba343909bb87b4f2efa56c0a4ff664b278e90c60 100644
--- a/paddle/fluid/operators/elementwise_mul_op.cc
+++ b/paddle/fluid/operators/elementwise_mul_op.cc
@@ -14,27 +14,8 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/elementwise_mul_op.h"
 #include "paddle/fluid/operators/elementwise_op.h"
-
-namespace paddle {
-namespace operators {
-
-class ElementwiseMulOpMaker : public ElementwiseOpMaker {
- public:
-  ElementwiseMulOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : ElementwiseOpMaker(proto, op_checker) {
-    SetComment("Mul", "Out = X \\odot\\ Y");
-    AddComment(comment_);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
 namespace ops = paddle::operators;
-REGISTER_OPERATOR(elementwise_mul, ops::ElementwiseOp,
-                  ops::ElementwiseMulOpMaker,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
-REGISTER_OPERATOR(elementwise_mul_grad, ops::ElementwiseOpGrad);
+REGISTER_ELEMWISE_OP(elementwise_mul, "Mul", "Out = X \\odot\\ Y");
 REGISTER_OP_CPU_KERNEL(
     elementwise_mul,
     ops::ElementwiseMulKernel<paddle::platform::CPUDeviceContext, float>,
diff --git a/paddle/fluid/operators/elementwise_op.h b/paddle/fluid/operators/elementwise_op.h
index a33634ab2503f988a8a692682ddb238d4794a3c0..f4cec8ad971abebe8d6dff1a384c8414269148a5 100644
--- a/paddle/fluid/operators/elementwise_op.h
+++ b/paddle/fluid/operators/elementwise_op.h
@@ -46,16 +46,17 @@ class ElementwiseOpInferVarType : public framework::VarTypeInference {
  public:
   void operator()(const framework::OpDesc& op_desc,
                   framework::BlockDesc* block) const override {
-    auto x_var = op_desc.Input("X")[0];
-    auto out_var = op_desc.Output("Out")[0];
-    block->Var(out_var)->SetType(block->Var(x_var)->GetType());
+    auto x_name = op_desc.Input("X")[0];
+    auto out_name = op_desc.Output("Out")[0];
+    auto& x = block->FindRecursiveOrCreateVar(x_name);
+    auto& out = block->FindRecursiveOrCreateVar(out_name);
+    out.SetType(x.GetType());
   }
 };
 
 class ElementwiseOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  ElementwiseOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() final {
     AddInput("X", "(Tensor), The first input tensor of elementwise op.");
     AddInput("Y", "(Tensor), The second input tensor of elementwise op.");
     AddOutput("Out", "The output of elementwise op.");
@@ -64,12 +65,12 @@ class ElementwiseOpMaker : public framework::OpProtoAndCheckerMaker {
                  "for broadcasting Y onto X.")
         .SetDefault(-1)
         .EqualGreaterThan(-1);
-    comment_ = R"DOC(
-Limited Elementwise {name} Operator.
+    AddComment(string::Sprintf(R"DOC(
+Limited Elementwise %s Operator.
 
 The equation is:
 
-$${equation}$$
+$$%s$$
 
 $X$ is a tensor of any dimension and the dimensions of tensor $Y$ must be
 smaller than or equal to the dimensions of $X$.
@@ -100,26 +101,13 @@ For example
 Either of the inputs $X$ and $Y$ or none can carry the LoD (Level of Details)
 information. However, the output only shares the LoD information with input $X$.
 
-)DOC";
-    AddComment(comment_);
+)DOC",
+                               GetName(), GetEquation()));
   }
 
  protected:
-  std::string comment_;
-
-  void Replace(std::string* src, std::string from, std::string to) {
-    std::size_t len_from = std::strlen(from.c_str());
-    std::size_t len_to = std::strlen(to.c_str());
-    for (std::size_t pos = src->find(from); pos != std::string::npos;
-         pos = src->find(from, pos + len_to)) {
-      src->replace(pos, len_from, to);
-    }
-  }
-
-  void SetComment(std::string name, std::string equation) {
-    Replace(&comment_, "{name}", name);
-    Replace(&comment_, "{equation}", equation);
-  }
+  virtual std::string GetName() const = 0;
+  virtual std::string GetEquation() const = 0;
 };
 
 class ElementwiseOpGrad : public framework::OperatorWithKernel {
@@ -152,3 +140,16 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel {
 };
 }  // namespace operators
 }  // namespace paddle
+
+#define REGISTER_ELEMWISE_OP(op_type, op_name, equation)                \
+  class __ElemwiseOp##op_type##Maker__                                  \
+      : public ::paddle::operators::ElementwiseOpMaker {                \
+   protected:                                                           \
+    virtual std::string GetName() const { return op_name; }             \
+    virtual std::string GetEquation() const { return equation; }        \
+  };                                                                    \
+  REGISTER_OPERATOR(op_type, ::paddle::operators::ElementwiseOp,        \
+                    __ElemwiseOp##op_type##Maker__,                     \
+                    ::paddle::operators::ElementwiseOpInferVarType,     \
+                    ::paddle::framework::DefaultGradOpDescMaker<true>); \
+  REGISTER_OPERATOR(op_type##_grad, ::paddle::operators::ElementwiseOpGrad)
diff --git a/paddle/fluid/operators/elementwise_pow_op.cc b/paddle/fluid/operators/elementwise_pow_op.cc
index 60302c5e59f8ce595861405713045b05d90002e3..5fd6bde9ba0930e29f2161f1ff23ff9f5e7dc85d 100644
--- a/paddle/fluid/operators/elementwise_pow_op.cc
+++ b/paddle/fluid/operators/elementwise_pow_op.cc
@@ -13,17 +13,15 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/elementwise_pow_op.h"
+#include <string>
 #include "paddle/fluid/operators/elementwise_op.h"
 
 namespace paddle {
 namespace operators {
 class ElementwisePowOpMaker : public ElementwiseOpMaker {
- public:
-  ElementwisePowOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : ElementwiseOpMaker(proto, op_checker) {
-    SetComment("Pow", "Out = X ^ Y");
-    AddComment(comment_);
-  }
+ protected:
+  std::string GetName() const override { return "Pow"; }
+  std::string GetEquation() const override { return "Out = X ^ Y"; }
 };
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/elementwise_sub_op.cc b/paddle/fluid/operators/elementwise_sub_op.cc
index 9d0598fc39a3922fa830f18729d90a7dac6a890b..a7562b166b373ee2a8c9b6f379431d88d3e45fcb 100644
--- a/paddle/fluid/operators/elementwise_sub_op.cc
+++ b/paddle/fluid/operators/elementwise_sub_op.cc
@@ -14,25 +14,8 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/elementwise_sub_op.h"
 #include "paddle/fluid/operators/elementwise_op.h"
-
-namespace paddle {
-namespace operators {
-class ElementwiseSubOpMaker : public ElementwiseOpMaker {
- public:
-  ElementwiseSubOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : ElementwiseOpMaker(proto, op_checker) {
-    SetComment("Sub", "Out = X - Y");
-    AddComment(comment_);
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-
 namespace ops = paddle::operators;
-REGISTER_OPERATOR(elementwise_sub, ops::ElementwiseOp,
-                  ops::ElementwiseSubOpMaker,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
-REGISTER_OPERATOR(elementwise_sub_grad, ops::ElementwiseOpGrad);
+REGISTER_ELEMWISE_OP(elementwise_sub, "Sub", "Out = X - Y");
 REGISTER_OP_CPU_KERNEL(
     elementwise_sub,
     ops::ElementwiseSubKernel<paddle::platform::CPUDeviceContext, float>,
diff --git a/paddle/fluid/operators/expand_op.cc b/paddle/fluid/operators/expand_op.cc
index 4ae91d074d3df8b910a7f5d816a22b6f1d51dff6..5ad0ec251328cc1ba580026bb47bf05316e7dc77 100644
--- a/paddle/fluid/operators/expand_op.cc
+++ b/paddle/fluid/operators/expand_op.cc
@@ -56,8 +56,7 @@ class ExpandOp : public framework::OperatorWithKernel {
 
 class ExpandOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  ExpandOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X",
              "(Tensor, default Tensor<float>). A tensor with rank in [1, 6]."
              "X is the input to be expanded.");
diff --git a/paddle/fluid/operators/fake_dequantize_op.cc b/paddle/fluid/operators/fake_dequantize_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..43f949111104ee56efc8625bdd609e412ef7f37d
--- /dev/null
+++ b/paddle/fluid/operators/fake_dequantize_op.cc
@@ -0,0 +1,76 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/fake_dequantize_op.h"
+#include <string>
+
+namespace paddle {
+namespace operators {
+
+class FakeDequantizeMaxAbsOp : public framework::OperatorWithKernel {
+ public:
+  FakeDequantizeMaxAbsOp(const std::string &type,
+                         const framework::VariableNameMap &inputs,
+                         const framework::VariableNameMap &outputs,
+                         const framework::AttributeMap &attrs)
+      : OperatorWithKernel(type, inputs, outputs, attrs) {}
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of FakeDequantizeMaxAbsOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of FakeDequantizeMaxAbsOp should not be null.");
+    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
+    ctx->ShareLoD("X", /*->*/ "Out");
+  }
+};
+
+class FakeDequantizeMaxAbsOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X",
+             "(Tensor) The input with float-32/64 type is the "
+             "low precision tensor.");
+    AddOutput("Out",
+              "(Tensor) The output is the dequantized high "
+              "precision tensor.");
+    AddAttr<int>("num_bits",
+                 "(int) `num_bits` is the quantization level bits, "
+                 "such as 2, 5, 8.");
+    AddAttr<float>("scale",
+                   "(float) The maximum absolute value of low precision tensor."
+                   "It is usually calculated by the fake_quantize_max_abs_op.");
+    AddComment(R"DOC(
+FakeDequantizeMaxAbsOp operator.
+
+This calculation is an opposite operation of FakeQuantizeMaxAbsOp:
+
+$$Out = \frac{scale*X}{2^{num_bits} - 1}$$
+
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+using CPU = paddle::platform::CPUDeviceContext;
+
+REGISTER_OPERATOR(fake_dequantize_max_abs, ops::FakeDequantizeMaxAbsOp,
+                  ops::FakeDequantizeMaxAbsOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
+REGISTER_OP_CPU_KERNEL(fake_dequantize_max_abs,
+                       ops::FakeDequantizeMaxAbsKernel<CPU, float>,
+                       ops::FakeDequantizeMaxAbsKernel<CPU, double>);
diff --git a/paddle/fluid/operators/matmul_op.cu.cc b/paddle/fluid/operators/fake_dequantize_op.cu
similarity index 68%
rename from paddle/fluid/operators/matmul_op.cu.cc
rename to paddle/fluid/operators/fake_dequantize_op.cu
index e021bbe645399e410cde5c3ff7035d4d68c71744..1bd38d1bd2c3a6f90d2fbad415d61efaead3afe9 100644
--- a/paddle/fluid/operators/matmul_op.cu.cc
+++ b/paddle/fluid/operators/fake_dequantize_op.cu
@@ -12,11 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/matmul_op.h"
+#include "paddle/fluid/operators/fake_dequantize_op.h"
 
 namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    matmul, ops::MatMulKernel<paddle::platform::CUDADeviceContext, float>);
-REGISTER_OP_CUDA_KERNEL(
-    matmul_grad,
-    ops::MatMulGradKernel<paddle::platform::CUDADeviceContext, float>);
+using CUDA = paddle::platform::CUDADeviceContext;
+REGISTER_OP_CUDA_KERNEL(fake_dequantize_max_abs,
+                        ops::FakeDequantizeMaxAbsKernel<CUDA, float>,
+                        ops::FakeDequantizeMaxAbsKernel<CUDA, double>);
diff --git a/paddle/fluid/operators/fake_dequantize_op.h b/paddle/fluid/operators/fake_dequantize_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..0901e68b3761159c3cc9c6684567bee38ec3f16d
--- /dev/null
+++ b/paddle/fluid/operators/fake_dequantize_op.h
@@ -0,0 +1,42 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+template <typename DeviceContext, typename T>
+class FakeDequantizeMaxAbsKernel : public framework::OpKernel<T> {
+ public:
+  virtual void Compute(const framework::ExecutionContext& ctx) const {
+    auto* in = ctx.Input<framework::Tensor>("X");
+    auto* out = ctx.Output<framework::Tensor>("Out");
+    out->mutable_data<T>(in->place());
+
+    int num_bits = ctx.Attr<int>("num_bits");
+    T scale = static_cast<T>(ctx.Attr<float>("scale"));
+    int range = std::pow(2, num_bits) - 1;
+
+    auto eigen_out = framework::EigenVector<T>::Flatten(*out);
+    auto eigen_in = framework::EigenVector<T>::Flatten(*in);
+    auto& dev = *ctx.template device_context<DeviceContext>().eigen_device();
+    eigen_out.device(dev) = (scale / range) * eigen_in;
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/fc_op.cc b/paddle/fluid/operators/fc_op.cc
index 45e4d5b2b863a55ae0aa0414ff8697141fd2aa6f..a9ae1396db8d7dab0364779e506d5c0a3e2ff6ed 100644
--- a/paddle/fluid/operators/fc_op.cc
+++ b/paddle/fluid/operators/fc_op.cc
@@ -43,7 +43,7 @@ void FCOp::InferShape(framework::InferShapeContext* ctx) const {
 framework::OpKernelType FCOp::GetExpectedKernelType(
     const framework::ExecutionContext& ctx) const {
   framework::LibraryType library{framework::LibraryType::kMKLDNN};
-  framework::DataLayout layout{framework::DataLayout::kAnyLayout};
+  framework::DataLayout layout{framework::DataLayout::kMKLDNN};
 
   return framework::OpKernelType(
       framework::ToDataType(ctx.Input<Tensor>("Input")->type()), ctx.GetPlace(),
@@ -65,15 +65,14 @@ void FCOpGrad::InferShape(framework::InferShapeContext* ctx) const {
 framework::OpKernelType FCOpGrad::GetExpectedKernelType(
     const framework::ExecutionContext& ctx) const {
   framework::LibraryType library{framework::LibraryType::kMKLDNN};
-  framework::DataLayout layout{framework::DataLayout::kAnyLayout};
+  framework::DataLayout layout{framework::DataLayout::kMKLDNN};
 
   return framework::OpKernelType(
       framework::ToDataType(ctx.Input<Tensor>("Input")->type()), ctx.GetPlace(),
       layout, library);
 }
 
-FCOpMaker::FCOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-    : OpProtoAndCheckerMaker(proto, op_checker) {
+void FCOpMaker::Make() {
   AddInput("Input", "(Tensor) The input tensor of fully connected operator. ");
   AddInput("W", "(Tensor), The second input tensor of fc op.");
   AddOutput("Out", "(Tensor) The output tensor of fully connected operator. ");
diff --git a/paddle/fluid/operators/fc_op.h b/paddle/fluid/operators/fc_op.h
index 70fa96440d344397a7427c1338afee85bde923d4..e1b780fc0c401fbf34a9db03aa31137cbc016939 100644
--- a/paddle/fluid/operators/fc_op.h
+++ b/paddle/fluid/operators/fc_op.h
@@ -45,7 +45,7 @@ class FCOpGrad : public framework::OperatorWithKernel {
 
 class FCOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  FCOpMaker(OpProto* proto, OpAttrChecker* op_checker);
+  void Make() override;
 };
 
 }  // namespace operators
diff --git a/paddle/fluid/operators/feed_op.cc b/paddle/fluid/operators/feed_op.cc
index debacf07c360b9aa69000a0d891f04239ed08807..bcb3e63ed7dbc775c1de6c4522f0548ea48a6cf0 100644
--- a/paddle/fluid/operators/feed_op.cc
+++ b/paddle/fluid/operators/feed_op.cc
@@ -66,8 +66,7 @@ class FeedOp : public framework::OperatorBase {
 
 class FeedOpInfoMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  FeedOpInfoMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X", "The input of feed op");
     AddOutput("Out", "The output of feed op");
     AddAttr<int>("col", "(int) The column of feed");
diff --git a/paddle/fluid/operators/fetch_barrier_op.cc b/paddle/fluid/operators/fetch_barrier_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ad67585e485b237f5d4b809af712ce658ef602bb
--- /dev/null
+++ b/paddle/fluid/operators/fetch_barrier_op.cc
@@ -0,0 +1,89 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <future>  // NOLINT
+#include <ostream>
+
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/framework.pb.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_registry.h"
+
+#include "paddle/fluid/operators/detail/grpc_client.h"
+#include "paddle/fluid/operators/detail/rpc_client.h"
+#include "paddle/fluid/platform/profiler.h"
+
+namespace paddle {
+namespace operators {
+
+class FetchBarrierOp : public framework::OperatorBase {
+ public:
+  FetchBarrierOp(const std::string& type,
+                 const framework::VariableNameMap& inputs,
+                 const framework::VariableNameMap& outputs,
+                 const framework::AttributeMap& attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+
+  void RunImpl(const framework::Scope& scope,
+               const platform::Place& place) const override {
+    std::vector<std::string> eps = Attr<std::vector<std::string>>("endpoints");
+
+    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+    auto& ctx = *pool.Get(place);
+    // For profiling
+    platform::RecordEvent record_event(Type(), &ctx);
+
+    detail::RPCClient* rpc_client =
+        detail::RPCClient::GetInstance<detail::GRPCClient>();
+
+    rpc_client->Wait();
+
+    for (auto& ep : eps) {
+      VLOG(3) << "fetch barrier, ep: " << ep;
+      rpc_client->AsyncSendFetchBarrier(ep);
+    }
+    rpc_client->Wait();
+  }
+};
+
+class FetchBarrierOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() {
+    AddComment(R"DOC(
+SendBarrier operator
+
+This operator will send a send barrier signal to list_and_serv op, so that
+the Parameter Server would knew all variables have been sent.
+)DOC");
+
+    AddAttr<std::vector<std::string>>("endpoints",
+                                      "(string vector, default 127.0.0.1:6164)"
+                                      "Server endpoints to send variables to.")
+        .SetDefault({"127.0.0.1:6164"});
+  }
+};
+
+class FetchBarrierOpShapeInference : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext* ctx) const override {}
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(fetch_barrier, ops::FetchBarrierOp,
+                  paddle::framework::EmptyGradOpMaker, ops::FetchBarrierOpMaker,
+                  ops::FetchBarrierOpShapeInference);
diff --git a/paddle/fluid/operators/fetch_op.cc b/paddle/fluid/operators/fetch_op.cc
index 18deec58137676a0b2c8d559e49d0f7a840cd5ba..1640a2a22c69a0e3ab81a2889d6105b2cf4162b7 100644
--- a/paddle/fluid/operators/fetch_op.cc
+++ b/paddle/fluid/operators/fetch_op.cc
@@ -66,8 +66,7 @@ class FetchOp : public framework::OperatorBase {
 
 class FetchOpInfoMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  FetchOpInfoMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X", "The input of fetch op");
     AddOutput("Out", "The output of fetch op");
     AddAttr<int>("col", "(int) The column of fetch");
diff --git a/paddle/fluid/operators/fill_constant_batch_size_like_op.cc b/paddle/fluid/operators/fill_constant_batch_size_like_op.cc
index 72da80baaf9bb3286f09b7ae5fcf24326b391906..1ae78675a0cac8a72aeaef1227b631a41e4a10b2 100644
--- a/paddle/fluid/operators/fill_constant_batch_size_like_op.cc
+++ b/paddle/fluid/operators/fill_constant_batch_size_like_op.cc
@@ -30,9 +30,8 @@ class FillConstantBatchSizeLikeOp : public BatchSizeLikeOp {
 };
 
 class FillConstantBatchSizeLikeOpMaker : public BatchSizeLikeOpMaker {
- public:
-  FillConstantBatchSizeLikeOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : BatchSizeLikeOpMaker(proto, op_checker) {
+ protected:
+  void Apply() override {
     AddAttr<int>("dtype",
                  "(int, default 5 (FP32)) "
                  "Output data type")
diff --git a/paddle/fluid/operators/fill_constant_batch_size_like_op.h b/paddle/fluid/operators/fill_constant_batch_size_like_op.h
index 2a7df149a9f4b03676f172da980c927d7fa5e8a4..63ea60678f80708f5a8340edd22588553b9ec139 100644
--- a/paddle/fluid/operators/fill_constant_batch_size_like_op.h
+++ b/paddle/fluid/operators/fill_constant_batch_size_like_op.h
@@ -24,6 +24,14 @@ class FillConstantBatchSizeLikeOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     auto* out = ctx.Output<framework::Tensor>("Out");
+    auto* in = ctx.Input<framework::LoDTensor>("Input");
+    if (in->lod().size() && ctx.Attr<int>("input_dim_idx") == 0) {
+      // set the correct batch size for the LoDTensor.
+      auto odims = out->dims();
+      int output_dim_idx = ctx.Attr<int>("output_dim_idx");
+      odims[output_dim_idx] = static_cast<int>(in->lod().back().size()) - 1;
+      out->mutable_data<T>(odims, ctx.GetPlace());
+    }
     out->mutable_data<T>(ctx.GetPlace());
     auto value = ctx.Attr<float>("value");
 
diff --git a/paddle/fluid/operators/fill_constant_op.cc b/paddle/fluid/operators/fill_constant_op.cc
index 07e0a80f8d644d4d011f2821785d49ece6cecfb5..130f18dde4f979a6a9925ede9cbf745fcec14d48 100644
--- a/paddle/fluid/operators/fill_constant_op.cc
+++ b/paddle/fluid/operators/fill_constant_op.cc
@@ -59,8 +59,7 @@ class FillConstantOp : public framework::OperatorBase {
 
 class FillConstantOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  FillConstantOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddAttr<int>("dtype",
                  "(int, default 5 (FP32)) "
                  "Output data type")
diff --git a/paddle/fluid/operators/fill_op.cc b/paddle/fluid/operators/fill_op.cc
index ee8a2fc353f86cdabd35459a9195c3aa35f63e31..925dc19061e2196a40411f415eb6e5ad59ab52ff 100644
--- a/paddle/fluid/operators/fill_op.cc
+++ b/paddle/fluid/operators/fill_op.cc
@@ -82,8 +82,7 @@ class FillOp : public framework::OperatorBase {
 
 class FillOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  FillOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddComment(R"DOC(Fill operator
 
 Fill an tensor with `value` and `shape`. The type of the tensor is specify by
diff --git a/paddle/fluid/operators/fill_zeros_like_op.cc b/paddle/fluid/operators/fill_zeros_like_op.cc
index 58c814ba6413626a48310da595a13238994f5ef1..d67bec36b3248be8602da562a88aeb58f5effe39 100644
--- a/paddle/fluid/operators/fill_zeros_like_op.cc
+++ b/paddle/fluid/operators/fill_zeros_like_op.cc
@@ -33,8 +33,7 @@ class FillZerosLikeOp : public framework::OperatorWithKernel {
 
 class FillZerosLikeOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  FillZerosLikeOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X", "The input of fill-zeros-like op.");
     AddOutput("Out", "The variable will be filled up with zeros.");
     AddComment(R"DOC(
diff --git a/paddle/fluid/operators/ftrl_op.cc b/paddle/fluid/operators/ftrl_op.cc
index cbdcce9beb3fafb0775d0b5fc39cb381ad128d0c..70ba25c213046cc934f46be067080d5fdbb42f9e 100644
--- a/paddle/fluid/operators/ftrl_op.cc
+++ b/paddle/fluid/operators/ftrl_op.cc
@@ -64,8 +64,7 @@ class FTRLOp : public framework::OperatorWithKernel {
 
 class FTRLOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  FTRLOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("Param",
              "(Tensor, default Tensor<float>) "
              "Input parameter value that has to be updated.");
diff --git a/paddle/fluid/operators/gather_op.cc b/paddle/fluid/operators/gather_op.cc
index 4c82f5c429038504d9876ee240a705911feb0b7a..aa3e05b83b23569a4dd9c83294916e289f993abc 100644
--- a/paddle/fluid/operators/gather_op.cc
+++ b/paddle/fluid/operators/gather_op.cc
@@ -33,7 +33,6 @@ class GatherOp : public framework::OperatorWithKernel {
     auto index_dims = ctx->GetInputDim("Index");
     PADDLE_ENFORCE(index_dims.size() == 1);
     int batch_size = ctx->GetInputDim("Index")[0];
-    PADDLE_ENFORCE_GE(batch_size, 0, "Batch size must be >0");
     framework::DDim output_dims(ctx->GetInputDim("X"));
     output_dims[0] = batch_size;
     ctx->SetOutputDim("Out", output_dims);
@@ -67,8 +66,7 @@ class GatherGradOp : public framework::OperatorWithKernel {
 
 class GatherOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  GatherOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X", "The source input of gather op");
     AddInput("Index", "The index input of gather op");
     AddOutput("Out", "The output of gather op");
diff --git a/paddle/fluid/operators/gaussian_random_batch_size_like_op.cc b/paddle/fluid/operators/gaussian_random_batch_size_like_op.cc
index 53c706a83e5bfb9e93d485141314e8b652d73593..8050f61d4546f3351645f23ddcc63b2c49f17929 100644
--- a/paddle/fluid/operators/gaussian_random_batch_size_like_op.cc
+++ b/paddle/fluid/operators/gaussian_random_batch_size_like_op.cc
@@ -32,9 +32,8 @@ class GaussianRandomBatchSizeLikeOp : public BatchSizeLikeOp {
 };
 
 class GaussianRandomBatchSizeLikeOpMaker : public BatchSizeLikeOpMaker {
- public:
-  GaussianRandomBatchSizeLikeOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : BatchSizeLikeOpMaker(proto, op_checker) {
+ protected:
+  void Apply() override {
     AddAttr<float>("mean",
                    "(float, default 0.0) "
                    "mean of random tensor.")
diff --git a/paddle/fluid/operators/gaussian_random_op.cc b/paddle/fluid/operators/gaussian_random_op.cc
index 4d197637b3f49f7e63f5b1a5cba212d1bf774f7e..815c1bb50988be49ca9996e368a59344c6583d58 100644
--- a/paddle/fluid/operators/gaussian_random_op.cc
+++ b/paddle/fluid/operators/gaussian_random_op.cc
@@ -70,8 +70,7 @@ class GaussianRandomOp : public framework::OperatorWithKernel {
 
 class GaussianRandomOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  GaussianRandomOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddOutput("Out", "Output matrix of gaussian random op");
 
     AddAttr<std::vector<int>>("shape",
diff --git a/paddle/fluid/operators/gen_nccl_id_op.cc b/paddle/fluid/operators/gen_nccl_id_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..547de4fa49dc16182c424118c0f5705d2396100a
--- /dev/null
+++ b/paddle/fluid/operators/gen_nccl_id_op.cc
@@ -0,0 +1,134 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <nccl.h>
+#include <stdint.h>
+#include <ostream>
+#include <string>
+
+#include "paddle/fluid/framework/executor.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/threadpool.h"
+#include "paddle/fluid/operators/detail/grpc_client.h"
+#include "paddle/fluid/operators/detail/grpc_server.h"
+#include "paddle/fluid/operators/detail/request_handler_impl.h"
+#include "paddle/fluid/platform/nccl_helper.h"
+
+namespace paddle {
+namespace operators {
+
+class GenNCCLIdOp : public framework::OperatorBase {
+ public:
+  GenNCCLIdOp(const std::string& type, const framework::VariableNameMap& inputs,
+              const framework::VariableNameMap& outputs,
+              const framework::AttributeMap& attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+
+  void RunImpl(const framework::Scope& scope,
+               const platform::Place& dev_place) const override {
+    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+    // put nccl id in CPUPlace
+    auto& dev_ctx = *pool.Get(platform::CPUPlace());
+    int trainer_id = Attr<int>("trainer_id");
+    framework::Scope& local_scope = scope.NewScope();
+
+    if (trainer_id == 0) {
+      GenerateAndSend(&local_scope, dev_ctx);
+    } else {
+      GetIdByServer(&local_scope, dev_ctx);
+    }
+  }
+
+ private:
+  void GenerateAndSend(framework::Scope* scope,
+                       const platform::DeviceContext& dev_ctx) const {
+    auto var = scope->FindVar(NCCL_ID_VARNAME);
+    PADDLE_ENFORCE_NOT_NULL(var);
+    auto id = var->GetMutable<ncclUniqueId>();
+    PADDLE_ENFORCE(platform::dynload::ncclGetUniqueId(id));
+
+    std::vector<std::string> endpoint_list =
+        Attr<std::vector<std::string>>("endpoint_list");
+    detail::RPCClient* client =
+        detail::RPCClient::GetInstance<detail::GRPCClient>();
+    for (auto& ep : endpoint_list) {
+      VLOG(3) << "sending nccl id to " << ep;
+      client->AsyncSendVar(ep, dev_ctx, *scope, NCCL_ID_VARNAME);
+    }
+    client->Wait();
+    VLOG(3) << "sending completed...";
+  }
+
+  void GetIdByServer(framework::Scope* scope,
+                     const platform::DeviceContext& dev_ctx) const {
+    std::string endpoint = Attr<std::string>("endpoint");
+    // NOTE: Can not use unique_ptr here because the default
+    // deleter will call GRPC Server's base class's dtor and
+    // that will cause a wired crash.
+    detail::RequestSendHandler rpc_h(true);
+    detail::AsyncGRPCServer rpc_service(endpoint, 1);
+    rpc_service.RegisterRPC(detail::kRequestSend, &rpc_h);
+    rpc_h.SetRPCServer(&rpc_service);
+
+    framework::ProgramDesc empty_program;
+    framework::Executor executor(dev_ctx.GetPlace());
+    rpc_h.SetScope(scope);
+    rpc_h.SetDevCtx(&dev_ctx);
+    rpc_h.SetProgram(&empty_program);
+    rpc_h.SetExecutor(&executor);
+
+    std::thread server_thread(
+        std::bind(&detail::AsyncGRPCServer::StartServer, &rpc_service));
+    rpc_service.SetCond(detail::kRequestSend);
+    VLOG(3) << "start getting nccl id from trainer 0...";
+    rpc_service.WaitBarrier(detail::kRequestSend);
+    VLOG(3) << "got nccl id and stop server...";
+    rpc_service.ShutDown();
+    VLOG(3) << "rpc server stopped";
+    server_thread.join();
+  }
+};
+
+class GenNCCLIdOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddOutput("NCCLID", "Raw variable contains a NCCL UniqueId instaces.");
+    AddComment(R"DOC(
+GenNCCLId operator
+
+For trainer 0: generate a new UniqueId and send it to all the other trainers.
+For trainer 1~n: start a gRPC server to get the UniqueId, once got, stop the server.
+)DOC");
+    AddAttr<std::string>("endpoint",
+                         "(string), e.g. 127.0.0.1:6175 "
+                         "current listen endpoint");
+    AddAttr<std::vector<std::string>>(
+        "endpoint_list",
+        "['trainer1_ip:port', 'trainer2_ip:port', ...] "
+        "list of trainer endpoints start from trainer 1")
+        .SetDefault({});
+    AddAttr<int>("trainer_id",
+                 "(int default 0) "
+                 "The index of the trainer in distributed training.")
+        .SetDefault(0);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(gen_nccl_id, ops::GenNCCLIdOp, ops::GenNCCLIdOpMaker);
diff --git a/paddle/fluid/operators/get_places_op.cc b/paddle/fluid/operators/get_places_op.cc
index 0d7219ac5c624236b85916d5faf6810dbed2198a..eafc364a15fa17cc5107bba737b0b44e712b0bef 100644
--- a/paddle/fluid/operators/get_places_op.cc
+++ b/paddle/fluid/operators/get_places_op.cc
@@ -78,8 +78,7 @@ class GetPlacesOp : public framework::OperatorBase {
 
 class GetPlacesOpProtoMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  GetPlacesOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddOutput("Out", "vector of Place");
     AddAttr<int>("device_count", "device count").SetDefault(0);
     AddAttr<std::string>("device_type", "device type")
diff --git a/paddle/fluid/operators/go_op.cc b/paddle/fluid/operators/go_op.cc
index b8e1556c23a3b7357ed56d1b83c09622559040a4..48f9d967adc90838dc4c7a09bfaf5a5a1ac9c99b 100644
--- a/paddle/fluid/operators/go_op.cc
+++ b/paddle/fluid/operators/go_op.cc
@@ -89,8 +89,7 @@ class GoOp : public framework::OperatorBase {
 
 class GoOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  GoOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput(kX,
              "A set of variables, which are required by operators inside the "
              "block of Go Op.")
diff --git a/paddle/fluid/operators/gru_op.cc b/paddle/fluid/operators/gru_op.cc
index 0a524c914d305661745c5d85cbbee2edb57c97ba..5c746878823b3dcde2573feec00d3d9dac5ceab8 100644
--- a/paddle/fluid/operators/gru_op.cc
+++ b/paddle/fluid/operators/gru_op.cc
@@ -71,8 +71,7 @@ class GRUOp : public framework::OperatorWithKernel {
 
 class GRUOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  GRUOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("Input",
              "(LoDTensor) The first input is a LodTensor, which supports "
              "variable-time length input sequence. The underlying tensor in "
diff --git a/paddle/fluid/operators/gru_unit_op.cc b/paddle/fluid/operators/gru_unit_op.cc
index f8d1d44b5423dd09fe5aad11434911af6f14fe77..82a808b01e99ec33b0ca00a065fb301d3c633b19 100644
--- a/paddle/fluid/operators/gru_unit_op.cc
+++ b/paddle/fluid/operators/gru_unit_op.cc
@@ -71,8 +71,7 @@ class GRUUnitOp : public framework::OperatorWithKernel {
 
 class GRUUnitOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  GRUUnitOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("Input",
              "(Tensor) Matrix with shape [batch_size, frame_size * 3] for the "
              "input.");
diff --git a/paddle/fluid/operators/hinge_loss_op.cc b/paddle/fluid/operators/hinge_loss_op.cc
index 086b5a97dec9a3d5b8f91b802b92d64ca73bf57c..69e7fa4490b892373d85898b13b976a474a6096a 100644
--- a/paddle/fluid/operators/hinge_loss_op.cc
+++ b/paddle/fluid/operators/hinge_loss_op.cc
@@ -46,8 +46,7 @@ class HingeLossOp : public framework::OperatorWithKernel {
 template <typename AttrType>
 class HingeLossOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  HingeLossOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("Logits",
              "The input value (Logits) of Hinge loss op."
              "Logits is a 2-D tensor with shape [batch_size, 1].");
diff --git a/paddle/fluid/operators/huber_loss_op.cc b/paddle/fluid/operators/huber_loss_op.cc
index 74d8e0e2b76adc7a3e69649f277a8c0df6f38056..4ecd8634ff41ff4eba6b5ed1d0fc78068190dce5 100644
--- a/paddle/fluid/operators/huber_loss_op.cc
+++ b/paddle/fluid/operators/huber_loss_op.cc
@@ -45,8 +45,7 @@ class HuberLossOp : public framework::OperatorWithKernel {
 template <typename AttrType>
 class HuberLossOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  HuberLossOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X",
              "The input value of huber loss op."
              "X is a 2-D tensor with shape [batch_size, 1].");
diff --git a/paddle/fluid/operators/im2sequence_op.cc b/paddle/fluid/operators/im2sequence_op.cc
index 8c120eec86601146500721bbb4249bc458190093..0669661d225c664010fce97f0a526b62988b92c5 100644
--- a/paddle/fluid/operators/im2sequence_op.cc
+++ b/paddle/fluid/operators/im2sequence_op.cc
@@ -54,8 +54,7 @@ class Im2SequenceOp : public framework::OperatorWithKernel {
 
 class Im2SequenceOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  Im2SequenceOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X",
              "(Tensor) The input tensor has NCHW format."
              "N: batch size"
diff --git a/paddle/fluid/operators/increment_op.cc b/paddle/fluid/operators/increment_op.cc
index d8c97b27b328b1470bece4a6c1872b5ccc75115e..f0ffc9706689f5afe4546c3483114b38bc2b7872 100644
--- a/paddle/fluid/operators/increment_op.cc
+++ b/paddle/fluid/operators/increment_op.cc
@@ -47,8 +47,7 @@ class IncrementOp : public framework::OperatorWithKernel {
 
 class IncrementOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  IncrementOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X", "(Tensor) The input tensor of increment operator");
     AddOutput("Out", "(Tensor) The output tensor of increment operator.");
     AddAttr<float>("step",
diff --git a/paddle/fluid/operators/is_empty_op.cc b/paddle/fluid/operators/is_empty_op.cc
index 2a7be90dab1cc23ffe5e1c296c37a4bbeacb7d8e..29b73951bbddd9bfd73c932d7801797590de5e8e 100644
--- a/paddle/fluid/operators/is_empty_op.cc
+++ b/paddle/fluid/operators/is_empty_op.cc
@@ -12,46 +12,41 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "paddle/fluid/operators/is_empty_op.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 
 namespace paddle {
 namespace operators {
 
-constexpr char kInput[] = "X";
-constexpr char kOutput[] = "Out";
-
-class IsEmptyOp : public framework::OperatorBase {
+class IsEmptyOp : public framework::OperatorWithKernel {
  public:
-  IsEmptyOp(const std::string &type, const framework::VariableNameMap &inputs,
-            const framework::VariableNameMap &outputs,
-            const framework::AttributeMap &attrs)
-      : OperatorBase(type, inputs, outputs, attrs) {}
+  using framework::OperatorWithKernel::OperatorWithKernel;
 
- private:
-  void RunImpl(const framework::Scope &scope,
-               const platform::Place &place) const override {
-    // get input
-    auto *var = scope.FindVar(Input(kInput));
-    PADDLE_ENFORCE_NOT_NULL(var);
-    auto &tensor = var->Get<framework::LoDTensor>();
-    // get output
-    auto *out = scope.FindVar(Output(kOutput));
-    PADDLE_ENFORCE_NOT_NULL(out);
-    auto *out_tensor = out->GetMutable<framework::LoDTensor>();
+ protected:
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of IsEmptyOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of IsEmptyOp should not be null.");
+    ctx->SetOutputDim("Out", {1});
+  }
 
-    out_tensor->Resize({1});
-    out_tensor->mutable_data<bool>(platform::CPUPlace())[0] =
-        framework::product(tensor.dims()) == 0;
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    framework::OpKernelType kt = framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
+        platform::CPUPlace());
+    return kt;
   }
 };
 
-class IsEmptyOpProtoMaker : public framework::OpProtoAndCheckerMaker {
+class IsEmptyOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  IsEmptyOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput(kInput, "(Tensor) Tensor which is to be checked.");
-    AddOutput(kOutput, "(Tensor) a boolean Tensor that indicate empty or not.");
+  void Make() override {
+    AddInput("X", "(LoDTensor) Tensor which is to be checked.");
+    AddOutput("Out",
+              "(LoDTensor) a boolean Tensor that indicate empty or not.");
     AddComment(R"DOC(
 IsEmpty Operator which checks whether a tensor is empty.
 
@@ -63,5 +58,12 @@ It will just return product(tensor.ddims()) > 0;
 }  // namespace operators
 }  // namespace paddle
 
-REGISTER_OP_WITHOUT_GRADIENT(is_empty, paddle::operators::IsEmptyOp,
-                             paddle::operators::IsEmptyOpProtoMaker);
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(is_empty, ops::IsEmptyOp, ops::IsEmptyOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
+REGISTER_OP_CPU_KERNEL(
+    is_empty, ops::IsEmptyOpKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::IsEmptyOpKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::IsEmptyOpKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::IsEmptyOpKernel<paddle::platform::CPUDeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/is_empty_op.h b/paddle/fluid/operators/is_empty_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..3e3af22fa8d842b6a1e67418446f1a40949e046b
--- /dev/null
+++ b/paddle/fluid/operators/is_empty_op.h
@@ -0,0 +1,37 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename DeviceContext, typename T>
+class IsEmptyOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    // get input
+    auto* input_tensor = context.Input<framework::LoDTensor>("X");
+    // get output
+    auto* output_tensor = context.Output<framework::LoDTensor>("Out");
+
+    output_tensor->mutable_data<bool>(platform::CPUPlace())[0] =
+        framework::product(input_tensor->dims()) == 0;
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/l1_norm_op.cc b/paddle/fluid/operators/l1_norm_op.cc
index 0c143b7c8aed13a202e2597632d17d8bccc8b66d..bc115090acb473ac3175999ca96c5e00c0aeaeae 100644
--- a/paddle/fluid/operators/l1_norm_op.cc
+++ b/paddle/fluid/operators/l1_norm_op.cc
@@ -48,8 +48,7 @@ class L1NormGradOp : public framework::OperatorWithKernel {
 
 class L1NormOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  L1NormOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X", "(Tensor) The input of l1_norm op.");
     AddOutput("Out", "(Scalar) The output of l1_norm op.");
     AddComment(R"DOC(
diff --git a/paddle/fluid/operators/label_smooth_op.cc b/paddle/fluid/operators/label_smooth_op.cc
index a73c626032f3bf6e97ac5974424e76bacb9a0799..da59bd53bce010d0d6ad2ab14acaffb9cc2f99e6 100644
--- a/paddle/fluid/operators/label_smooth_op.cc
+++ b/paddle/fluid/operators/label_smooth_op.cc
@@ -47,8 +47,7 @@ class LabelSmoothOp : public framework::OperatorWithKernel {
 
 class LabelSmoothOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  LabelSmoothOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X",
              "(LoDTensor) The input labels of LabelSmooth operator. This "
              "input can be batched labels in one-hot encoding or output from "
diff --git a/paddle/fluid/operators/layer_norm_op.cc b/paddle/fluid/operators/layer_norm_op.cc
index de1056aef7bfa2f53f8a92b262e7d15aa7c2b75c..ab097d31e9ab5eafa788539170e7e405df697625 100644
--- a/paddle/fluid/operators/layer_norm_op.cc
+++ b/paddle/fluid/operators/layer_norm_op.cc
@@ -61,8 +61,7 @@ class LayerNormOp : public framework::OperatorWithKernel {
 
 class LayerNormOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  LayerNormOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X", "(LoDTensor) The input tensor.");
     AddInput("Scale",
              "(Tensor, optional) Scale is a 1-dimensional tensor of size "
diff --git a/paddle/fluid/operators/linear_chain_crf_op.cc b/paddle/fluid/operators/linear_chain_crf_op.cc
index 2f29e377fdada918f2c9dca8c2d94eb06278320d..e38525cd7f44de020f364ffd16e71a439048347f 100644
--- a/paddle/fluid/operators/linear_chain_crf_op.cc
+++ b/paddle/fluid/operators/linear_chain_crf_op.cc
@@ -19,8 +19,7 @@ namespace operators {
 
 class LinearChainCRFOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  LinearChainCRFOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("Emission",
              "(LoDTensor, default LoDTensor<float>) "
              "A 2-D LoDTensor with shape [N x D], where N is the size of the "
diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc
index 8acbf820250957163397342c645b333f0da0801c..ee7b01a54ca96e7685b96f58ebfd1a454b19a976 100644
--- a/paddle/fluid/operators/listen_and_serv_op.cc
+++ b/paddle/fluid/operators/listen_and_serv_op.cc
@@ -12,22 +12,25 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include <stdio.h>  // for removing the port file
+#include <csignal>
+#include <cstdlib>
 #include <fstream>
-#include <ostream>
 #include <thread>  // NOLINT
 #include <vector>
 
+#include "paddle/fluid/operators/detail/grpc_server.h"
+#include "paddle/fluid/operators/detail/request_handler_impl.h"
 #include "paddle/fluid/operators/listen_and_serv_op.h"
 #include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
 namespace operators {
 
-void RunServer(std::shared_ptr<detail::AsyncGRPCServer> service) {
-  service->RunSyncUpdate();
+void RunServer(std::shared_ptr<detail::RPCServer> service) {
+  service->StartServer();
   VLOG(4) << "RunServer thread end";
 }
-
 static void split(const std::string &str, char sep,
                   std::vector<std::string> *pieces) {
   pieces->clear();
@@ -57,9 +60,8 @@ static void ParallelExecuteBlocks(
         framework::Async([&executor, &prepared, &program, &scope, idx]() {
           int run_block = idx;  // thread local
           try {
-            executor->RunPreparedContext(prepared[run_block].get(), scope,
-                                         false, false);
-          } catch (std::exception &e) {
+            executor->RunPreparedContext(prepared[run_block].get(), scope);
+          } catch (const std::exception &e) {
             LOG(ERROR) << "run sub program error " << e.what();
           }
         }));
@@ -67,41 +69,30 @@ static void ParallelExecuteBlocks(
   for (size_t i = 0; i < fs.size(); ++i) fs[i].wait();
 }
 
-std::atomic_int ListenAndServOp::selected_port_{0};
-
 ListenAndServOp::ListenAndServOp(const std::string &type,
                                  const framework::VariableNameMap &inputs,
                                  const framework::VariableNameMap &outputs,
                                  const framework::AttributeMap &attrs)
     : OperatorBase(type, inputs, outputs, attrs) {}
 
+ListenAndServOp::~ListenAndServOp() { Stop(); }
+
 void ListenAndServOp::Stop() {
-  rpc_service_->Push(LISTEN_TERMINATE_MESSAGE);
+  rpc_service_->ShutDown();
   server_thread_->join();
+  auto file_path = string::Sprintf("/tmp/paddle.%d.port", ::getpid());
+  remove(file_path.c_str());
 }
 
-void ListenAndServOp::SavePort(const std::string &file_path) const {
+void ListenAndServOp::SavePort() const {
   // NOTE: default write file to /tmp/paddle.selected_port
-  selected_port_ = rpc_service_->GetSelectedPort();
-
-  std::ofstream port_file;
-  port_file.open(file_path);
-  port_file << selected_port_.load();
-  port_file.close();
-  VLOG(4) << "selected port written to " << file_path;
-}
-
-void ListenAndServOp::WaitServerReady() {
-  while (selected_port_.load() == 0) {
-  }
+  rpc_service_->SavePort();
 }
 
 void ListenAndServOp::RunSyncLoop(framework::Executor *executor,
                                   framework::ProgramDesc *program,
                                   framework::Scope *recv_scope,
                                   framework::BlockDesc *prefetch_block) const {
-  auto fan_in = Attr<int>("Fanin");
-
   size_t num_blocks = program->Size();
   PADDLE_ENFORCE_GE(num_blocks, 2,
                     "server program should have at least 2 blocks");
@@ -116,49 +107,21 @@ void ListenAndServOp::RunSyncLoop(framework::Executor *executor,
       optimize_prepared.begin(),
       std::shared_ptr<framework::ExecutorPrepareContext>(nullptr));
 
-  bool exit_flag = false;
-  // Record received sparse variables, so that
-  // we could reset those after execute optimize program
-  std::vector<framework::Variable *> sparse_vars;
-  while (!exit_flag) {
+  rpc_service_->ResetBarrierCounter();
+  while (true) {
     // Get from multiple trainers, we don't care about the order in which
     // the gradients arrives, just add suffix 0~n and merge the gradient.
-    rpc_service_->SetCond(0);
-    size_t recv_var_cnt = 0;
-    int batch_barrier = 0;
-    while (batch_barrier != fan_in) {
-      const detail::ReceivedMessage v = rpc_service_->Get();
-      auto recv_var_name = v.first;
-      if (recv_var_name == LISTEN_TERMINATE_MESSAGE) {
-        LOG(INFO) << "received terminate message and exit";
-        exit_flag = true;
-        break;
-      } else if (recv_var_name == BATCH_BARRIER_MESSAGE) {
-        VLOG(3) << "recv batch barrier message";
-        batch_barrier++;
-        continue;
-      } else {
-        VLOG(3) << "received grad: " << recv_var_name;
-        recv_var_cnt++;
-        auto var = v.second->GetVar();
-        if (var == nullptr) {
-          LOG(ERROR) << "Can not find server side var: " << recv_var_name;
-          PADDLE_THROW("Can not find server side var");
-        }
-        if (var->IsType<framework::SelectedRows>()) {
-          sparse_vars.push_back(var);
-        }
-      }
-    }
-    if (exit_flag) {
-      rpc_service_->SetCond(1);
-      rpc_service_->ShutDown();
+    rpc_service_->SetCond(detail::kRequestSend);
+    rpc_service_->WaitBarrier(detail::kRequestSend);
+
+    if (rpc_service_->IsExit()) {
+      LOG(WARNING) << "get exit!rpc_processor break!";
+      rpc_service_->SetCond(detail::kRequestGet);
       break;
     }
 
     // NOTE: if is_gpu_place, CUDA kernels are launched by multiple threads
     // and this will still work.
-
     // The optimize blocks which have the same parent ID would run parallel
     // TODO(Yancey1989): need to use ParallelExecutor for future
     int32_t last_parent_blkid = program->Block(1).Parent();
@@ -180,67 +143,31 @@ void ListenAndServOp::RunSyncLoop(framework::Executor *executor,
                           recv_scope);
     VLOG(2) << "run all blocks spent " << detail::GetTimestamp() - ts << "(ms)";
 
-    // Reset the received sparse variables, the sum operator would not
-    // sum the input sparse variables which rows is empty at the next
-    // mini-batch.
-    // TODO(Yancey1989): move the reset action into an operator, we couldn't
-    // have any hide logic in the operator.
-    for (auto &var : sparse_vars) {
-      var->GetMutable<framework::SelectedRows>()->mutable_rows()->clear();
-    }
-    rpc_service_->SetCond(1);
-    // FIXME(typhoonzero): use another condition to sync wait clients get.
-    rpc_service_->WaitClientGet(fan_in);
-    sparse_vars.clear();
+    rpc_service_->SetCond(detail::kRequestGet);
+    rpc_service_->WaitBarrier(detail::kRequestGet);
+    rpc_service_->ResetBarrierCounter();
+    rpc_service_->ResetSparseVarsRecorder();
   }  // while(true)
 }
 
-static void AsyncUpdateThread(
-    const std::string &var_name, const bool &exit_flag,
-    const std::shared_ptr<detail::ReceivedQueue> &queue,
-    framework::Executor *executor,
-    framework::ExecutorPrepareContext *prepared) {
-  VLOG(3) << "update thread for " << var_name << " started";
-  while (!exit_flag) {
-    const detail::ReceivedMessage v = queue->Pop();
-    auto recv_var_name = v.first;
-    auto var = v.second->GetVar();
-    if (var == nullptr) {
-      LOG(ERROR) << "Can not find server side var: " << recv_var_name;
-      PADDLE_THROW("Can not find server side var");
-    }
-    auto fs = framework::Async([var_name, &executor, &v, prepared] {
-      try {
-        executor->RunPreparedContext(prepared, v.second->GetMutableLocalScope(),
-                                     false, false);
-      } catch (std::exception &e) {
-        LOG(ERROR) << "run sub program error " << e.what();
-      }
-    });
-    fs.wait();
-  }
-}
-
 void ListenAndServOp::RunAsyncLoop(framework::Executor *executor,
                                    framework::ProgramDesc *program) const {
   VLOG(3) << "RunAsyncLoop in";
   // grad name to block id
   std::unordered_map<std::string, int32_t> grad_to_block_id;
   std::unordered_map<int32_t, std::string> id_to_grad;
-  std::unordered_map<std::string, std::shared_ptr<detail::ReceivedQueue>>
-      grad_to_queue;
 
   auto grad_to_block_id_str =
       Attr<std::vector<std::string>>("grad_to_block_id");
-  for (auto &grad_and_id : grad_to_block_id_str) {
+  for (const auto &grad_and_id : grad_to_block_id_str) {
     std::vector<std::string> pieces;
     split(grad_and_id, ':', &pieces);
     VLOG(3) << "after split, grad = " << pieces[0] << ", id=" << pieces[1];
     PADDLE_ENFORCE_EQ(pieces.size(), 2);
     PADDLE_ENFORCE_EQ(grad_to_block_id.count(pieces[0]), 0);
+
     int block_id = std::stoi(pieces[1]);
     grad_to_block_id[pieces[0]] = block_id;
-    grad_to_queue[pieces[0]] = std::make_shared<detail::ReceivedQueue>();
     id_to_grad[block_id] = pieces[0];
   }
   size_t num_blocks = program->Size();
@@ -259,40 +186,36 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor,
     grad_to_prepared_ctx[id_to_grad[block_list[i]]] = optimize_prepared[i];
   }
 
-  bool exit_flag = false;
-
-  VLOG(3) << "start async optimize threads";
-  std::vector<std::future<void>> fs;
-  for (auto iter = grad_to_queue.begin(); iter != grad_to_queue.end(); iter++) {
-    std::string grad_name = iter->first;
-    VLOG(3) << "create async update thread for " << grad_name;
-    fs.push_back(framework::AsyncIO([grad_name, &exit_flag, &executor,
-                                     &grad_to_queue, &grad_to_prepared_ctx]() {
-      AsyncUpdateThread(grad_name, exit_flag, grad_to_queue[grad_name],
-                        executor, grad_to_prepared_ctx[grad_name].get());
-    }));
-  }
+  request_send_handler_->SetGradToPreparedCtx(&grad_to_prepared_ctx);
+  request_get_handler_->SetGradToPreparedCtx(&grad_to_prepared_ctx);
+  request_prefetch_handler_->SetGradToPreparedCtx(&grad_to_prepared_ctx);
 
   VLOG(3) << "RunAsyncLoop into while";
-  while (!exit_flag) {
-    const detail::ReceivedMessage v = rpc_service_->Get();
-    auto recv_var_name = v.first;
-    if (recv_var_name == LISTEN_TERMINATE_MESSAGE) {
-      LOG(INFO) << "received terminate message and exit";
-      exit_flag = true;
+  while (true) {
+    if (rpc_service_->IsExit()) {
+      LOG(INFO) << "get exit!rpc_processor break!";
       break;
-    } else {
-      VLOG(3) << "received grad: " << recv_var_name;
-      grad_to_queue[recv_var_name]->Push(v);
     }
 
-    if (exit_flag) {
-      rpc_service_->ShutDown();
-      break;
-    }
+    sleep(1);
   }  // while(true)
 }
 
+static void FillRequestCtx(detail::RequestHandler *h, framework::Scope *scope,
+                           platform::DeviceContext *dev_ctx,
+                           framework::Executor *executor,
+                           framework::ProgramDesc *program,
+                           framework::ExecutorPrepareContext *prefetch_ctx,
+                           detail::RPCServer *rpc_server) {
+  h->SetScope(scope);
+  h->SetDevCtx(dev_ctx);
+  h->SetExecutor(executor);
+  h->SetProgram(program);
+  h->SetPrefetchPreparedCtx(
+      std::unique_ptr<framework::ExecutorPrepareContext>(prefetch_ctx));
+  h->SetRPCServer(rpc_server);
+}
+
 void ListenAndServOp::RunImpl(const framework::Scope &scope,
                               const platform::Place &dev_place) const {
   // Mark this as PS that it should decide profiling by listening from trainer.
@@ -302,38 +225,54 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
   framework::Scope &recv_scope = scope.NewScope();
 
   bool sync_mode = Attr<bool>("sync_mode");
+  auto fan_in = Attr<int>("Fanin");
 
   PADDLE_ENFORCE(!rpc_service_);
   std::string endpoint = Attr<std::string>("endpoint");
 
-  rpc_service_.reset(new detail::AsyncGRPCServer(endpoint, sync_mode));
+  LOG(INFO) << "sync_mode:" << sync_mode << ", fan_in:" << fan_in
+            << ", end_point:" << endpoint;
+
+  // request_handler_.reset(new detail::GRPCRequestSendHandler(sync_mode));
+  rpc_service_.reset(new detail::AsyncGRPCServer(endpoint, fan_in));
+  request_send_handler_.reset(new detail::RequestSendHandler(sync_mode));
+  request_get_handler_.reset(new detail::RequestGetHandler(sync_mode));
+  request_prefetch_handler_.reset(
+      new detail::RequestPrefetchHandler(sync_mode));
+
+  rpc_service_->RegisterRPC(detail::kRequestSend, request_send_handler_.get());
+  rpc_service_->RegisterRPC(detail::kRequestGet, request_get_handler_.get());
+  rpc_service_->RegisterRPC(detail::kRequestPrefetch,
+                            request_prefetch_handler_.get());
 
   auto *optimize_block = Attr<framework::BlockDesc *>(kOptimizeBlock);
   auto *prefetch_block = Attr<framework::BlockDesc *>(kPrefetchBlock);
   auto *program = optimize_block->Program();
   framework::Executor executor(dev_place);
 
-  // prepare rpc_service
-  rpc_service_->SetScope(&recv_scope);
-  rpc_service_->SetDevCtx(&dev_ctx);
-  rpc_service_->SetProgram(program);
-  rpc_service_->SetExecutor(&executor);
-
   // prepare for prefetch
   VLOG(3) << "prefetch block id is " << prefetch_block->ID();
   auto prefetch_prepared = executor.Prepare(*program, prefetch_block->ID());
-  rpc_service_->SetPrefetchPreparedCtx(prefetch_prepared.get());
-  prefetch_prepared.release();
+
+  auto f = std::bind(FillRequestCtx, std::placeholders::_1, &recv_scope,
+                     &dev_ctx, &executor, program, prefetch_prepared.release(),
+                     rpc_service_.get());
+
+  f(request_send_handler_.get());
+  f(request_get_handler_.get());
+  f(request_prefetch_handler_.get());
 
   // start the server listening after all member initialized.
   server_thread_.reset(new std::thread(RunServer, rpc_service_));
   VLOG(3) << "wait server thread to become ready...";
   rpc_service_->WaitServerReady();
 
+  // register SIGINT(from ctrl+C) and SIGTERM(from kill) signal handlers
+  signal(SIGINT, SignalHandler::StopAndExit);
+  signal(SIGTERM, SignalHandler::StopAndExit);
+
   // Write to a file of server selected port for python use.
-  std::string file_path = string::Sprintf("/tmp/paddle.%d.selected_port",
-                                          static_cast<int>(::getpid()));
-  SavePort(file_path);
+  SavePort();
   if (sync_mode) {
     RunSyncLoop(&executor, program, &recv_scope, prefetch_block);
   } else {
@@ -343,15 +282,11 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
 
 class ListenAndServOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  ListenAndServOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() {
     AddInput("X", "(Tensor) Variables that server recv.").AsDuplicable();
-    AddComment(R"DOC(
-ListenAndServ operator
-
-This operator will start a RPC server which can receive variables
-from send_op and send back variables to recv_op.
-)DOC");
+    AddComment(R"DOC(" + "ListenAndServ operator" + "\n" + "This operator" +
+" will start a RPC server which can receive variables from send_op and send" +
+"back variables to recv_op.)DOC");
     AddAttr<std::string>("endpoint",
                          "(string, default 127.0.0.1:6164)"
                          "IP address to listen on.")
@@ -372,6 +307,11 @@ from send_op and send back variables to recv_op.
   }
 };
 
+void SignalHandler::StopAndExit(int signal_num) {
+  VLOG(3) << "Catch interrupt signal: " << signal_num << ", program will exit";
+  exit(0);
+}
+
 }  // namespace operators
 }  // namespace paddle
 
diff --git a/paddle/fluid/operators/listen_and_serv_op.h b/paddle/fluid/operators/listen_and_serv_op.h
index f52a55c5c2d6902df6cb7e0a0d7242c6e86dc786..87952cb0e683596b2b0395890b6e25b15f74d7e2 100644
--- a/paddle/fluid/operators/listen_and_serv_op.h
+++ b/paddle/fluid/operators/listen_and_serv_op.h
@@ -16,14 +16,15 @@ limitations under the License. */
 
 #include <stdint.h>
 #include <atomic>
-#include <ostream>
+#include <set>
 #include <string>
 
 #include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/threadpool.h"
-#include "paddle/fluid/operators/detail/grpc_server.h"
+#include "paddle/fluid/operators/detail/request_handler.h"
+#include "paddle/fluid/operators/detail/rpc_server.h"
 
 namespace paddle {
 namespace operators {
@@ -31,7 +32,7 @@ namespace operators {
 constexpr char kOptimizeBlock[] = "OptimizeBlock";
 constexpr char kPrefetchBlock[] = "PrefetchBlock";
 
-void RunServer(std::shared_ptr<detail::AsyncGRPCServer> service);
+void RunServer(std::shared_ptr<detail::RPCServer> service);
 
 class ListenAndServOp : public framework::OperatorBase {
  public:
@@ -40,6 +41,8 @@ class ListenAndServOp : public framework::OperatorBase {
                   const framework::VariableNameMap& outputs,
                   const framework::AttributeMap& attrs);
 
+  virtual ~ListenAndServOp();
+
   void RunSyncLoop(framework::Executor* executor,
                    framework::ProgramDesc* program,
                    framework::Scope* recv_scope,
@@ -48,25 +51,30 @@ class ListenAndServOp : public framework::OperatorBase {
   void RunAsyncLoop(framework::Executor* executor,
                     framework::ProgramDesc* program) const;
 
-  void SavePort(
-      const std::string& file_path = "/tmp/paddle.selected_port") const;
-
-  void WaitServerReady();
+  void SavePort() const;
 
-  int GetSelectedPort() { return selected_port_; }
+  int GetSelectedPort() { return rpc_service_->GetSelectedPort(); }
 
   void Stop() override;
 
   void RunImpl(const framework::Scope& scope,
                const platform::Place& dev_place) const override;
 
-  static void ResetPort() { selected_port_ = 0; }
-
  protected:
-  mutable std::shared_ptr<detail::AsyncGRPCServer> rpc_service_;
+  mutable std::shared_ptr<detail::RPCServer> rpc_service_;
+  mutable std::shared_ptr<detail::RequestHandler> request_send_handler_;
+  mutable std::shared_ptr<detail::RequestHandler> request_get_handler_;
+  mutable std::shared_ptr<detail::RequestHandler> request_prefetch_handler_;
+
   mutable std::shared_ptr<std::thread> server_thread_;
-  // FIXME(wuyi): it's static so that the operator can be cloned.
-  static std::atomic_int selected_port_;
+};
+
+class SignalHandler {
+ public:
+  static void StopAndExit(int signal_num);
+
+ private:
+  DISABLE_COPY_AND_ASSIGN(SignalHandler);
 };
 
 }  // namespace operators
diff --git a/paddle/fluid/operators/load_combine_op.cc b/paddle/fluid/operators/load_combine_op.cc
index e5353144e91455fc71460459e6e799b54f750f71..0522a94195786c767194ec727d982a60451e7c62 100644
--- a/paddle/fluid/operators/load_combine_op.cc
+++ b/paddle/fluid/operators/load_combine_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include <fstream>
-
+#include "paddle/fluid/framework/data_type_transform.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/device_context.h"
 
@@ -31,6 +31,7 @@ class LoadCombineOp : public framework::OperatorBase {
   void RunImpl(const framework::Scope &scope,
                const platform::Place &place) const override {
     auto filename = Attr<std::string>("file_path");
+    auto load_as_fp16 = Attr<bool>("load_as_fp16");
 
     std::ifstream fin(filename);
     PADDLE_ENFORCE(static_cast<bool>(fin),
@@ -59,17 +60,25 @@ class LoadCombineOp : public framework::OperatorBase {
       // Get data from fin to tensor
       DeserializeFromStream(fin, tensor, dev_ctx);
 
-      if (platform::is_gpu_place(place)) {
-        // copy CPU to GPU
-        framework::LoDTensor cpu_tensor;
-        cpu_tensor.ShareDataWith(*tensor);
-        cpu_tensor.set_lod(tensor->lod());
-
-        // reset tensor
+      auto in_dtype = framework::ToDataType(tensor->type());
+      auto out_dtype =
+          load_as_fp16 ? framework::proto::VarType::FP16 : in_dtype;
+
+      if (in_dtype != out_dtype) {
+        // convert to float16 tensor
+        auto in_kernel_type = framework::OpKernelType(in_dtype, place);
+        auto out_kernel_type = framework::OpKernelType(out_dtype, place);
+        framework::LoDTensor fp16_tensor;
+        // copy LoD info to the new tensor
+        fp16_tensor.set_lod(tensor->lod());
+        framework::TransDataType(in_kernel_type, out_kernel_type, *tensor,
+                                 &fp16_tensor);
+
+        // reset output tensor
         out_var->Clear();
         tensor = out_var->GetMutable<framework::LoDTensor>();
-        tensor->set_lod(cpu_tensor.lod());
-        TensorCopy(cpu_tensor, place, dev_ctx, tensor);
+        tensor->set_lod(fp16_tensor.lod());
+        tensor->ShareDataWith(fp16_tensor);
       }
     }
   }
@@ -77,12 +86,18 @@ class LoadCombineOp : public framework::OperatorBase {
 
 class LoadCombineOpProtoMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  LoadCombineOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddOutput(
         "Out",
         "(vector) The output LoDTensors that will be read from the input file.")
         .AsDuplicable();
+    AddAttr<bool>(
+        "load_as_fp16",
+        "(boolean, default false)"
+        "If true, the tensor will be first loaded and then "
+        "converted to float16 data type. Otherwise, the tensor will be "
+        "directly loaded without data type conversion.")
+        .SetDefault(false);
     AddAttr<std::string>("file_path",
                          "(string) "
                          "LoDTensors will be loaded from \"file_path\".")
diff --git a/paddle/fluid/operators/load_op.cc b/paddle/fluid/operators/load_op.cc
index 6ffe0bec5e38432676ecadfa1abbbe70a1425bb1..93f45cff8a26201b1fbb1c44141e125a67c44037 100644
--- a/paddle/fluid/operators/load_op.cc
+++ b/paddle/fluid/operators/load_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include <fstream>
 
+#include "paddle/fluid/framework/data_type_transform.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/profiler.h"
@@ -47,26 +48,40 @@ class LoadOp : public framework::OperatorBase {
 
     DeserializeFromStream(fin, tensor, *dev_ctx);
 
-    if (platform::is_gpu_place(place)) {
-      // copy CPU to GPU
-      framework::LoDTensor cpu_tensor;
-      cpu_tensor.ShareDataWith(*tensor);
-      cpu_tensor.set_lod(tensor->lod());
-
-      // reset tensor
+    auto load_as_fp16 = Attr<bool>("load_as_fp16");
+    auto in_dtype = framework::ToDataType(tensor->type());
+    auto out_dtype = load_as_fp16 ? framework::proto::VarType::FP16 : in_dtype;
+
+    if (in_dtype != out_dtype) {
+      // convert to float16 tensor
+      auto in_kernel_type = framework::OpKernelType(in_dtype, place);
+      auto out_kernel_type = framework::OpKernelType(out_dtype, place);
+      framework::LoDTensor fp16_tensor;
+      // copy LoD info to the new tensor
+      fp16_tensor.set_lod(tensor->lod());
+      framework::TransDataType(in_kernel_type, out_kernel_type, *tensor,
+                               &fp16_tensor);
+
+      // reset output tensor
       out_var->Clear();
       tensor = out_var->GetMutable<framework::LoDTensor>();
-      tensor->set_lod(cpu_tensor.lod());
-      TensorCopy(cpu_tensor, place, *dev_ctx, tensor);
+      tensor->set_lod(fp16_tensor.lod());
+      tensor->ShareDataWith(fp16_tensor);
     }
   }
 };
 
 class LoadOpProtoMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  LoadOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddOutput("Out", "(Tensor) The tensor need to be loaded");
+    AddAttr<bool>(
+        "load_as_fp16",
+        "(boolean, default false)"
+        "If true, the tensor will be first loaded and then "
+        "converted to float16 data type. Otherwise, the tensor will be "
+        "directly loaded without data type conversion.")
+        .SetDefault(false);
     AddAttr<std::string>("file_path",
                          "(string) "
                          "Variable will be loaded from \"file_path\".")
diff --git a/paddle/fluid/operators/lod_array_length_op.cc b/paddle/fluid/operators/lod_array_length_op.cc
index e6212405770093455ec89bde9dc0a092b956fc83..e4551b8ba681fe92ac5f21bb0b509f43439f6b66 100644
--- a/paddle/fluid/operators/lod_array_length_op.cc
+++ b/paddle/fluid/operators/lod_array_length_op.cc
@@ -40,8 +40,7 @@ class LoDArrayLengthOp : public framework::OperatorBase {
 
 class LoDArrayLengthProtoMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  LoDArrayLengthProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X", "(LoDTensorArray) The input tensor array.");
     AddOutput("Out", "(Tensor) 1x1 CPU Tensor of length, int64_t");
     AddComment(R"DOC(
diff --git a/paddle/fluid/operators/lod_rank_table_op.cc b/paddle/fluid/operators/lod_rank_table_op.cc
index 590b44e14f518c3c60c141c9a0dfe7f2b96f69c6..166952fe23192799443ef9c9d1f7ba5056d19290 100644
--- a/paddle/fluid/operators/lod_rank_table_op.cc
+++ b/paddle/fluid/operators/lod_rank_table_op.cc
@@ -38,8 +38,7 @@ class LoDRankTableOp : public framework::OperatorBase {
 
 class LoDRankTableOpProtoMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  LoDRankTableOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X",
              "(LoDTensor) input lod tensor, must contain lod information.");
     AddOutput("Out", "(LoDRankTable) The rank table of specific level.");
diff --git a/paddle/fluid/operators/lod_reset_op.cc b/paddle/fluid/operators/lod_reset_op.cc
index 92ebfc274b84f738f5bd688a9a6d9f437b6318aa..0d4e84e85083399e3803d0648dc7a10aa276d536 100644
--- a/paddle/fluid/operators/lod_reset_op.cc
+++ b/paddle/fluid/operators/lod_reset_op.cc
@@ -47,8 +47,7 @@ class LoDResetOp : public framework::OperatorWithKernel {
 
 class LoDResetOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  LoDResetOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X",
              "(Tensor, LoDTensor) Input variable of LoDResetOp which "
              "could be a Tensor or LoDTensor, where the data of output "
diff --git a/paddle/fluid/operators/lod_tensor_to_array_op.cc b/paddle/fluid/operators/lod_tensor_to_array_op.cc
index 543495ce4e66c0955c9ce1b0db480088069b36db..00ba5ce8ee5e4084c8af204cfc37fe80c437f0d7 100644
--- a/paddle/fluid/operators/lod_tensor_to_array_op.cc
+++ b/paddle/fluid/operators/lod_tensor_to_array_op.cc
@@ -105,8 +105,7 @@ class LoDTensorToArrayOp : public framework::OperatorBase {
 
 class LoDTensorToArrayOpProtoMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  LoDTensorToArrayOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X", "");
     AddInput("RankTable", "");
     AddOutput("Out", "");
diff --git a/paddle/fluid/operators/log_loss_op.cc b/paddle/fluid/operators/log_loss_op.cc
index a8258a1afd70574c174abe8d5630ade5d4ac3de6..9d248e03218b83a65b9786cb317aafbe3dbb67ee 100644
--- a/paddle/fluid/operators/log_loss_op.cc
+++ b/paddle/fluid/operators/log_loss_op.cc
@@ -46,8 +46,7 @@ class LogLossOp : public framework::OperatorWithKernel {
 template <typename AttrType>
 class LogLossOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  LogLossOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("Predicted",
              "The input value (Predicted) of Log loss op."
              "Predicted is a 2-D tensor with shape [batch_size, 1].");
diff --git a/paddle/fluid/operators/logical_op.cc b/paddle/fluid/operators/logical_op.cc
index 41aa00ee8ac10e0776c066fc3c37f97b0dd40cc3..db109f5cd053d84718ac85bd4693ecece12ce172 100644
--- a/paddle/fluid/operators/logical_op.cc
+++ b/paddle/fluid/operators/logical_op.cc
@@ -21,8 +21,7 @@ namespace operators {
 template <typename OpComment>
 class BinaryLogicalOpProtoMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  BinaryLogicalOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     OpComment comment;
     AddInput("X",
              string::Sprintf("(LoDTensor) Left hand operand of %s operator",
@@ -45,8 +44,7 @@ Each element of Out is calculated by %s
 template <typename OpComment>
 class UnaryLogicalOpProtoMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  UnaryLogicalOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     OpComment comment;
     AddInput("X", string::Sprintf("(LoDTensor) Operand of %s operator",
                                   comment.type));
diff --git a/paddle/fluid/operators/lookup_sparse_table_op.cc b/paddle/fluid/operators/lookup_sparse_table_op.cc
index 66b626ed792ddec9d57fcf6c81655dffcc23ca99..2ce11e712fb1a8aa9748313ec7cf4e895a931465 100644
--- a/paddle/fluid/operators/lookup_sparse_table_op.cc
+++ b/paddle/fluid/operators/lookup_sparse_table_op.cc
@@ -105,8 +105,7 @@ class LookupSparseTableOp : public framework::OperatorBase {
 
 class LookupSparseTableOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  LookupSparseTableOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("W",
              "(SelectedRows) The input represents embedding table, "
              "which is a learnable parameter.");
@@ -128,7 +127,7 @@ class LookupSparseTableOpMaker : public framework::OpProtoAndCheckerMaker {
         .SetDefault(-1.0f);
     AddAttr<float>("max",
                    "(float, default 1.0) "
-                   "Maximun value of uniform random")
+                   "Maximum value of uniform random")
         .SetDefault(1.0f);
     AddAttr<int>("seed",
                  "(int, default 0) "
diff --git a/paddle/fluid/operators/lookup_table_op.cc b/paddle/fluid/operators/lookup_table_op.cc
index 5e59bd1b178ad1803f6f70c5f3f9fd7af495ac3c..bda499432214b8841c8dfc406ee45ca0367920e7 100644
--- a/paddle/fluid/operators/lookup_table_op.cc
+++ b/paddle/fluid/operators/lookup_table_op.cc
@@ -58,8 +58,7 @@ class LookupTableOp : public framework::OperatorWithKernel {
 
 class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  LookupTableOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("W",
              "(Tensor) The input represents embedding tensors, "
              "which is a learnable parameter.");
diff --git a/paddle/fluid/operators/lrn_op.cc b/paddle/fluid/operators/lrn_op.cc
index f5c0e47fda913b4635833c31496644b60a0a8504..52b459a6a2e56b7c256efdb535b4652c64bae23c 100644
--- a/paddle/fluid/operators/lrn_op.cc
+++ b/paddle/fluid/operators/lrn_op.cc
@@ -124,16 +124,17 @@ namespace {
 framework::OpKernelType GetExpectedLRNKernel(
     const framework::ExecutionContext& ctx) {
   framework::LibraryType library_{framework::LibraryType::kPlain};
+  std::string data_format = ctx.Attr<std::string>("data_format");
+  // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
+  framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
 #ifdef PADDLE_WITH_MKLDNN
   if (library_ == framework::LibraryType::kPlain &&
       platform::CanMKLDNNBeUsed(ctx)) {
     library_ = framework::LibraryType::kMKLDNN;
+    layout_ = framework::DataLayout::kMKLDNN;
   }
 #endif
 
-  std::string data_format = ctx.Attr<std::string>("data_format");
-  // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
-  framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
   return framework::OpKernelType(
       framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace(),
       layout_, library_);
@@ -169,8 +170,7 @@ class LRNOp : public framework::OperatorWithKernel {
 template <typename T>
 class LRNOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  LRNOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X",
              "(Tensor) The input of LRN operator. "
              "It must be a 4D tenor with NCHW format.");
diff --git a/paddle/fluid/operators/lstm_op.cc b/paddle/fluid/operators/lstm_op.cc
index 084ee1cfe602af3622ef2a3f35f2892d5540cec7..4751e3e8025e51a687f8fcfd25e603b61e762f6d 100644
--- a/paddle/fluid/operators/lstm_op.cc
+++ b/paddle/fluid/operators/lstm_op.cc
@@ -103,8 +103,7 @@ class LSTMOp : public framework::OperatorWithKernel {
 
 class LSTMOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  LSTMOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("Input",
              "(LoDTensor) the first input is a LodTensor, which support "
              "variable-time length input sequence. The underlying tensor in "
diff --git a/paddle/fluid/operators/lstm_unit_op.cc b/paddle/fluid/operators/lstm_unit_op.cc
index e1157ef6c640be17e7f48abe1ab972cf88504526..0895c58f5f58afd444000ebeac7a92e3eb7778d3 100644
--- a/paddle/fluid/operators/lstm_unit_op.cc
+++ b/paddle/fluid/operators/lstm_unit_op.cc
@@ -48,8 +48,7 @@ class LstmUnitOp : public framework::OperatorWithKernel {
 
 class LstmUnitOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  LstmUnitOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X",
              "Lstm unit only applies non-linear activations, please make sure"
              "that linear tranformation has already been applied to `X`. "
diff --git a/paddle/fluid/operators/lstmp_op.cc b/paddle/fluid/operators/lstmp_op.cc
index f9261323f0f50c78b3b4b66a9fa8abcdf5ba27e9..e398b51480f6fc0c6c568770b3b2a9746360744e 100644
--- a/paddle/fluid/operators/lstmp_op.cc
+++ b/paddle/fluid/operators/lstmp_op.cc
@@ -120,8 +120,7 @@ class LSTMPOp : public framework::OperatorWithKernel {
 
 class LSTMPOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  LSTMPOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("Input",
              "(LoDTensor) the input for sequence data, which supports "
              "variable-time length input sequence. The underlying tensor in "
diff --git a/paddle/fluid/operators/margin_rank_loss_op.cc b/paddle/fluid/operators/margin_rank_loss_op.cc
index 0b41a3e1ffdb32d248bb55651aba242336307e74..b643ba9d7fa61d758e871ebe7a463c22e937fa2c 100644
--- a/paddle/fluid/operators/margin_rank_loss_op.cc
+++ b/paddle/fluid/operators/margin_rank_loss_op.cc
@@ -42,8 +42,7 @@ class MarginRankLossOp : public framework::OperatorWithKernel {
 template <typename T>
 class MarginRankLossOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  MarginRankLossOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X1",
              "(2-D tensor with shape [batch_size x 1]) The score for "
              "one item X1 to be ranked, from pairwise ranking model.");
diff --git a/paddle/fluid/operators/math/blas.cc b/paddle/fluid/operators/math/blas.cc
index 3eeb77546b97a0337b46216d837a4f4cff12c89f..6a143b3c056455595fdedc131b0c5f4ee756e1e0 100644
--- a/paddle/fluid/operators/math/blas.cc
+++ b/paddle/fluid/operators/math/blas.cc
@@ -13,10 +13,40 @@
 // limitations under the License.
 
 #include "paddle/fluid/operators/math/blas.h"
+
+#include <utility>
 namespace paddle {
 namespace operators {
 namespace math {
-// Do nothing. Blas is a header only library.
+MatDescriptor CreateMatrixDescriptor(const framework::DDim &tensor_dim,
+                                     int num_flatten_cols, bool trans) {
+  PADDLE_ENFORCE_GT(tensor_dim.size(), 1);
+  MatDescriptor retv;
+  if (num_flatten_cols > 1) {
+    auto flatten_dim = framework::flatten_to_2d(tensor_dim, num_flatten_cols);
+    retv.height_ = flatten_dim[0];
+    retv.width_ = flatten_dim[1];
+  } else {
+    if (tensor_dim.size() == 2) {
+      retv.height_ = tensor_dim[0];
+      retv.width_ = tensor_dim[1];
+    } else {
+      auto dim_vec = framework::vectorize(tensor_dim);
+      retv.batch_size_ = 1;
+      for (size_t i = 0; i < dim_vec.size() - 2; ++i) {
+        retv.batch_size_ *= dim_vec[i];
+      }
+      retv.height_ = dim_vec[dim_vec.size() - 2];
+      retv.width_ = dim_vec[dim_vec.size() - 1];
+      retv.stride_ = retv.height_ * retv.width_;
+    }
+  }
+  if (trans) {
+    std::swap(retv.width_, retv.height_);
+  }
+  retv.trans_ = trans;
+  return retv;
+}
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/math/blas.h b/paddle/fluid/operators/math/blas.h
index 5cd2f855d1135e6dd8343efdaa9855d2526a3520..1a37cb39d56066b8380338b9710a441e41518c39 100644
--- a/paddle/fluid/operators/math/blas.h
+++ b/paddle/fluid/operators/math/blas.h
@@ -46,6 +46,50 @@ namespace paddle {
 namespace operators {
 namespace math {
 
+/**
+ * Matrix Descriptor of a memory buffer.
+ *
+ * It is used for Blas::MatMul. MatMul operator can be batched.
+ * if Mat A is [BatchSize, H, W], Mat B is [BatchSize, H, W]. It will be a
+ * `batch_size` times of GEMM. The batched GEMM could be faster base on the
+ * implementation of the blas library. The batch size could be zero. If any
+ * matrix of `matmul` has a batch size, the will be a batched GEMM, too. e.g.,
+ * Mat A is [BatchSize, H1, W2], and Mat B [H2, W2], The result matrix wil be
+ * [BatchSize, H1, W2]
+ *
+ * The boolean flag, `trans`, describe the memory is the transpose of matrix or
+ * not. If the trans is true, the last two dims of matrix are transposed. The
+ * memory layout of the matrix is [Width, Height] or [BatchSize, Width, Height].
+ *
+ * The MatDescriptor is not only the dimension or shape of a matrix, it also
+ * contains the layout, stride of matrix. It is clearer to have a structure than
+ * reuse `DDim`.
+ */
+struct MatDescriptor {
+  int64_t height_;
+  int64_t width_;
+  int64_t stride_{0};
+  int64_t batch_size_{0};
+  bool trans_;
+};
+
+/**
+ * Create Matrix Descriptor from a tensor dim, num_flatten_cols, and transpose
+ * flag
+ *
+ * @param tensor_dim: The dimension of the tensor. The rank of this dimension
+ * must larger than 1.
+ *
+ * @param num_flatten_cols:  Reshape a tensor to a matrix. The matrix's first
+ * dimension(column length) will be the product of tensor's first `num_col_dims`
+ * dimensions. If num_flatten_cols is zero, the first N-2 dimension will be the
+ * batch_size of descriptor.
+ *
+ * @param trans: True if the matrix is transposed.
+ */
+extern MatDescriptor CreateMatrixDescriptor(const framework::DDim& tensor_dim,
+                                            int num_flatten_cols, bool trans);
+
 template <typename DeviceContext>
 class Blas {
  public:
@@ -81,6 +125,12 @@ class Blas {
   template <typename T>
   void AXPY(int n, T alpha, const T* x, T* y) const;
 
+  template <typename T>
+  void VADD(int n, const T* x, const T* y, T* z) const;
+
+  template <typename T>
+  void VCOPY(int n, const T* x, T* y) const;
+
   template <typename T>
   void GEMV(bool trans_a, int M, int N, T alpha, const T* A, const T* B, T beta,
             T* C) const;
@@ -90,6 +140,11 @@ class Blas {
                    int K, T alpha, const T* A, const T* B, T beta, T* C,
                    int batchCount, int64_t strideA, int64_t strideB) const;
 
+  template <typename T>
+  void MatMul(const framework::Tensor& mat_a, const MatDescriptor& dim_a,
+              const framework::Tensor& mat_b, const MatDescriptor& dim_b,
+              T alpha, framework::Tensor* mat_out, T beta) const;
+
  private:
   const DeviceContext& context_;
 };
@@ -114,6 +169,16 @@ class BlasT : private Blas<DeviceContext> {
     Base()->template AXPY<T>(args...);
   }
 
+  template <typename... ARGS>
+  void VADD(ARGS... args) const {
+    Base()->template VADD<T>(args...);
+  }
+
+  template <typename... ARGS>
+  void VCOPY(ARGS... args) const {
+    Base()->template VCOPY<T>(args...);
+  }
+
   template <typename... ARGS>
   void GEMV(ARGS... args) const {
     Base()->template GEMV<T>(args...);
diff --git a/paddle/fluid/operators/math/blas_impl.cu.h b/paddle/fluid/operators/math/blas_impl.cu.h
index c76fc17d78cce514b5e35ce8e5ca890d7cec1e98..d84c88cb3bc1a13acb83b3444dbd1bfca3cba503 100644
--- a/paddle/fluid/operators/math/blas_impl.cu.h
+++ b/paddle/fluid/operators/math/blas_impl.cu.h
@@ -96,10 +96,22 @@ struct CUBlas<platform::float16> {
                                        reinterpret_cast<__half *>(C), ldc));
   }
 
-  template <typename... ARGS>
-  static void GEMM_BATCH(ARGS... args) {
+  static void GEMM_BATCH(cublasHandle_t handle, cublasOperation_t transa,
+                         cublasOperation_t transb, int m, int n, int k,
+                         const float16 *alpha, const float16 *A, int lda,
+                         long long int strideA, const float16 *B,  // NOLINT
+                         int ldb, long long int strideB,           // NOLINT
+                         const float16 *beta, float16 *C, int ldc,
+                         long long int strideC,  // NOLINT
+                         int batchCount) {
 #if CUDA_VERSION >= 8000
-    PADDLE_ENFORCE(platform::dynload::cublasHgemmStridedBatched(args...));
+    PADDLE_ENFORCE(platform::dynload::cublasHgemmStridedBatched(
+        handle, transa, transb, m, n, k,
+        reinterpret_cast<const __half *>(alpha),
+        reinterpret_cast<const __half *>(A), lda, strideA,
+        reinterpret_cast<const __half *>(B), ldb, strideB,
+        reinterpret_cast<const __half *>(beta), reinterpret_cast<__half *>(C),
+        ldc, strideC, batchCount));
 #else
     PADDLE_THROW("HgemmStridedBatched is not supported on cuda <= 7.5");
 #endif
diff --git a/paddle/fluid/operators/math/blas_impl.h b/paddle/fluid/operators/math/blas_impl.h
index 7360cc0a90da499c372c6fb3f8d40a26f9093dd8..ae20406bc21d5e08359be8295cd98495dda7813b 100644
--- a/paddle/fluid/operators/math/blas_impl.h
+++ b/paddle/fluid/operators/math/blas_impl.h
@@ -34,6 +34,18 @@ struct CBlas<float> {
     cblas_saxpy(args...);
   }
 
+#ifdef PADDLE_WITH_MKLML
+  template <typename... ARGS>
+  static void VADD(ARGS... args) {
+    vsAdd(args...);
+  }
+#endif
+
+  template <typename... ARGS>
+  static void VCOPY(ARGS... args) {
+    cblas_scopy(args...);
+  }
+
   template <typename... ARGS>
   static void GEMV(ARGS... args) {
     cblas_sgemv(args...);
@@ -59,6 +71,18 @@ struct CBlas<double> {
     cblas_daxpy(args...);
   }
 
+#ifdef PADDLE_WITH_MKLML
+  template <typename... ARGS>
+  static void VADD(ARGS... args) {
+    vdAdd(args...);
+  }
+#endif
+
+  template <typename... ARGS>
+  static void VCOPY(ARGS... args) {
+    cblas_dcopy(args...);
+  }
+
   template <typename... ARGS>
   static void GEMV(ARGS... args) {
     cblas_dgemv(args...);
@@ -139,6 +163,24 @@ void Blas<platform::CPUDeviceContext>::AXPY(int n, T alpha, const T *x,
   CBlas<T>::AXPY(n, alpha, x, 1, y, 1);
 }
 
+template <>
+template <typename T>
+void Blas<platform::CPUDeviceContext>::VCOPY(int n, const T *x, T *y) const {
+  CBlas<T>::VCOPY(n, x, 1, y, 1);
+}
+
+template <>
+template <typename T>
+void Blas<platform::CPUDeviceContext>::VADD(int n, const T *x, const T *y,
+                                            T *z) const {
+#ifdef PADDLE_WITH_MKLML
+  CBlas<T>::VADD(n, x, y, z);
+#else
+  this->template VCOPY<T>(n, y, z);
+  this->template AXPY<T>(n, 1., x, z);
+#endif
+}
+
 template <>
 template <typename T>
 void Blas<platform::CPUDeviceContext>::GEMV(bool trans_a, int M, int N, T alpha,
@@ -172,14 +214,39 @@ void Blas<platform::CPUDeviceContext>::BatchedGEMM(
                        c_array.data(), &ldc, 1 /* group_count */, &batchCount);
 #else
   for (int k = 0; k < batchCount; ++k) {
-    const float *Ak = &A[k * strideA];
-    const float *Bk = &B[k * strideB];
-    float *Ck = &C[k * M * N];
+    auto *Ak = &A[k * strideA];
+    auto *Bk = &B[k * strideB];
+    auto *Ck = &C[k * M * N];
     this->template GEMM<T>(transA, transB, M, N, K, alpha, Ak, Bk, beta, Ck);
   }
 #endif
 }
 
+template <typename DeviceContext>
+template <typename T>
+void Blas<DeviceContext>::MatMul(const framework::Tensor &mat_a,
+                                 const MatDescriptor &dim_a,
+                                 const framework::Tensor &mat_b,
+                                 const MatDescriptor &dim_b, T alpha,
+                                 framework::Tensor *mat_out, T beta) const {
+  PADDLE_ENFORCE_EQ(dim_a.width_, dim_b.height_);
+  CBLAS_TRANSPOSE transA = !dim_a.trans_ ? CblasNoTrans : CblasTrans;
+  CBLAS_TRANSPOSE transB = !dim_b.trans_ ? CblasNoTrans : CblasTrans;
+  if (dim_a.batch_size_ == 0 && dim_b.batch_size_ == 0) {
+    this->template GEMM<T>(transA, transB, dim_a.height_, dim_b.width_,
+                           dim_a.width_, alpha, mat_a.data<T>(),
+                           mat_b.data<T>(), beta, mat_out->data<T>());
+  } else {
+    PADDLE_ENFORCE(dim_a.batch_size_ == dim_b.batch_size_ ||
+                   dim_a.batch_size_ == 0 || dim_b.batch_size_ == 0);
+    this->template BatchedGEMM<T>(
+        transA, transB, dim_a.height_, dim_b.width_, dim_a.width_, alpha,
+        mat_a.data<T>(), mat_b.data<T>(), beta, mat_out->data<T>(),
+        dim_a.batch_size_ == 0 ? dim_b.batch_size_ : dim_a.batch_size_,
+        dim_a.stride_, dim_b.stride_);
+  }
+}
+
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/math/cross_entropy.cc b/paddle/fluid/operators/math/cross_entropy.cc
index fc0fca5ad3370633b2f60db65fdb7c01c417dc50..caff35e03ae3a144f799d982c859ded62cb3e93d 100644
--- a/paddle/fluid/operators/math/cross_entropy.cc
+++ b/paddle/fluid/operators/math/cross_entropy.cc
@@ -46,7 +46,10 @@ class CrossEntropyFunctor<platform::CPUDeviceContext, T> {
 
       const int64_t* label_data = labels->data<int64_t>();
       for (int i = 0; i < batch_size; ++i) {
-        int index = i * class_num + label_data[i];
+        int lbl = label_data[i];
+        PADDLE_ENFORCE_GE(lbl, 0);
+        PADDLE_ENFORCE_LT(lbl, class_num);
+        int index = i * class_num + lbl;
         loss_data[i] = -math::TolerableValue<T>()(std::log(prob_data[index]));
       }
     }
diff --git a/paddle/fluid/operators/math/math_function.cc b/paddle/fluid/operators/math/math_function.cc
index d62ea387cc55c7399973b6f35bace491a49666dc..d39154c6f88d6d17c1719eb9a5b048211f4bb52b 100644
--- a/paddle/fluid/operators/math/math_function.cc
+++ b/paddle/fluid/operators/math/math_function.cc
@@ -38,7 +38,9 @@ template struct SetConstant<platform::CPUDeviceContext, bool>;
   template struct Transpose<platform::CPUDeviceContext, double, RANK>;     \
   template struct Transpose<platform::CPUDeviceContext, int, RANK>;        \
   template struct Transpose<platform::CPUDeviceContext, int64_t, RANK>;    \
-  template struct Transpose<platform::CPUDeviceContext, bool, RANK>;
+  template struct Transpose<platform::CPUDeviceContext, bool, RANK>;       \
+  template struct Transpose<platform::CPUDeviceContext, int16_t, RANK>;    \
+  template struct Transpose<platform::CPUDeviceContext, uint8_t, RANK>;
 
 DEFINE_CPU_TRANS(1);
 DEFINE_CPU_TRANS(2);
diff --git a/paddle/fluid/operators/math/math_function.cu b/paddle/fluid/operators/math/math_function.cu
index b5bf84e5178c143de35ec6dcb16b1bde5577c166..d5af718723e8d44da0971ea7756b8c36e771cca2 100644
--- a/paddle/fluid/operators/math/math_function.cu
+++ b/paddle/fluid/operators/math/math_function.cu
@@ -33,9 +33,10 @@ template struct SetConstant<platform::CUDADeviceContext, int>;
 template struct SetConstant<platform::CUDADeviceContext, int64_t>;
 template struct SetConstant<platform::CUDADeviceContext, bool>;
 
-#define DEFINE_GPU_TRANS(RANK)                                         \
-  template struct Transpose<platform::CUDADeviceContext, float, RANK>; \
-  template struct Transpose<platform::CUDADeviceContext, double, RANK>;
+#define DEFINE_GPU_TRANS(RANK)                                          \
+  template struct Transpose<platform::CUDADeviceContext, float, RANK>;  \
+  template struct Transpose<platform::CUDADeviceContext, double, RANK>; \
+  template struct Transpose<platform::CUDADeviceContext, float16, RANK>;
 
 DEFINE_GPU_TRANS(1);
 DEFINE_GPU_TRANS(2);
diff --git a/paddle/fluid/operators/math/matmul.h b/paddle/fluid/operators/math/matmul.h
deleted file mode 100644
index 87fd38a324e007bcc939c31b6ae8e5d38c3e658c..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/matmul.h
+++ /dev/null
@@ -1,149 +0,0 @@
-/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <algorithm>
-#include <vector>
-#include "paddle/fluid/operators/math/blas.h"
-
-namespace paddle {
-namespace operators {
-namespace math {
-
-// Implements the logic of numpy matmul:
-// https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.matmul.html
-//
-// but allowing also for a, b to be transposed
-//
-// Both a & b can be 1- to 3-dimensional. Higher rank tensors are not supported
-// yet.
-template <typename DeviceContext, typename T>
-class MatMulFunctor {
- public:
-  void operator()(const DeviceContext& context, const framework::Tensor& a,
-                  bool trans_a, const framework::Tensor& b, bool trans_b,
-                  T alpha, framework::Tensor* out, T beta) {
-    auto dim_a = a.dims();
-    auto dim_b = b.dims();
-
-    PADDLE_ENFORCE(a.place() == b.place() && b.place() == out->place(),
-                   "Tensors must all be in the same place.");
-    PADDLE_ENFORCE_GE(dim_a.size(), 1,
-                      "Input tensor a must be at least 1-dimensional.");
-    PADDLE_ENFORCE_GE(dim_b.size(), 1,
-                      "Input tensor b must be at least 1-dimensional.");
-
-    std::vector<int64_t> out_dim;
-    int64_t batch_count = 1;
-    if (dim_a.size() > 3) {
-      PADDLE_ENFORCE(dim_b.size() == dim_a.size(),
-                     "The dimensions of X and Y must be the same, and both of "
-                     "them should be %d-dimensional.",
-                     dim_b.size());
-      // The first rank-2 dimensions are accumulated on the batch_count, and the
-      // last two dimensions are used for matrix multiplication.
-      for (int j = 0; j < dim_a.size() - 2; ++j) {
-        PADDLE_ENFORCE_EQ(dim_b[j], dim_a[j],
-                          "The %d-th dimension of X and Y must be the same.",
-                          j);
-        out_dim.push_back(dim_a[j]);
-        batch_count *= dim_a[j];
-      }
-    }
-
-    int M = 0, N = 0, kA = 0, kB = 0, batchCountA = 0, batchCountB = 0,
-        strideA = 0, strideB = 0;
-
-    switch (dim_a.size()) {
-      case 1:
-        // similar to np.matmul:
-        // prepend dimension 1 (no transpose) or append dimension 1 (transpose)
-        M = trans_a ? dim_a[0] : 1;
-        kA = trans_a ? 1 : dim_a[0];
-        break;
-      case 2:
-        M = trans_a ? dim_a[1] : dim_a[0];
-        kA = trans_a ? dim_a[0] : dim_a[1];
-        break;
-      case 3:
-        batchCountA = dim_a[0];
-        M = trans_a ? dim_a[2] : dim_a[1];
-        kA = trans_a ? dim_a[1] : dim_a[2];
-        strideA = M * kA;
-        break;
-      default:
-        batchCountA = batch_count;
-        size_t mat_s = dim_a.size() - 2;
-        M = trans_a ? dim_a[mat_s + 1] : dim_a[mat_s];
-        kA = trans_a ? dim_a[mat_s] : dim_a[mat_s + 1];
-        strideA = M * kA;
-    }
-
-    switch (dim_b.size()) {
-      case 1:
-        // similar to np.matmul:
-        // append dimension 1 (no transpose) or prepend dimension 1 (transpose)
-        kB = trans_b ? 1 : dim_b[0];
-        N = trans_b ? dim_b[0] : 1;
-        break;
-      case 2:
-        kB = trans_b ? dim_b[1] : dim_b[0];
-        N = trans_b ? dim_b[0] : dim_b[1];
-        break;
-      case 3:
-        batchCountB = dim_b[0];
-        kB = trans_b ? dim_b[2] : dim_b[1];
-        N = trans_b ? dim_b[1] : dim_b[2];
-        strideB = kB * N;
-        break;
-      default:
-        batchCountB = batch_count;
-        size_t mat_s = dim_b.size() - 2;
-        kB = trans_b ? dim_b[mat_s + 1] : dim_b[mat_s];
-        N = trans_b ? dim_b[mat_s] : dim_b[mat_s + 1];
-        strideB = kB * N;
-    }
-
-    PADDLE_ENFORCE_EQ(
-        kA, kB,
-        "First matrix's width must be equal with second matrix's height.");
-    if (batchCountA && batchCountB) {
-      PADDLE_ENFORCE_EQ(
-          batchCountA, batchCountB,
-          "When input tensors a and b are both batched, they must have the "
-          "same batch dimension.");
-    }
-    int batchCount = std::max(batchCountA, batchCountB);
-
-    CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans;
-    CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans;
-
-    auto blas = GetBlas<DeviceContext, T>(context);
-
-    if (!batchCount) {
-      // regular matrix multiplication
-      blas.GEMM(transA, transB, M, N, kA, alpha, a.data<T>(), b.data<T>(), beta,
-                out->data<T>());
-    } else {
-      // batched matrix multiplication
-      blas.BatchedGEMM(transA, transB, M, N, kA, alpha, a.data<T>(),
-                       b.data<T>(), beta, out->data<T>(), batchCount, strideA,
-                       strideB);
-    }
-  }
-};
-
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/sequence2batch.h b/paddle/fluid/operators/math/sequence2batch.h
index 0abda999a52bcbb94e6503692bd11aff26e849ba..62e6307ae9f4236a38c49daaf09fc05c54268159 100644
--- a/paddle/fluid/operators/math/sequence2batch.h
+++ b/paddle/fluid/operators/math/sequence2batch.h
@@ -64,18 +64,22 @@ class LoDTensor2BatchFunctor {
                   bool is_reverse = false) const {
     if (!is_cal_batch_lod) {
       auto lods = batch->lod();
-      PADDLE_ENFORCE_GT(lods.size(), 2UL);
-      PADDLE_ENFORCE_EQ(lods[1].size(),
-                        static_cast<size_t>(lod_tensor.dims()[0]));
+      PADDLE_ENFORCE_GT(lods.size(), 2UL,
+                        "The LoD of LoDTensor should inlcude at least 2-level "
+                        "sequence information.");
+      PADDLE_ENFORCE_EQ(
+          lods[1].size(), static_cast<size_t>(lod_tensor.dims()[0]),
+          "The LoD information should be consistent with the dims.");
       CopyMatrixRowsFunctor<DeviceContext, T> to_batch;
       to_batch(context, lod_tensor, lods[1], batch, true);
       return;
     }
 
     auto lods = lod_tensor.lod();
-    auto lod = lods[0];
     PADDLE_ENFORCE_EQ(lods.size(), 1UL, "Only support one level sequence now.");
 
+    auto lod = lods[0];
+
     std::vector<SeqInfo> seq_info;
     for (size_t seq_id = 0; seq_id < lod.size() - 1; ++seq_id) {
       int length = lod[seq_id + 1] - lod[seq_id];
@@ -157,9 +161,12 @@ class Batch2LoDTensorFunctor {
                   const framework::LoDTensor& batch,
                   framework::LoDTensor* lod_tensor) const {
     auto in_lod = batch.lod();
-    PADDLE_ENFORCE_GT(in_lod.size(), 2UL);
-    PADDLE_ENFORCE_EQ(in_lod[1].size(),
-                      static_cast<size_t>(lod_tensor->dims()[0]));
+    PADDLE_ENFORCE_GT(in_lod.size(), 2UL,
+                      "The LoD of LoDTensor should inlcude at least 2-level "
+                      "sequence information.");
+    PADDLE_ENFORCE_EQ(
+        in_lod[1].size(), static_cast<size_t>(lod_tensor->dims()[0]),
+        "The LoD information should be consistent with the dims.");
     CopyMatrixRowsFunctor<DeviceContext, T> to_seq;
     to_seq(context, batch, in_lod[1], lod_tensor, false);
   }
diff --git a/paddle/fluid/operators/matmul_op.cc b/paddle/fluid/operators/matmul_op.cc
index e5d33fbc36438f97ff5b604e4efdbfbfa91fcee4..7182149164854038bb67a9f06cdbec8a4a0f1fb2 100644
--- a/paddle/fluid/operators/matmul_op.cc
+++ b/paddle/fluid/operators/matmul_op.cc
@@ -12,21 +12,264 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/matmul_op.h"
 #include <algorithm>
+#include <utility>
 #include <vector>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/detail/safe_ref.h"
+#include "paddle/fluid/operators/math/blas.h"
 
 namespace paddle {
 namespace operators {
+/**
+ * Get row matrix shape from a vector shape. If the rank of x_dim > 1, the
+ * original x_dim is returned.
+ */
+static framework::DDim RowMatrixFromVector(const framework::DDim &x_dim) {
+  if (x_dim.size() > 1) {
+    return x_dim;
+  }
+  return framework::make_ddim({1, x_dim[0]});
+}
+
+/**
+ * Get column matrix shape from a vector shape. If the ran of y_dim > 1, the
+ * original y_dim is returned.
+ */
+static framework::DDim ColumnMatrixFromVector(const framework::DDim &y_dim) {
+  if (y_dim.size() > 1) {
+    return y_dim;
+  }
+  return framework::make_ddim({y_dim[0], 1});
+}
+
+template <typename DeviceContext, typename T>
+class MatMulKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &context) const override {
+    auto &x =
+        detail::Ref(context.Input<framework::Tensor>("X"), "Cannot find X");
+    auto &y =
+        detail::Ref(context.Input<framework::Tensor>("Y"), "Cannot find Y");
+    auto *out = context.Output<framework::Tensor>("Out");
+    out->mutable_data<T>(context.GetPlace());
+
+    auto blas = math::GetBlas<DeviceContext, T>(context);
+    auto mat_dim_a = math::CreateMatrixDescriptor(
+        RowMatrixFromVector(x.dims()), 0, context.Attr<bool>("transpose_X"));
+    auto mat_dim_b = math::CreateMatrixDescriptor(
+        ColumnMatrixFromVector(y.dims()), 0, context.Attr<bool>("transpose_Y"));
+    blas.MatMul(x, mat_dim_a, y, mat_dim_b, T(1), out, T(0));
+  }
+};
+
+// Reshape a rank-3 tensor from P x M x N to (P * M) x N.
+// Identity op if the tensor is not of rank 3.
+static framework::Tensor FoldInitDims(const framework::Tensor &input) {
+  auto output = input;
+  auto in_dims = input.dims();
+  if (in_dims.size() == 3) {
+    output.Resize({in_dims[0] * in_dims[1], in_dims[2]});
+  }
+  return output;
+}
+
+// Reshape a rank-3 tensor from P x M x N to M x (P * N).
+// (Warning: This requires transposing data and writes into new memory.)
+// Identity op if the tensor is not of rank 3.
+template <typename DeviceContext, typename T>
+static framework::Tensor FoldHeadAndLastDims(const DeviceContext &context,
+                                             const framework::Tensor &input) {
+  auto in_dims = input.dims();
+  if (in_dims.size() != 3) {
+    return input;
+  }
+  framework::Tensor output;
+  output.Resize({in_dims[1], in_dims[0], in_dims[2]});
+  output.mutable_data<T>(context.GetPlace());
+  std::vector<int> axis = {1, 0, 2};
+  math::Transpose<DeviceContext, T, 3> trans;
+  trans(context, input, &output, axis);
+  output.Resize({in_dims[1], in_dims[0] * in_dims[2]});
+
+  return output;
+}
+
+/**
+ * Reshape a tensor to 3-D or 2-D tensor by matrix descriptor.
+ *
+ * The shape would be [BatchSize, H, W] or [H, W].
+ * If transposed, `H,W` will be swapped.
+ */
+static void ReshapeTensorIntoMatrixSequence(
+    framework::Tensor *x, const math::MatDescriptor &descriptor) {
+  int64_t h, w;
+  h = descriptor.height_;
+  w = descriptor.width_;
+  if (descriptor.trans_) {
+    std::swap(w, h);
+  }
+  if (descriptor.batch_size_) {
+    x->Resize({descriptor.batch_size_, h, w});
+  } else {
+    x->Resize({h, w});
+  }
+}
+
+/**
+ * Reshape the x,y,out tensor to 3-D or 2-D tensor by matrix descriptor
+ * Out = matmul(x, y)
+ *
+ * This method will first calculate X,Y matrix sequence, and then calculate
+ * the out shape.
+ *
+ * Assume X = [BatchSize, H1, W1], Y = [BatchSize, H2, W2]
+ * The out = [BatchSize, H1, W2]
+ *
+ * If there is no batch size in `X` and `Y`, the out will be [H1, W2]
+ * If any of `X` and `Y` has batch size BatchSize, the out will have the
+ * BatchSize.
+ */
+static void ReshapeXYOutIntoMatrixSequence(framework::Tensor *x,
+                                           framework::Tensor *y,
+                                           framework::Tensor *out, bool trans_x,
+                                           bool trans_y) {
+  auto x_dim = RowMatrixFromVector(x->dims());
+  auto y_dim = ColumnMatrixFromVector(y->dims());
+  auto mat_dim_x = math::CreateMatrixDescriptor(x_dim, 0, trans_x);
+  auto mat_dim_y = math::CreateMatrixDescriptor(y_dim, 0, trans_y);
+  if (mat_dim_x.batch_size_ == 0 && mat_dim_y.batch_size_ == 0) {
+    out->Resize({mat_dim_x.height_, mat_dim_y.width_});
+  } else {
+    out->Resize({std::max(mat_dim_x.batch_size_, mat_dim_y.batch_size_),
+                 mat_dim_x.height_, mat_dim_y.width_});
+  }
+
+  ReshapeTensorIntoMatrixSequence(x, mat_dim_x);
+  ReshapeTensorIntoMatrixSequence(y, mat_dim_y);
+}
+
+// Using dimensional constraints on matrix multiplication, it is
+// straight-forward to check the following table for when X and Y
+// are both matrices.
+//
+// transpose_X | False    | True     | False    | True
+// transpose_Y | False    | False    | True     | True
+// -----------+----------+----------+----------+-----------
+//        dX = | dOut Y^T | Y dOut^T | dOut Y   | Y^T dOut^T
+//        dY = | X^T dOut | X dOut   | dOut^T X | dOut^T X^T
+//
+// When X is a vector of size K, we treat it instead as a matrix of shape
+// (1, K). Similarly, when Y is a vector of size K, we treat it instead as
+// a matrix of shape (K, 1).
+//
+// When X and Y are both 3-dimensional tensors, then the first dimension
+// the batch dimension can be ignored and the exact same formulas apply
+// as for two matrices.
+//
+// Finally, when, e.g., X is a 3-dimensional tensor but Y is a matrix, we end
+// up with formulas like
+//
+//   dY_{ij} = \sum_{p, m} X_{pmi} dOut_{pmj}
+//
+// To handle this sort of scenario, we reshape X : P x M x K, dOut: P x M x N
+// to X: (P * M) x K, dOut: (P * M) x N.
+template <typename DeviceContext, typename T>
+class MatMulGradKernel : public framework::OpKernel<T> {
+ public:
+  void MatMul(const framework::ExecutionContext &context,
+              const framework::Tensor &a, bool trans_a,
+              const framework::Tensor &b, bool trans_b,
+              framework::Tensor *out) const {
+    out->mutable_data<T>(context.GetPlace());
+    auto blas = math::GetBlas<DeviceContext, T>(context);
+    auto mat_dim_a = math::CreateMatrixDescriptor(a.dims(), 0, trans_a);
+    auto mat_dim_b = math::CreateMatrixDescriptor(b.dims(), 0, trans_b);
+    blas.MatMul(a, mat_dim_a, b, mat_dim_b, T(1), out, T(0));
+  }
+
+  void CalcInputGrad(const framework::ExecutionContext &context,
+                     const framework::Tensor &a, bool trans_a,
+                     bool is_fold_init_dims_a, const framework::Tensor &b,
+                     bool trans_b, bool is_fold_init_dims_b,
+                     framework::Tensor *out) const {
+    if (out == nullptr) return;
+    bool need_combine = (a.dims().size() == 3 || b.dims().size() == 3) &&
+                        out->dims().size() == 2;
+    if (!need_combine) {
+      MatMul(context, a, trans_a, b, trans_b, out);
+    } else {
+      auto &ctx = context.template device_context<DeviceContext>();
+      MatMul(context, is_fold_init_dims_a
+                          ? FoldInitDims(a)
+                          : FoldHeadAndLastDims<DeviceContext, T>(ctx, a),
+             trans_a, is_fold_init_dims_b
+                          ? FoldInitDims(b)
+                          : FoldHeadAndLastDims<DeviceContext, T>(ctx, b),
+             trans_b, out);
+    }
+  }
+
+  void Compute(const framework::ExecutionContext &context) const override {
+    auto x = *context.Input<framework::Tensor>("X");
+    auto y = *context.Input<framework::Tensor>("Y");
+    auto dout =
+        *context.Input<framework::Tensor>(framework::GradVarName("Out"));
+    auto *dx = context.Output<framework::Tensor>(framework::GradVarName("X"));
+    auto *dy = context.Output<framework::Tensor>(framework::GradVarName("Y"));
+    bool transpose_x = context.Attr<bool>("transpose_X");
+    bool transpose_y = context.Attr<bool>("transpose_Y");
+
+    ReshapeXYOutIntoMatrixSequence(&x, &y, &dout, transpose_x, transpose_y);
+    framework::DDim dx_dims;
+    if (dx) {
+      dx_dims = dx->dims();
+      if (dx_dims != x.dims()) {
+        dx->Resize(x.dims());
+      }
+    }
+
+    framework::DDim dy_dims;
+    if (dy) {
+      dy_dims = dy->dims();
+      if (dy_dims != y.dims()) {
+        dy->Resize(y.dims());
+      }
+    }
 
-using framework::Tensor;
+    if (transpose_x && transpose_y) {
+      CalcInputGrad(context, y, true, true, dout, true, false, dx);
+      CalcInputGrad(context, dout, true, true, x, true, false, dy);
+    } else if (transpose_x) {
+      CalcInputGrad(context, y, false, false, dout, true, false, dx);
+      CalcInputGrad(context, x, false, false, dout, false, true, dy);
+    } else if (transpose_y) {
+      CalcInputGrad(context, dout, false, false, y, false, true, dx);
+      CalcInputGrad(context, dout, true, true, x, false, true, dy);
+    } else {
+      CalcInputGrad(context, dout, false, false, y, true, false, dx);
+      CalcInputGrad(context, x, true, true, dout, false, true, dy);
+    }
+
+    if (dx) {
+      if (dx_dims != x.dims()) {
+        dx->Resize(dx_dims);
+      }
+    }
+    if (dy) {
+      if (dy_dims != y.dims()) {
+        dy->Resize(dy_dims);
+      }
+    }
+  }
+};
 
 class MatMulOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
  protected:
-  void InferShape(framework::InferShapeContext* context) const override {
+  void InferShape(framework::InferShapeContext *context) const override {
     PADDLE_ENFORCE(context->HasInput("X"),
                    "Input(X) of MatMulOp should not be null.");
     PADDLE_ENFORCE(context->HasInput("Y"),
@@ -36,121 +279,41 @@ class MatMulOp : public framework::OperatorWithKernel {
 
     auto dim_x = context->GetInputDim("X");
     auto dim_y = context->GetInputDim("Y");
-    bool transpose_x = context->Attrs().Get<bool>("transpose_X");
-    bool transpose_y = context->Attrs().Get<bool>("transpose_Y");
-
-    PADDLE_ENFORCE_GE(dim_x.size(), 1,
-                      "Input tensor X must be at least 1-dimensional.");
-    PADDLE_ENFORCE_GE(dim_y.size(), 1,
-                      "Input tensor Y must be at least 1-dimensional.");
-
-    std::vector<int64_t> out_dim;
-    int64_t batch_count = 1;
-    if (dim_x.size() > 3) {
-      PADDLE_ENFORCE_EQ(
-          dim_y.size(), dim_x.size(),
-          "The dimensions of X and Y must be the same, and both of "
-          "them should be %d-dimensional.",
-          dim_x.size());
-
-      // The first rank-2 dimensions are accumulated on the batch_count, and the
-      // last two dimensions are used for matrix multiplication.
-      for (int j = 0; j < dim_x.size() - 2; ++j) {
-        PADDLE_ENFORCE_EQ(dim_y[j], dim_x[j],
-                          "The %d-th dimension of X and Y must be the same.",
-                          j);
-        out_dim.push_back(dim_x[j]);
-        batch_count *= dim_x[j];
-      }
-    }
 
-    int M = 0, N = 0, KX = 0, KY = 0, batchCountX = 0, batchCountY = 0;
-    bool remove_initial_dim = false, remove_final_dim = false;
-
-    switch (dim_x.size()) {
-      case 1:
-        if (transpose_x) {
-          M = dim_x[0];
-          KX = 1;
-        } else {
-          M = 1;
-          KX = dim_x[0];
-          remove_initial_dim = true;
-        }
-        break;
-      case 2:
-        M = transpose_x ? dim_x[1] : dim_x[0];
-        KX = transpose_x ? dim_x[0] : dim_x[1];
-        break;
-      case 3:
-        batchCountX = dim_x[0];
-        M = transpose_x ? dim_x[2] : dim_x[1];
-        KX = transpose_x ? dim_x[1] : dim_x[2];
-        break;
-      default:
-        batchCountX = batch_count;
-        size_t mat_s = dim_x.size() - 2;
-        M = transpose_x ? dim_x[mat_s + 1] : dim_x[mat_s];
-        KX = transpose_x ? dim_x[mat_s] : dim_x[mat_s + 1];
-        break;
-    }
+    auto mat_dim_x =
+        math::CreateMatrixDescriptor(RowMatrixFromVector(dim_x), 0,
+                                     context->Attrs().Get<bool>("transpose_X"));
+    auto mat_dim_y =
+        math::CreateMatrixDescriptor(ColumnMatrixFromVector(dim_y), 0,
+                                     context->Attrs().Get<bool>("transpose_Y"));
 
-    switch (dim_y.size()) {
-      case 1:
-        if (transpose_y) {
-          N = dim_y[0];
-          KY = 1;
-        } else {
-          N = 1;
-          KY = dim_y[0];
-          remove_final_dim = true;
-        }
-        break;
-      case 2:
-        KY = transpose_y ? dim_y[1] : dim_y[0];
-        N = transpose_y ? dim_y[0] : dim_y[1];
-        break;
-      case 3:
-        batchCountY = dim_y[0];
-        KY = transpose_y ? dim_y[2] : dim_y[1];
-        N = transpose_y ? dim_y[1] : dim_y[2];
-        break;
-      default:
-        batchCountY = batch_count;
-        size_t mat_s = dim_y.size() - 2;
-        KY = transpose_y ? dim_y[mat_s + 1] : dim_y[mat_s];
-        N = transpose_y ? dim_y[mat_s] : dim_y[mat_s + 1];
+    PADDLE_ENFORCE_EQ(mat_dim_x.width_, mat_dim_y.height_);
+    PADDLE_ENFORCE(mat_dim_x.batch_size_ == mat_dim_y.batch_size_ ||
+                   mat_dim_x.batch_size_ == 0 || mat_dim_y.batch_size_ == 0);
+    std::vector<int64_t> dim_out;
+    if (mat_dim_x.batch_size_ != 0) {
+      dim_out = framework::vectorize(dim_x);
+      dim_out[dim_out.size() - 2] = mat_dim_x.height_;
+      dim_out[dim_out.size() - 1] = mat_dim_y.width_;
+    } else if (mat_dim_y.batch_size_ != 0) {
+      dim_out = framework::vectorize(dim_y);
+      dim_out[dim_out.size() - 2] = mat_dim_x.height_;
+      dim_out[dim_out.size() - 1] = mat_dim_y.width_;
+    } else {
+      dim_out = {mat_dim_x.height_, mat_dim_y.width_};
     }
 
-    PADDLE_ENFORCE_EQ(
-        KX, KY,
-        "First matrix's width must be equal with second matrix's height.");
-    if (batchCountX && batchCountY) {
-      PADDLE_ENFORCE_EQ(
-          batchCountX, batchCountY,
-          "When Input(X) and Input(Y) are both three dimensional, they "
-          "must have the same batch dimension.");
+    if (dim_x.size() == 1 && dim_out[dim_out.size() - 2] == 1) {
+      std::swap(dim_out[dim_out.size() - 2], dim_out[dim_out.size() - 1]);
+      dim_out.resize(dim_out.size() - 1);
     }
-    int batchCount = std::max(batchCountX, batchCountY);
 
-    std::vector<int64_t> dim_out;
-    if (batchCount) {
-      if (dim_x.size() > 3) {
-        dim_out.insert(dim_out.begin(), out_dim.begin(), out_dim.end());
-      } else {
-        dim_out.push_back(batchCount);
-      }
+    if (dim_y.size() == 1 && dim_out[dim_out.size() - 1] == 1) {
+      dim_out.resize(dim_out.size() - 1);
     }
-    if (!remove_initial_dim) {
-      dim_out.push_back(M);
-    }
-    if (!remove_final_dim) {
-      dim_out.push_back(N);
-    }
-    if (dim_out.size() == 0) {
-      // We don't support 0-dimensional Tensors (scalars), so instead
-      // treat the output as a Tensor of shape (1, ) in this case.
-      dim_out.push_back(1);
+
+    if (dim_out.empty()) {
+      dim_out = {1};
     }
     context->SetOutputDim("Out", framework::make_ddim(dim_out));
     context->ShareLoD("X", /*->*/ "Out");
@@ -159,8 +322,7 @@ class MatMulOp : public framework::OperatorWithKernel {
 
 class MatMulOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  MatMulOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X", "The first input of MatMul op");
     AddInput("Y", "The second input of MatMul op");
     AddOutput("Out", "The output of MatMul op");
@@ -213,7 +375,7 @@ class MatMulOpGrad : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
  protected:
-  void InferShape(framework::InferShapeContext* context) const override {
+  void InferShape(framework::InferShapeContext *context) const override {
     PADDLE_ENFORCE(context->HasInput("X"), "Input(X) should not be null");
     PADDLE_ENFORCE(context->HasInput("Y"), "Input(Y) should not be null");
     PADDLE_ENFORCE(context->HasInput(framework::GradVarName("Out")),
@@ -233,15 +395,52 @@ class MatMulOpGrad : public framework::OperatorWithKernel {
   }
 };
 
+class MatMulOpGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto *retv = new framework::OpDesc();
+    retv->SetType("matmul_grad");
+    retv->SetInput("X", Input("X"));
+    retv->SetInput("Y", Input("Y"));
+    retv->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+    retv->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    retv->SetOutput(framework::GradVarName("Y"), InputGrad("Y"));
+    retv->SetAttrMap(Attrs());
+    return std::unique_ptr<framework::OpDesc>(retv);
+  }
+};
 }  // namespace operators
 }  // namespace paddle
 
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(matmul, ops::MatMulOp, ops::MatMulOpMaker,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
+                  ops::MatMulOpGradMaker);
 REGISTER_OPERATOR(matmul_grad, ops::MatMulOpGrad);
 REGISTER_OP_CPU_KERNEL(
-    matmul, ops::MatMulKernel<paddle::platform::CPUDeviceContext, float>);
+    matmul, ops::MatMulKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::MatMulKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::MatMulKernel<paddle::platform::CPUDeviceContext,
+                      paddle::platform::float16>);
 REGISTER_OP_CPU_KERNEL(
     matmul_grad,
-    ops::MatMulGradKernel<paddle::platform::CPUDeviceContext, float>);
+    ops::MatMulGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::MatMulGradKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::MatMulGradKernel<paddle::platform::CPUDeviceContext,
+                          paddle::platform::float16>);
+
+#ifdef PADDLE_WITH_CUDA
+REGISTER_OP_CUDA_KERNEL(
+    matmul, ops::MatMulKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::MatMulKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::MatMulKernel<paddle::platform::CUDADeviceContext,
+                      paddle::platform::float16>);
+REGISTER_OP_CUDA_KERNEL(
+    matmul_grad,
+    ops::MatMulGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::MatMulGradKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::MatMulGradKernel<paddle::platform::CUDADeviceContext,
+                          paddle::platform::float16>);
+#endif
diff --git a/paddle/fluid/operators/matmul_op.h b/paddle/fluid/operators/matmul_op.h
deleted file mode 100644
index f2e9cfdcdbf93326ae193776a7d5f6a324373603..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/matmul_op.h
+++ /dev/null
@@ -1,244 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <algorithm>
-#include <functional>
-#include <vector>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/operators/math/matmul.h"
-
-namespace paddle {
-namespace operators {
-namespace matmul_detail {
-
-using Tensor = framework::Tensor;
-using DDim = framework::DDim;
-using framework::make_ddim;
-using framework::vectorize;
-
-template <typename DeviceContext, typename T>
-class MatMulKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    const Tensor& x = *context.Input<Tensor>("X");
-    const Tensor& y = *context.Input<Tensor>("Y");
-    Tensor* out = context.Output<Tensor>("Out");
-    out->mutable_data<T>(context.GetPlace());
-    bool transpose_x = context.Attr<bool>("transpose_X");
-    bool transpose_y = context.Attr<bool>("transpose_Y");
-
-    math::MatMulFunctor<DeviceContext, T>()(
-        context.template device_context<DeviceContext>(), x, transpose_x, y,
-        transpose_y, T(1), out, T(0));
-  }
-};
-
-template <typename T>
-inline Tensor Reshape(const Tensor& input, const DDim& dims) {
-  Tensor output;
-  output.ShareDataWith(input);
-  output.Resize(dims);
-  return output;
-}
-
-// Reshape a rank-3 tensor from P x M x N to (P * M) x N.
-// Identity op if the tensor is not of rank 3.
-template <typename T>
-Tensor CombineBatchAndM(const Tensor& input) {
-  Tensor output;
-  output.ShareDataWith(input);
-  auto in_dims = input.dims();
-  if (in_dims.size() == 3) {
-    std::vector<int64_t> out_dims = {in_dims[0] * in_dims[1], in_dims[2]};
-    output.Resize(make_ddim(out_dims));
-  }
-  return output;
-}
-
-// Reshape a rank-3 tensor from P x M x N to M x (P * N).
-// (Warning: This requires transposing data and writes into new memory.)
-// Identity op if the tensor is not of rank 3.
-template <typename DeviceContext, typename T>
-Tensor CombineBatchAndN(const DeviceContext& context, const Tensor& input) {
-  Tensor output;
-  auto in_dims = input.dims();
-  if (in_dims.size() == 3) {
-    output.Resize({in_dims[1], in_dims[0], in_dims[2]});
-    output.mutable_data<T>(context.GetPlace());
-    std::vector<int> axis = {1, 0, 2};
-    math::Transpose<DeviceContext, T, 3> trans;
-    trans(context, input, &output, axis);
-    std::vector<int64_t> out_dims = {in_dims[1], in_dims[0] * in_dims[2]};
-    output.Resize({in_dims[1], in_dims[0] * in_dims[2]});
-  } else {
-    output.ShareDataWith(input);
-  }
-  return output;
-}
-
-// Using dimensional constraints on matrix multiplication, it is
-// straight-forward to check the following table for when X and Y
-// are both matrices.
-//
-// transpose_X | False    | True     | False    | True
-// transpose_Y | False    | False    | True     | True
-// -----------+----------+----------+----------+-----------
-//        dX = | dOut Y^T | Y dOut^T | dOut Y   | Y^T dOut^T
-//        dY = | X^T dOut | X dOut   | dOut^T X | dOut^T X^T
-//
-// When X is a vector of size K, we treat it instead as a matrix of shape
-// (1, K). Similarly, when Y is a vector of size K, we treat it instead as
-// a matrix of shape (K, 1).
-//
-// When X and Y are both 3-dimensional tensors, then the first dimension
-// the batch dimension can be ignored and the exact same formulas apply
-// as for two matrices.
-//
-// Finally, when, e.g., X is a 3-dimensional tensor but Y is a matrix, we end
-// up with formulas like
-//
-//   dY_{ij} = \sum_{p, m} X_{pmi} dOut_{pmj}
-//
-// To handle this sort of scenario, we reshape X : P x M x K, dOut: P x M x N
-// to X: (P * M) x K, dOut: (P * M) x N.
-template <typename DeviceContext, typename T>
-class MatMulGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    const Tensor& x = *context.Input<Tensor>("X");
-    const Tensor& y = *context.Input<Tensor>("Y");
-    const Tensor& dout = *context.Input<Tensor>(framework::GradVarName("Out"));
-    Tensor* dx = context.Output<Tensor>(framework::GradVarName("X"));
-    Tensor* dy = context.Output<Tensor>(framework::GradVarName("Y"));
-    bool transpose_x = context.Attr<bool>("transpose_X");
-    bool transpose_y = context.Attr<bool>("transpose_Y");
-
-    std::vector<int64_t> x_dims = vectorize(x.dims());
-    std::vector<int64_t> y_dims = vectorize(y.dims());
-
-    // If X is a vector, reshape it to a matrix.
-    if (x_dims.size() == 1) {
-      x_dims.insert(x_dims.begin(), 1);
-    }
-
-    // If Y is a vector, reshape it to a matrix.
-    if (y_dims.size() == 1) {
-      y_dims.push_back(1);
-    }
-
-    int batch_count = 0;
-    // The first rank-2 dimensions are accumulated on the batch_count, and the
-    // last two dimensions are used for matrix multiplication.
-    if (x_dims.size() > 3) {
-      batch_count = accumulate(x_dims.begin(), x_dims.end() - 2, 1,
-                               std::multiplies<int>());
-    }
-    // Fix the dOut dimensions.
-    int M = 0, N = 0, batchCountX = 0, batchCountY = 0;
-
-    switch (x_dims.size()) {
-      case 2:
-        M = transpose_x ? x_dims[1] : x_dims[0];
-        break;
-      case 3:
-        batchCountX = x_dims[0];
-        M = transpose_x ? x_dims[2] : x_dims[1];
-        break;
-      default:
-        batchCountX = batch_count;
-        size_t mat_s = x_dims.size() - 2;
-        M = transpose_x ? x_dims[mat_s + 1] : x_dims[mat_s];
-    }
-
-    switch (y_dims.size()) {
-      case 2:
-        N = transpose_y ? y_dims[0] : y_dims[1];
-        break;
-      case 3:
-        batchCountY = y_dims[0];
-        N = transpose_y ? y_dims[1] : y_dims[2];
-        break;
-      default:
-        batchCountY = batch_count;
-        size_t mat_s = y_dims.size() - 2;
-        N = transpose_y ? y_dims[mat_s] : y_dims[mat_s + 1];
-    }
-    if (batchCountX && batchCountY) {
-      PADDLE_ENFORCE_EQ(
-          batchCountX, batchCountY,
-          "When Input(X) and Input(Y) are both three dimensional, they "
-          "must have the same batch dimension.");
-    }
-    int batchCount = std::max(batchCountX, batchCountY);
-    std::vector<int64_t> dout_dims = {M, N};
-    if (batchCount) {
-      if (x_dims.size() > 3) {
-        dout_dims.insert(dout_dims.begin(), x_dims.begin(), x_dims.end() - 2);
-      } else {
-        dout_dims.insert(dout_dims.begin(), batchCount);
-      }
-    }
-    Tensor X = Reshape<T>(x, make_ddim(x_dims));
-    Tensor Y = Reshape<T>(y, make_ddim(y_dims));
-    Tensor dOut = Reshape<T>(dout, make_ddim(dout_dims));
-
-    auto& dev_ctx = context.template device_context<DeviceContext>();
-    if (dx) {
-      dx->mutable_data<T>(context.GetPlace());
-      const Tensor& dOut_for_dX =
-          (x_dims.size() == 2 && y_dims.size() == 3)
-              ? CombineBatchAndN<DeviceContext, T>(dev_ctx, dOut)
-              : dOut;
-      if (x_dims.size() == 2 && y_dims.size() == 3) {
-        Y = transpose_y ? CombineBatchAndM<T>(Y)
-                        : CombineBatchAndN<DeviceContext, T>(dev_ctx, Y);
-      }
-      if (transpose_x) {
-        math::MatMulFunctor<DeviceContext, T>()(
-            dev_ctx, Y, transpose_y, dOut_for_dX, transpose_x, T(1), dx, T(0));
-      } else {
-        math::MatMulFunctor<DeviceContext, T>()(
-            dev_ctx, dOut_for_dX, transpose_x, Y, !transpose_y, T(1), dx, T(0));
-      }
-    }
-
-    if (dy) {
-      dy->mutable_data<T>(context.GetPlace());
-      const Tensor& dOut_for_dY = (y_dims.size() == 2 && x_dims.size() == 3)
-                                      ? CombineBatchAndM<T>(dOut)
-                                      : dOut;
-      if (y_dims.size() == 2 && x_dims.size() == 3) {
-        X = transpose_x ? CombineBatchAndN<DeviceContext, T>(dev_ctx, X)
-                        : CombineBatchAndM<T>(X);
-        dOut = CombineBatchAndM<T>(dOut);
-      }
-      if (transpose_y) {
-        math::MatMulFunctor<DeviceContext, T>()(
-            dev_ctx, dOut_for_dY, transpose_y, X, transpose_x, T(1), dy, T(0));
-      } else {
-        math::MatMulFunctor<DeviceContext, T>()(
-            dev_ctx, X, !transpose_x, dOut_for_dY, transpose_y, T(1), dy, T(0));
-      }
-    }
-  }
-};
-}  // namespace matmul_detail
-
-using matmul_detail::MatMulKernel;
-using matmul_detail::MatMulGradKernel;
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/max_sequence_len_op.cc b/paddle/fluid/operators/max_sequence_len_op.cc
index 4cd7c89b48a2442ee7a5074abbf0f3dd9ea3bcb4..8e508b68eeab69a4595904dcc3ea0a541d9ab6e6 100644
--- a/paddle/fluid/operators/max_sequence_len_op.cc
+++ b/paddle/fluid/operators/max_sequence_len_op.cc
@@ -41,8 +41,7 @@ class MaxSeqenceLenOp : public framework::OperatorBase {
 
 class MaxSeqenceLenOpProtoMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  MaxSeqenceLenOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("RankTable", "The lod_rank_table.");
     AddOutput("Out", "The max sequence length.");
     AddComment(
diff --git a/paddle/fluid/operators/maxout_op.cc b/paddle/fluid/operators/maxout_op.cc
index e2bcba5a5e15d4d5f10ae4ae64b5262f750137ab..058115cb624627d81b31d0903f7d615d19708c77 100644
--- a/paddle/fluid/operators/maxout_op.cc
+++ b/paddle/fluid/operators/maxout_op.cc
@@ -22,8 +22,7 @@ using framework::Tensor;
 
 class MaxOutOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  MaxOutOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput(
         "X",
         "(Tensor) The input tensor of maxout operator. "
diff --git a/paddle/fluid/operators/mean_op.cc b/paddle/fluid/operators/mean_op.cc
index a134796bfcaa9dea2483ace9f5045e257916daba..74477eb439dc202c3f5f17fdf3e1647bc5c23512 100644
--- a/paddle/fluid/operators/mean_op.cc
+++ b/paddle/fluid/operators/mean_op.cc
@@ -32,8 +32,7 @@ class MeanOp : public framework::OperatorWithKernel {
 
 class MeanOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  MeanOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X", "The input of mean op");
     AddOutput("Out", "The output of mean op");
     AddComment(R"DOC(
diff --git a/paddle/fluid/operators/merge_lod_tensor_op.cc b/paddle/fluid/operators/merge_lod_tensor_op.cc
index 4ebf20cbba69bee09dfddb8e928ddc95665e4731..a16861b3b77fc980ab932b9d88859b38ec36108b 100644
--- a/paddle/fluid/operators/merge_lod_tensor_op.cc
+++ b/paddle/fluid/operators/merge_lod_tensor_op.cc
@@ -121,8 +121,7 @@ class MergeLoDTensorOp : public framework::OperatorBase {
 
 class MergeLoDTensorOpProtoMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  MergeLoDTensorOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X",
              "The input LoDTensor, contains complete lod information to "
              "construct the output");
diff --git a/paddle/fluid/operators/minus_op.cc b/paddle/fluid/operators/minus_op.cc
index a302b24560e680076d62d02b422c6410467deb1d..34571a38a14795a98ac8454cec606077727b5ffa 100644
--- a/paddle/fluid/operators/minus_op.cc
+++ b/paddle/fluid/operators/minus_op.cc
@@ -48,8 +48,7 @@ class MinusOp : public framework::OperatorWithKernel {
 
 class MinusOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  MinusOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X", "The left tensor of minus operator.");
     AddInput("Y", "The right tensor of minus operator.");
     AddOutput("Out", "The output tensor of minus operator.");
diff --git a/paddle/fluid/operators/mkldnn_activation_op.h b/paddle/fluid/operators/mkldnn_activation_op.h
index f26a165b5a59f01f864d62bbf798f4cbffa65371..85664623d7330e9473286d995bec67879510dbd7 100644
--- a/paddle/fluid/operators/mkldnn_activation_op.h
+++ b/paddle/fluid/operators/mkldnn_activation_op.h
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include <string>
+
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/detail/safe_ref.h"
@@ -60,52 +62,5 @@ class MKLDNNActivationGradKernel
   }
 };
 
-namespace {  // NOLINT
-framework::OpKernelType GetKernelType(
-    const framework::ExecutionContext& ctx,
-    const framework::OperatorWithKernel& oper) {
-  framework::LibraryType library{framework::LibraryType::kPlain};
-#ifdef PADDLE_WITH_MKLDNN
-  if (library == framework::LibraryType::kPlain &&
-      platform::CanMKLDNNBeUsed(ctx)) {
-    library = framework::LibraryType::kMKLDNN;
-  }
-#endif
-  framework::DataLayout layout = framework::DataLayout::kAnyLayout;
-  return framework::OpKernelType(
-      framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
-      ctx.GetPlace(), layout, library);
-}
-}  // anonymous namespace
-
-class ActivationWithMKLDNNOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
-    ctx->ShareLoD("X", /*->*/ "Out");
-  }
-
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return GetKernelType(ctx, *this);
-  }
-};
-
-class ActivationWithMKLDNNOpGrad : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("Out"));
-  }
-
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return GetKernelType(ctx, *this);
-  }
-};
-
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/modified_huber_loss_op.cc b/paddle/fluid/operators/modified_huber_loss_op.cc
index 3a0fc74584391d0441105a8ac7d7ac292e10fb8d..35db4c1ad1f6c6481eca397e99fc8c1f0bc7164c 100644
--- a/paddle/fluid/operators/modified_huber_loss_op.cc
+++ b/paddle/fluid/operators/modified_huber_loss_op.cc
@@ -39,8 +39,7 @@ class ModifiedHuberLossOp : public framework::OperatorWithKernel {
 
 class ModifiedHuberLossOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  ModifiedHuberLossOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X",
              "The input tensor of modified huber loss op. "
              "X is 2-D tensor with shape [batch_size, 1].");
diff --git a/paddle/fluid/operators/momentum_op.cc b/paddle/fluid/operators/momentum_op.cc
index f13ec53905aa3d5b55b865c3514f36211c06a549..dcd73e3c3e40f80e07b73944d1f0cc57fea010d3 100644
--- a/paddle/fluid/operators/momentum_op.cc
+++ b/paddle/fluid/operators/momentum_op.cc
@@ -62,8 +62,7 @@ class MomentumOp : public framework::OperatorWithKernel {
 
 class MomentumOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  MomentumOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("Param",
              "(Tensor, default Tensor<float>) "
              "Input parameter that has to be updated");
diff --git a/paddle/fluid/operators/mul_mkldnn_op.cc b/paddle/fluid/operators/mul_mkldnn_op.cc
deleted file mode 100644
index a5f3a98f678a870d30eebfc4cf329de7c93266ee..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/mul_mkldnn_op.cc
+++ /dev/null
@@ -1,197 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-   http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "mkldnn.hpp"
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/operators/mul_op.h"
-#include "paddle/fluid/platform/device_context.h"
-#include "paddle/fluid/platform/mkldnn_helper.h"
-
-namespace paddle {
-namespace operators {
-
-using paddle::framework::Tensor;
-using paddle::platform::MKLDNNDeviceContext;
-
-template <typename Format = mkldnn::memory::format>
-mkldnn::memory::desc type(const std::vector<int>& dims, Format&& f) {
-  return platform::MKLDNNMemDesc(dims, mkldnn::memory::data_type::f32, f);
-}
-
-template <typename T>
-class MulMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
-  void Compute(const paddle::framework::ExecutionContext& ctx) const override {
-    PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
-                   "It must use CPUPlace.");
-
-    auto& dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
-    auto mkldnn_engine = dev_ctx.GetEngine();
-
-    auto input = ctx.Input<Tensor>("X");
-    auto weight = ctx.Input<Tensor>("Y");
-
-    PADDLE_ENFORCE(input->dims().size() & (2 | 4),
-                   "Input must be with 2 or 4 dimensions, i.e. NC or NCHW");
-    PADDLE_ENFORCE(weight->dims().size() & (2 | 4),
-                   "Weights must be with 2 or 4 dimensions, i.e. OI or OIHW");
-
-    std::vector<int> w_tz = paddle::framework::vectorize2int(weight->dims());
-    std::vector<int> src_tz = paddle::framework::vectorize2int(input->dims());
-
-    auto src_md =
-        src_tz.size() != 2
-            ? type(src_tz, mkldnn::memory::format::nchw)
-            : type({src_tz[0], src_tz[1]}, mkldnn::memory::format::nc);
-
-    auto dst_md = type({src_tz[0], w_tz[1]}, mkldnn::memory::format::nc);
-
-    auto weights_md =
-        src_tz.size() != 2
-            ? type({w_tz[1], src_tz[1], src_tz[2], src_tz[3]},
-                   mkldnn::memory::format::oihw)
-            : type({w_tz[1], src_tz[1]}, mkldnn::memory::format::oi);
-
-    auto output = ctx.Output<Tensor>("Out");
-    T* output_data = output->mutable_data<T>(ctx.GetPlace());
-
-    const std::string key = ctx.op().Output("Out");
-    const std::string key_fc_pd = key + "@mul_pd";
-
-    const T* input_data = input->data<T>();
-    const T* w_data = weight->data<T>();
-
-    auto dst_memory = mkldnn::memory({dst_md, mkldnn_engine}, output_data);
-
-    auto src_memory = mkldnn::memory({src_md, mkldnn_engine},
-                                     platform::to_void_cast(input_data));
-
-    auto weights_memory = mkldnn::memory({weights_md, mkldnn_engine},
-                                         platform::to_void_cast(w_data));
-
-    auto pd = platform::MKLDNNFwdPrimitiveDesc<mkldnn::inner_product_forward>(
-        mkldnn_engine, src_md, weights_md, dst_md);
-
-    dev_ctx.SetBlob(key_fc_pd, pd);
-
-    auto forward = mkldnn::inner_product_forward(*pd, src_memory,
-                                                 weights_memory, dst_memory);
-
-    std::vector<mkldnn::primitive> pipeline = {forward};
-    mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
-  }
-};
-
-template <typename T>
-class MulMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
- public:
-  void Compute(const paddle::framework::ExecutionContext& ctx) const override {
-    PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
-                   "It must use CPUPlace.");
-
-    auto& dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
-    auto mkldnn_engine = dev_ctx.GetEngine();
-
-    const Tensor* input = ctx.Input<Tensor>("X");
-    const Tensor* w = ctx.Input<Tensor>("Y");
-
-    const Tensor* out_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
-    Tensor* input_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
-    Tensor* w_grad = ctx.Output<Tensor>(framework::GradVarName("Y"));
-
-    const std::string key = ctx.op().Input("Out");
-    const std::string key_fc_pd = key + "@mul_pd";
-
-    const T* input_data = input->data<T>();
-    const T* w_data = w->data<T>();
-    const T* out_grad_data = out_grad->data<T>();
-    T* input_grad_data = nullptr;
-    T* w_grad_data = nullptr;
-
-    if (input_grad) {
-      input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
-    }
-    if (w_grad) {
-      w_grad_data = w_grad->mutable_data<T>(ctx.GetPlace());
-    }
-
-    std::vector<int> src_tz = paddle::framework::vectorize2int(input->dims());
-    std::vector<int> w_tz = paddle::framework::vectorize2int(w->dims());
-
-    auto src_md =
-        src_tz.size() != 2
-            ? type(src_tz, mkldnn::memory::format::nchw)
-            : type({src_tz[0], src_tz[1]}, mkldnn::memory::format::nc);
-
-    auto dst_md = type({src_tz[0], w_tz[1]}, mkldnn::memory::format::nc);
-
-    auto weights_md =
-        src_tz.size() != 2
-            ? type({w_tz[1], src_tz[1], src_tz[2], src_tz[3]},
-                   mkldnn::memory::format::oihw)
-            : type({w_tz[1], src_tz[1]}, mkldnn::memory::format::oi);
-
-    auto src_memory = mkldnn::memory({src_md, mkldnn_engine},
-                                     platform::to_void_cast(input_data));
-
-    auto dst_memory = mkldnn::memory({dst_md, mkldnn_engine},
-                                     platform::to_void_cast(out_grad_data));
-
-    auto weight_memory = mkldnn::memory({weights_md, mkldnn_engine},
-                                        platform::to_void_cast(w_data));
-
-    auto pd =
-        std::static_pointer_cast<mkldnn::inner_product_forward::primitive_desc>(
-            dev_ctx.GetBlob(key_fc_pd));
-
-    PADDLE_ENFORCE(pd != nullptr, "Fail to find pd in device context");
-
-    if (w_grad) {
-      auto weights_grad_memory = mkldnn::memory(
-          {weights_md, mkldnn_engine}, platform::to_void_cast(w_grad_data));
-
-      auto bwd_weight_pd = platform::MKLDNNBwdPrimitiveDesc<
-          mkldnn::inner_product_backward_weights>(mkldnn_engine, *pd, src_md,
-                                                  weights_md, dst_md);
-
-      auto bwd_weights_prim = mkldnn::inner_product_backward_weights(
-          bwd_weight_pd, src_memory, dst_memory, weights_grad_memory);
-
-      std::vector<mkldnn::primitive> pipeline{bwd_weights_prim};
-      mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
-    }
-
-    if (input_grad) {
-      auto src_grad_memory = mkldnn::memory(
-          {src_md, mkldnn_engine}, platform::to_void_cast(input_grad_data));
-
-      auto bwd_data_pd =
-          platform::MKLDNNBwdPrimitiveDesc<mkldnn::inner_product_backward_data>(
-              mkldnn_engine, *pd, src_md, weights_md, dst_md);
-
-      auto bwd_data_prim = mkldnn::inner_product_backward_data(
-          bwd_data_pd, dst_memory, weight_memory, src_grad_memory);
-
-      std::vector<mkldnn::primitive> pipeline{bwd_data_prim};
-      mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
-    }
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_OP_KERNEL(mul, MKLDNN, ::paddle::platform::CPUPlace,
-                   paddle::operators::MulMKLDNNOpKernel<float>);
-
-REGISTER_OP_KERNEL(mul_grad, MKLDNN, ::paddle::platform::CPUPlace,
-                   paddle::operators::MulMKLDNNGradOpKernel<float>);
diff --git a/paddle/fluid/operators/mul_op.cc b/paddle/fluid/operators/mul_op.cc
index 6903cf83b41a54b54382fac2cf58f7bfe192b55f..51993398bd3427e1f0da155918395bc50fa65e45 100644
--- a/paddle/fluid/operators/mul_op.cc
+++ b/paddle/fluid/operators/mul_op.cc
@@ -16,10 +16,6 @@ limitations under the License. */
 #include <string>
 #include <vector>
 
-#ifdef PADDLE_WITH_MKLDNN
-#include "paddle/fluid/platform/mkldnn_helper.h"
-#endif
-
 namespace paddle {
 namespace operators {
 
@@ -76,28 +72,11 @@ class MulOp : public framework::OperatorWithKernel {
     ctx->SetOutputDim("Out", framework::make_ddim(output_dims));
     ctx->ShareLoD("X", /*->*/ "Out");
   }
-
- private:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    framework::LibraryType library{framework::LibraryType::kPlain};
-#ifdef PADDLE_WITH_MKLDNN
-    if (library == framework::LibraryType::kPlain &&
-        platform::CanMKLDNNBeUsed(ctx)) {
-      library = framework::LibraryType::kMKLDNN;
-    }
-#endif
-    framework::DataLayout layout{framework::DataLayout::kAnyLayout};
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace(),
-        layout, library);
-  }
 };
 
 class MulOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  MulOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X", "(Tensor), The first input tensor of mul op.");
     AddInput("Y", "(Tensor), The second input tensor of mul op.");
     AddOutput("Out", "(Tensor), The output tensor of mul op.");
@@ -121,9 +100,6 @@ class MulOpMaker : public framework::OpProtoAndCheckerMaker {
         )DOC")
         .SetDefault(1)
         .EqualGreaterThan(1);
-    AddAttr<bool>("use_mkldnn",
-                  "(bool, default false) Only used in mkldnn kernel")
-        .SetDefault(false);
     AddAttr<int>(
         "y_num_col_dims",
         R"DOC((int, default 1), The mul_op can take tensors with more than two,
@@ -178,22 +154,6 @@ class MulGradOp : public framework::OperatorWithKernel {
       ctx->SetOutputDim(y_grad_name, y_dims);
     }
   }
-
- private:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    framework::LibraryType library{framework::LibraryType::kPlain};
-#ifdef PADDLE_WITH_MKLDNN
-    if (library == framework::LibraryType::kPlain &&
-        platform::CanMKLDNNBeUsed(ctx)) {
-      library = framework::LibraryType::kMKLDNN;
-    }
-#endif
-    framework::DataLayout layout{framework::DataLayout::kAnyLayout};
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace(),
-        layout, library);
-  }
 };
 
 }  // namespace operators
diff --git a/paddle/fluid/operators/multiplex_op.cc b/paddle/fluid/operators/multiplex_op.cc
index b698c1bf8a05e053db07db34712a13c8074ee4d0..a4363fd25d57edb5c2509904a1f55634832613be 100644
--- a/paddle/fluid/operators/multiplex_op.cc
+++ b/paddle/fluid/operators/multiplex_op.cc
@@ -61,8 +61,7 @@ class MultiplexOp : public framework::OperatorWithKernel {
 
 class MultiplexOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  MultiplexOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("Ids", "The index tensor of multiplex operator.");
     AddInput("X", "The candidate tensors of multiplex operator.")
         .AsDuplicable();
diff --git a/paddle/fluid/operators/nccl_op.cc b/paddle/fluid/operators/nccl_op.cc
index 5e4ed886b10bd48bf991ce84a9099611cf5d1d26..0018139cb06fe0573565c920849843e674df6f4c 100644
--- a/paddle/fluid/operators/nccl_op.cc
+++ b/paddle/fluid/operators/nccl_op.cc
@@ -76,8 +76,7 @@ class NCCLInitOpShapeInference : public framework::InferShapeBase {
 
 class NCCLInitOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  NCCLInitOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput(kParallelScopes, "The working place of parallel do.");
     AddOutput("Communicator",
               "Create Communicator for communicating between gpus");
@@ -118,8 +117,7 @@ class NCCLAllReduceOp : public framework::OperatorWithKernel {
 // AllReduceOp
 class NCCLAllReduceOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  NCCLAllReduceOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X", "The input of AllReduce op");
     AddInput("Communicator", "Communicator for communicating between gpus");
     AddOutput("Out", "The output of AllReduce op");
@@ -165,8 +163,7 @@ class NCCLReduceOp : public framework::OperatorWithKernel {
 // ReduceOp
 class NCCLReduceOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  NCCLReduceOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X", "The input of Reduce op");
     AddInput("Communicator", "Communicator for communicating between gpus");
     AddOutput("Out", "The output of Reduce op");
@@ -214,8 +211,7 @@ class NCCLBcastOp : public framework::OperatorWithKernel {
 // BcastOp
 class NCCLBcastOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  NCCLBcastOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X", "The input of BcastSend op");
     AddInput("Communicator", "Communicator for communicating between gpus");
     AddOutput("Out", "The output of Bcast");
diff --git a/paddle/fluid/operators/nce_op.cc b/paddle/fluid/operators/nce_op.cc
index 192bdf8ea553f3a82066f8562458d286ee15a6ee..06092e680a1efbef379ccf40fdf476769f820429 100644
--- a/paddle/fluid/operators/nce_op.cc
+++ b/paddle/fluid/operators/nce_op.cc
@@ -75,8 +75,7 @@ class NCEOp : public framework::OperatorWithKernel {
 
 class NCEOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  NCEOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("Input", "(Tensor) A tensor of shape [batch_size, dim].");
     AddInput(
         "Label",
diff --git a/paddle/fluid/operators/norm_op.cc b/paddle/fluid/operators/norm_op.cc
index 30a991224fa184257a8e59af5e6a27a0b0a4da86..cdbc975c02214721ceae3a338741101ef32d7ee9 100644
--- a/paddle/fluid/operators/norm_op.cc
+++ b/paddle/fluid/operators/norm_op.cc
@@ -19,8 +19,7 @@ namespace operators {
 template <typename AttrType>
 class NormOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  NormOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput(
         "X",
         "(Tensor) The input tensor of norm operator. "
diff --git a/paddle/fluid/operators/one_hot_op.cc b/paddle/fluid/operators/one_hot_op.cc
index 1d42dfdd765166c9596abc08ce8abd534453bc63..4fcb1d69935175c3f643db7a4da04db34492f8fb 100644
--- a/paddle/fluid/operators/one_hot_op.cc
+++ b/paddle/fluid/operators/one_hot_op.cc
@@ -46,8 +46,7 @@ class OneHotOp : public framework::OperatorWithKernel {
 
 class OneHotOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  OneHotOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X",
              "(LoDTensor, LoDTensor<int>) Input variable with rank at least 2. "
              "The last dimension of X should be 1. Each value of X is an index "
diff --git a/paddle/fluid/operators/pad_op.cc b/paddle/fluid/operators/pad_op.cc
index d2a0106f80144e3550d73ea22f8e012426eb01ae..d4b631a6f5bf9332f4ed1d1a4bda529fbb6ada0a 100644
--- a/paddle/fluid/operators/pad_op.cc
+++ b/paddle/fluid/operators/pad_op.cc
@@ -48,8 +48,7 @@ class PadOp : public framework::OperatorWithKernel {
 
 class PadOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  PadOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X",
              "The input of pad op. "
              "The input should be a k-D tensor(k > 0 and k < 7)");
diff --git a/paddle/fluid/operators/parallel_do_op.cc b/paddle/fluid/operators/parallel_do_op.cc
index ae34fe2184b43cc104c14672dec30efd3b0e9f3b..1012640d5e2052e4f347ad458cea9072a004f334 100644
--- a/paddle/fluid/operators/parallel_do_op.cc
+++ b/paddle/fluid/operators/parallel_do_op.cc
@@ -196,8 +196,7 @@ class ParallelDoOp : public framework::OperatorBase {
 
 class ParallelDoOpProtoMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  ParallelDoOpProtoMaker(OpProto *proto, framework::OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput(kInputs, "").AsDuplicable();
     AddInput(kParameters, "").AsDuplicable();
     AddInput(kPlaces, "");
diff --git a/paddle/fluid/operators/pool_cudnn_op.cu.cc b/paddle/fluid/operators/pool_cudnn_op.cu.cc
index d60a99994edc926456706ff6a3ba998a3e5e7dd5..be55bc43b14f1e6211f71b4080d1676838ad508c 100644
--- a/paddle/fluid/operators/pool_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/pool_cudnn_op.cu.cc
@@ -135,7 +135,11 @@ class PoolCUDNNGradOpKernel : public framework::OpKernel<T> {
 
     PoolingMode pooling_mode;
     if (pooling_type == "max") {
-      pooling_mode = PoolingMode::kMaximum;
+      if (FLAGS_cudnn_deterministic) {
+        pooling_mode = PoolingMode::kMaximumDeterministic;
+      } else {
+        pooling_mode = PoolingMode::kMaximum;
+      }
     } else {
       pooling_mode = PoolingMode::kAverage;
     }
diff --git a/paddle/fluid/operators/pool_mkldnn_op.cc b/paddle/fluid/operators/pool_mkldnn_op.cc
index 63eaaedcd5fc3df17902511dc02b25bf43ccd241..a045f9e98dd7348973c3c4506f44d3e261599a14 100644
--- a/paddle/fluid/operators/pool_mkldnn_op.cc
+++ b/paddle/fluid/operators/pool_mkldnn_op.cc
@@ -18,6 +18,29 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
+using mkldnn::memory;  // Note: paddle has also "memory" namespace
+using mkldnn::pooling_forward;
+using mkldnn::pooling_backward;
+
+// Generate keys for storing/retriving primitives for this operator
+// TODO(jczaja): Make hashing function more optimial
+static std::string gethash(const memory::dims& input_dims,
+                           const std::string& pooling_type,
+                           const std::vector<int>& ksize,
+                           const std::vector<int>& strides,
+                           const std::vector<int>& paddings,
+                           const std::string& suffix) {
+  auto dims2str = [](const memory::dims& operand_dims) {
+    std::string dstr = "";
+    for (size_t i = 0; i < operand_dims.size(); ++i) {
+      dstr += std::to_string(operand_dims[i]) + "-";
+    }
+    return dstr;
+  };
+  return dims2str(input_dims) + dims2str(ksize) + dims2str(strides) +
+         dims2str(paddings) + pooling_type + suffix;
+}
+
 template <typename T>
 class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
  public:
@@ -34,10 +57,6 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
 
     // Get an unique name from "argument" name of "Out" variable
     // This name will be used as key when saving info into device context
-    const std::string key = ctx.op().Output("Out");
-    const std::string key_pool_pd = key + "@pool_pd";
-    const std::string key_pool_workspace_memory =
-        key + "@pool_workspace_memory";
 
     std::string pooling_type = ctx.Attr<std::string>("pooling_type");
     std::vector<int> ksize = ctx.Attr<std::vector<int>>("ksize");
@@ -63,37 +82,71 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     std::vector<int> src_tz = paddle::framework::vectorize2int(input->dims());
     std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());
 
-    // TODO(pzelazko-intel): support more formats
-    auto src_md = platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32,
-                                          mkldnn::memory::format::nchw);
-    auto dst_md = platform::MKLDNNMemDesc(dst_tz, mkldnn::memory::f32,
-                                          mkldnn::memory::format::nchw);
-
-    std::shared_ptr<mkldnn::pooling_forward::primitive_desc> pool_pd =
-        CreatePrimitiveDesc(src_md, dst_md, strides, paddings, ksize,
-                            pooling_type, mkldnn_engine);
-
-    // save pool_pd into global device context to be referred in backward path
-    dev_ctx.SetBlob(key_pool_pd, pool_pd);
-
-    std::shared_ptr<mkldnn::memory> workspace_memory =
-        CreateWorkspaceMemory(pool_pd, pooling_type, mkldnn_engine);
-
-    // save pool_workspace_memory to be referred in backward path
-    dev_ctx.SetBlob(key_pool_workspace_memory, workspace_memory);
-
-    auto src_memory =
-        mkldnn::memory({src_md, mkldnn_engine},
-                       static_cast<void*>(const_cast<T*>(input_data)));
-    auto dst_memory =
-        mkldnn::memory({dst_md, mkldnn_engine},
-                       static_cast<void*>(const_cast<T*>(output_data)));
+    const std::string key = gethash(src_tz, pooling_type, ksize, strides,
+                                    paddings, ctx.op().Output("Out"));
+    const std::string key_pool_p = key + "@pool_p";
+    const std::string key_pool_pd = key + "@pool_pd";
+    const std::string key_pool_src_mem_p = key + "@pool_src_mem_p";
+    const std::string key_pool_dst_mem_p = key + "@pool_dst_mem_p";
+    const std::string key_pool_workspace_memory =
+        key + "@pool_workspace_memory";
 
-    auto pool_prim = mkldnn::pooling_forward(*pool_pd, src_memory, dst_memory,
-                                             *workspace_memory);
+    auto pool_p =
+        std::static_pointer_cast<pooling_forward>(dev_ctx.GetBlob(key_pool_p));
+    if (pool_p == nullptr) {
+      // TODO(pzelazko-intel): support more formats
+
+      auto src_md =
+          platform::MKLDNNMemDesc(src_tz, platform::MKLDNNGetDataType<T>(),
+                                  mkldnn::memory::format::nchw);
+      auto dst_md =
+          platform::MKLDNNMemDesc(dst_tz, platform::MKLDNNGetDataType<T>(),
+                                  mkldnn::memory::format::nchw);
+
+      std::shared_ptr<pooling_forward::primitive_desc> pool_pd =
+          CreatePrimitiveDesc(src_md, dst_md, strides, paddings, ksize,
+                              pooling_type, mkldnn_engine);
+
+      // save pool_pd into global device context to be referred in backward path
+      dev_ctx.SetBlob(key_pool_pd, pool_pd);
+
+      std::shared_ptr<mkldnn::memory> workspace_memory =
+          CreateWorkspaceMemory(pool_pd, pooling_type, mkldnn_engine);
+
+      // save pool_workspace_memory to be referred in backward path
+      dev_ctx.SetBlob(key_pool_workspace_memory, workspace_memory);
+
+      auto pool_src_memory_p = std::make_shared<memory>(
+          memory::primitive_desc{src_md, mkldnn_engine},
+          static_cast<void*>(const_cast<T*>(input_data)));
+      dev_ctx.SetBlob(key_pool_src_mem_p, pool_src_memory_p);
+
+      auto pool_dst_memory_p = std::make_shared<memory>(
+          memory::primitive_desc{dst_md, mkldnn_engine},
+          static_cast<void*>(output_data));
+      dev_ctx.SetBlob(key_pool_dst_mem_p, pool_dst_memory_p);
+
+      pool_p = std::make_shared<pooling_forward>(
+          *pool_pd, *(pool_src_memory_p.get()), *(pool_dst_memory_p.get()),
+          *workspace_memory);
+      dev_ctx.SetBlob(key_pool_p, pool_p);
+    } else {
+      // Primitives already exist
+      auto pool_src_memory_p =
+          std::static_pointer_cast<memory>(dev_ctx.GetBlob(key_pool_src_mem_p));
+      PADDLE_ENFORCE(pool_src_memory_p != nullptr,
+                     "Fail to find pooling src mem_p in device context");
+      auto pool_dst_memory_p =
+          std::static_pointer_cast<memory>(dev_ctx.GetBlob(key_pool_dst_mem_p));
+      PADDLE_ENFORCE(pool_dst_memory_p != nullptr,
+                     "Fail to find pooling dst mem_p in device context");
+      pool_src_memory_p->set_data_handle(
+          reinterpret_cast<void*>(const_cast<T*>(input_data)));
+      pool_dst_memory_p->set_data_handle(output_data);
+    }
 
     // push primitive to stream and wait until it's executed
-    std::vector<mkldnn::primitive> pipeline{pool_prim};
+    std::vector<mkldnn::primitive> pipeline{*(pool_p.get())};
     mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
   }
 
@@ -120,9 +173,10 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     mkldnn::memory::primitive_desc workspace_md =
         pooling_type == "max"
             ? pool_pd->workspace_primitive_desc()
-            : mkldnn::memory::primitive_desc(
-                  {{}, mkldnn::memory::f32, mkldnn::memory::format::nchw},
-                  engine);
+            : mkldnn::memory::primitive_desc({{},
+                                              platform::MKLDNNGetDataType<T>(),
+                                              mkldnn::memory::format::nchw},
+                                             engine);
 
     auto p_workspace_memory = new mkldnn::memory(workspace_md);
     return std::unique_ptr<mkldnn::memory>(p_workspace_memory);
@@ -140,13 +194,6 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
     const Tensor* out_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
     Tensor* in_x_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
 
-    // Get an unique name from "argument" name of "Out" variable
-    // This name will be used as key when referring info from device context
-    const std::string key = ctx.op().Input("Out");
-    const std::string key_pool_pd = key + "@pool_pd";
-    const std::string key_pool_workspace_memory =
-        key + "@pool_workspace_memory";
-
     std::string pooling_type = ctx.Attr<std::string>("pooling_type");
     std::vector<int> ksize = ctx.Attr<std::vector<int>>("ksize");
     std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
@@ -171,43 +218,76 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
     std::vector<int> diff_dst_tz =
         paddle::framework::vectorize2int(out_grad->dims());
 
-    auto diff_src_md = platform::MKLDNNMemDesc(diff_src_tz, mkldnn::memory::f32,
-                                               mkldnn::memory::format::nchw);
-    auto diff_dst_md = platform::MKLDNNMemDesc(diff_dst_tz, mkldnn::memory::f32,
-                                               mkldnn::memory::format::nchw);
-
-    // Retrieve pool_pd/pool_workspace_memory from device context
-    auto pool_pd =
-        std::static_pointer_cast<mkldnn::pooling_forward::primitive_desc>(
-            dev_ctx.GetBlob(key_pool_pd));
-    PADDLE_ENFORCE(pool_pd != nullptr,
-                   "Fail to find pool_pd in device context");
-
-    auto workspace_memory = std::static_pointer_cast<mkldnn::memory>(
-        dev_ctx.GetBlob(key_pool_workspace_memory));
-    PADDLE_ENFORCE(workspace_memory != nullptr,
-                   "Fail to find workspace_memory in device context");
-
-    auto pool_bwd_desc = mkldnn::pooling_backward::desc(
-        pooling_type == "max" ? mkldnn::algorithm::pooling_max
-                              : mkldnn::algorithm::pooling_avg,
-        diff_src_md, diff_dst_md, strides, ksize, paddings, paddings,
-        mkldnn::padding_kind::zero);
-    auto pool_bwd_pd = mkldnn::pooling_backward::primitive_desc(
-        pool_bwd_desc, mkldnn_engine, *pool_pd);
-
-    auto diff_src_memory =
-        mkldnn::memory({diff_src_md, mkldnn_engine},
-                       static_cast<void*>(const_cast<T*>(in_x_grad_data)));
-    auto diff_dst_memory =
-        mkldnn::memory({diff_dst_md, mkldnn_engine},
-                       static_cast<void*>(const_cast<T*>(out_grad_data)));
+    // Get an unique name from "argument" name of "Out" variable
+    // This name will be used as key when referring info from device context
+    const std::string key = gethash(diff_src_tz, pooling_type, ksize, strides,
+                                    paddings, ctx.op().Input("Out"));
+    const std::string key_pool_bwd_p = key + "@pool_bwd_p";
+    const std::string key_pool_diff_src_mem_p = key + "@pool_diff_src_mem_p";
+    const std::string key_pool_diff_dst_mem_p = key + "@pool_diff_dst_mem_p";
+    const std::string key_pool_pd = key + "@pool_pd";
+    const std::string key_pool_workspace_memory =
+        key + "@pool_workspace_memory";
 
-    auto bwd_prim = mkldnn::pooling_backward(
-        pool_bwd_pd, diff_dst_memory, *workspace_memory, diff_src_memory);
+    auto pool_bwd_p = std::static_pointer_cast<pooling_backward>(
+        dev_ctx.GetBlob(key_pool_bwd_p));
+    if (pool_bwd_p == nullptr) {
+      auto diff_src_md =
+          platform::MKLDNNMemDesc(diff_src_tz, platform::MKLDNNGetDataType<T>(),
+                                  mkldnn::memory::format::nchw);
+      auto diff_dst_md =
+          platform::MKLDNNMemDesc(diff_dst_tz, platform::MKLDNNGetDataType<T>(),
+                                  mkldnn::memory::format::nchw);
+      // Retrieve pool_pd/pool_workspace_memory from device context
+      auto pool_pd =
+          std::static_pointer_cast<mkldnn::pooling_forward::primitive_desc>(
+              dev_ctx.GetBlob(key_pool_pd));
+      PADDLE_ENFORCE(pool_pd != nullptr,
+                     "Fail to find pool_pd in device context");
+
+      auto workspace_memory = std::static_pointer_cast<mkldnn::memory>(
+          dev_ctx.GetBlob(key_pool_workspace_memory));
+      PADDLE_ENFORCE(workspace_memory != nullptr,
+                     "Fail to find workspace_memory in device context");
+
+      auto pool_diff_src_memory_p = std::make_shared<memory>(memory(
+          {diff_src_md, mkldnn_engine}, static_cast<void*>(in_x_grad_data)));
+      dev_ctx.SetBlob(key_pool_diff_src_mem_p, pool_diff_src_memory_p);
+
+      auto pool_diff_dst_memory_p = std::make_shared<memory>(
+          memory({diff_dst_md, mkldnn_engine},
+                 static_cast<void*>(const_cast<T*>(out_grad_data))));
+      dev_ctx.SetBlob(key_pool_diff_dst_mem_p, pool_diff_dst_memory_p);
+
+      auto pool_bwd_desc = mkldnn::pooling_backward::desc(
+          pooling_type == "max" ? mkldnn::algorithm::pooling_max
+                                : mkldnn::algorithm::pooling_avg,
+          diff_src_md, diff_dst_md, strides, ksize, paddings, paddings,
+          mkldnn::padding_kind::zero);
+      auto pool_bwd_pd = mkldnn::pooling_backward::primitive_desc(
+          pool_bwd_desc, mkldnn_engine, *pool_pd);
+
+      pool_bwd_p = std::make_shared<pooling_backward>(
+          pool_bwd_pd, *(pool_diff_dst_memory_p.get()), *workspace_memory,
+          *(pool_diff_src_memory_p));
+      dev_ctx.SetBlob(key_pool_bwd_p, pool_bwd_p);
+    } else {
+      // Primitives already exist
+      auto pool_diff_src_memory_p = std::static_pointer_cast<memory>(
+          dev_ctx.GetBlob(key_pool_diff_src_mem_p));
+      PADDLE_ENFORCE(pool_diff_src_memory_p != nullptr,
+                     "Fail to find pooling src mem_p in device context");
+      auto pool_diff_dst_memory_p = std::static_pointer_cast<memory>(
+          dev_ctx.GetBlob(key_pool_diff_dst_mem_p));
+      PADDLE_ENFORCE(pool_diff_dst_memory_p != nullptr,
+                     "Fail to find pooling dst mem_p in device context");
+      pool_diff_src_memory_p->set_data_handle(
+          reinterpret_cast<void*>(in_x_grad_data));
+      pool_diff_dst_memory_p->set_data_handle(const_cast<T*>(out_grad_data));
+    }
 
     // push primitive to stream and wait until it's executed
-    std::vector<mkldnn::primitive> pipeline{bwd_prim};
+    std::vector<mkldnn::primitive> pipeline{*(pool_bwd_p.get())};
     mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
   }  // Compute()
 };
diff --git a/paddle/fluid/operators/pool_op.cc b/paddle/fluid/operators/pool_op.cc
index f2de075e0d82fc5bd0ac41b481ac80314f3857a3..18aa2bd352c5d184b5748e57b4af17c1ae0d7a82 100644
--- a/paddle/fluid/operators/pool_op.cc
+++ b/paddle/fluid/operators/pool_op.cc
@@ -83,6 +83,9 @@ void PoolOp::InferShape(framework::InferShapeContext *ctx) const {
 framework::OpKernelType PoolOp::GetExpectedKernelType(
     const framework::ExecutionContext &ctx) const {
   framework::LibraryType library_{framework::LibraryType::kPlain};
+  std::string data_format = ctx.Attr<std::string>("data_format");
+  framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
+
 #ifdef PADDLE_WITH_CUDA
   if (platform::CanCUDNNBeUsed(ctx)) {
     library_ = framework::LibraryType::kCUDNN;
@@ -92,11 +95,10 @@ framework::OpKernelType PoolOp::GetExpectedKernelType(
   if (library_ == framework::LibraryType::kPlain &&
       platform::CanMKLDNNBeUsed(ctx)) {
     library_ = framework::LibraryType::kMKLDNN;
+    layout_ = framework::DataLayout::kMKLDNN;
   }
 #endif
 
-  std::string data_format = ctx.Attr<std::string>("data_format");
-  framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
   return framework::OpKernelType(
       framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace(),
       layout_, library_);
@@ -112,6 +114,9 @@ void PoolOpGrad::InferShape(framework::InferShapeContext *ctx) const {
 framework::OpKernelType PoolOpGrad::GetExpectedKernelType(
     const framework::ExecutionContext &ctx) const {
   framework::LibraryType library_{framework::LibraryType::kPlain};
+  std::string data_format = ctx.Attr<std::string>("data_format");
+  framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
+
 #ifdef PADDLE_WITH_CUDA
   if (platform::CanCUDNNBeUsed(ctx)) {
     library_ = framework::LibraryType::kCUDNN;
@@ -121,6 +126,7 @@ framework::OpKernelType PoolOpGrad::GetExpectedKernelType(
   if (library_ == framework::LibraryType::kPlain &&
       platform::CanMKLDNNBeUsed(ctx)) {
     library_ = framework::LibraryType::kMKLDNN;
+    layout_ = framework::DataLayout::kMKLDNN;
   }
 #endif
 
@@ -129,14 +135,11 @@ framework::OpKernelType PoolOpGrad::GetExpectedKernelType(
     PADDLE_ENFORCE_EQ(library_, framework::LibraryType::kCUDNN,
                       "float16 can only be used when CUDNN is used");
   }
-  std::string data_format = ctx.Attr<std::string>("data_format");
-  framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
   return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout_,
                                  library_);
 }
 
-Pool2dOpMaker::Pool2dOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-    : OpProtoAndCheckerMaker(proto, op_checker) {
+void Pool2dOpMaker::Make() {
   AddInput(
       "X",
       "(Tensor) The input tensor of pooling operator. "
@@ -229,8 +232,7 @@ Example:
 )DOC");
 }
 
-Pool3dOpMaker::Pool3dOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-    : OpProtoAndCheckerMaker(proto, op_checker) {
+void Pool3dOpMaker::Make() {
   AddInput("X",
            "(Tensor) The input tensor of pooling operator. "
            "The format of input tensor is NCDHW, where N is batch size, C is "
diff --git a/paddle/fluid/operators/pool_op.h b/paddle/fluid/operators/pool_op.h
index a48127ea6983d3d4ea12ec4925f30af233002ef2..a63963ca926bb94ff99e5cfe6dbcb2b15075bcb8 100644
--- a/paddle/fluid/operators/pool_op.h
+++ b/paddle/fluid/operators/pool_op.h
@@ -50,12 +50,12 @@ class PoolOpGrad : public framework::OperatorWithKernel {
 
 class Pool2dOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  Pool2dOpMaker(OpProto* proto, OpAttrChecker* op_checker);
+  void Make() override;
 };
 
 class Pool3dOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  Pool3dOpMaker(OpProto* proto, OpAttrChecker* op_checker);
+  void Make() override;
 };
 
 template <typename DeviceContext, typename T>
diff --git a/paddle/fluid/operators/pool_with_index_op.cc b/paddle/fluid/operators/pool_with_index_op.cc
index 848cd61b23c2389d3fe11f585b256d55c1ff177f..873706593e4c856f0079738654a9e7e59a1c0cd8 100644
--- a/paddle/fluid/operators/pool_with_index_op.cc
+++ b/paddle/fluid/operators/pool_with_index_op.cc
@@ -100,8 +100,7 @@ class MaxPoolWithIndexOpGrad : public framework::OperatorWithKernel {
 
 class MaxPool2dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  MaxPool2dWithIndexOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput(
         "X",
         "(Tensor) The input tensor of pooling operator. "
@@ -177,8 +176,7 @@ Example:
 
 class MaxPool3dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  MaxPool3dWithIndexOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X",
              "(Tensor) The input tensor of pooling operator. "
              "The format of input tensor is NCDHW, where N is batch size, C is "
diff --git a/paddle/fluid/operators/positive_negative_pair_op.cc b/paddle/fluid/operators/positive_negative_pair_op.cc
index d237da25a00de13057e009b6705d3241b8b26539..4d865b7f17b050ac6f04addc9949f3f65da06ded 100644
--- a/paddle/fluid/operators/positive_negative_pair_op.cc
+++ b/paddle/fluid/operators/positive_negative_pair_op.cc
@@ -95,8 +95,7 @@ class PositiveNegativePairOp : public framework::OperatorWithKernel {
 
 class PositiveNegativePairOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  PositiveNegativePairOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("Score",
              "(Tensor, float) Model Score on an item (with "
              "respect to QueryID). It's a 2-D tensor with shape [batch_size, "
diff --git a/paddle/fluid/operators/precision_recall_op.cc b/paddle/fluid/operators/precision_recall_op.cc
index c34b0d072bdb2f5b97dd4615ff9338d98f2bfbe5..e7ce16f33fb5052ffb41fc05bd1538e2f0dc35be 100644
--- a/paddle/fluid/operators/precision_recall_op.cc
+++ b/paddle/fluid/operators/precision_recall_op.cc
@@ -90,8 +90,7 @@ class PrecisionRecallOp : public framework::OperatorWithKernel {
 
 class PrecisionRecallOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  PrecisionRecallOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("MaxProbs",
              "(Tensor, default Tensor<float>) A 2-D tensor with shape N x 1, "
              "where N is the batch size. Each row contains the max probability "
diff --git a/paddle/fluid/operators/prefetch_op.cc b/paddle/fluid/operators/prefetch_op.cc
index f9ae01ab5d2972d2a74b36ae6035985d1d874bb6..d96359d6befa68bdd0c255dde9c63bfc7fffc0a5 100644
--- a/paddle/fluid/operators/prefetch_op.cc
+++ b/paddle/fluid/operators/prefetch_op.cc
@@ -41,35 +41,26 @@ class PrefetchOp : public framework::OperatorBase {
     platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
     auto& ctx = *pool.Get(place);
 
-    auto client_var_name = Output("RPCClient");
-    PADDLE_ENFORCE_NOT_NULL(scope.FindVar(client_var_name),
-                            "Can not find variable '%s' in the scope.",
-                            client_var_name);
-    auto* client_var = scope.FindVar(client_var_name);
-    detail::RPCClient* rpc_client = client_var->GetMutable<detail::RPCClient>();
+    detail::RPCClient* rpc_client =
+        detail::RPCClient::GetInstance<detail::GRPCClient>();
 
     for (size_t i = 0; i < ins.size(); i++) {
       if (NeedSend(scope, ins[i])) {
         VLOG(3) << "sending " << ins[i] << " to " << epmap[i] << " to get "
                 << outs[i] << " back";
-        rpc_client->AsyncPrefetchVariable(epmap[i], ctx, scope, ins[i],
-                                          outs[i]);
+        rpc_client->AsyncPrefetchVar(epmap[i], ctx, scope, ins[i], outs[i]);
       } else {
         VLOG(3) << "don't send no-initialied variable: " << ins[i];
       }
     }
-    PADDLE_ENFORCE(rpc_client->Wait());
+    rpc_client->Wait();
   }
 };
 
 class PrefetchOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  PrefetchOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() {
     AddInput("X", "(LoDTensor) Input Id variables to be sent").AsDuplicable();
-    AddOutput("RPCClient",
-              "(RPCClient) The RPC client object which will be"
-              "initialized at most once.");
     AddOutput("Out",
               "(LoDTensor) result "
               "to be fetched from parameter server")
@@ -88,17 +79,6 @@ the parameter server and fetch result back.
   }
 };
 
-class PrefetchOpVarTypeInference : public framework::VarTypeInference {
- public:
-  void operator()(const framework::OpDesc& op_desc,
-                  framework::BlockDesc* block) const override {
-    auto out_var_name = op_desc.Output("RPCClient").front();
-    auto& out_var = block->FindRecursiveOrCreateVar(out_var_name);
-    auto var_type = framework::proto::VarType::RAW;
-    out_var.SetType(var_type);
-  }
-};
-
 class PrefetchOpShapeInference : public framework::InferShapeBase {
  public:
   void operator()(framework::InferShapeContext* ctx) const override {}
@@ -111,5 +91,4 @@ namespace ops = paddle::operators;
 
 REGISTER_OPERATOR(prefetch, ops::PrefetchOp,
                   paddle::framework::EmptyGradOpMaker, ops::PrefetchOpMaker,
-                  ops::PrefetchOpVarTypeInference,
                   ops::PrefetchOpShapeInference);
diff --git a/paddle/fluid/operators/prelu_op.cc b/paddle/fluid/operators/prelu_op.cc
index a066b3e06e5eca2661827425b5b2d0059d5bcc3c..db040509bc08c3f6ad031c5b97c93574e31337e0 100644
--- a/paddle/fluid/operators/prelu_op.cc
+++ b/paddle/fluid/operators/prelu_op.cc
@@ -38,8 +38,7 @@ class PReluOp : public framework::OperatorWithKernel {
 
 class PReluOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  PReluOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X", "The input tensor of prelu operator.");
     AddInput("Alpha", "The alpha weight of prelu operator.");
     AddOutput("Out", "The output tensor of prelu operator.");
diff --git a/paddle/fluid/operators/print_op.cc b/paddle/fluid/operators/print_op.cc
index fafc7e54d7a44d6bb2dadf67135537dc16430e76..db7634918a5179a61304315ecd08350d23fb4642 100644
--- a/paddle/fluid/operators/print_op.cc
+++ b/paddle/fluid/operators/print_op.cc
@@ -209,8 +209,7 @@ class TensorPrintOp : public framework::OperatorBase {
 
 class PrintOpProtoAndCheckMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  PrintOpProtoAndCheckMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("In", "Input tensor to be displayed.");
     AddAttr<int>("first_n", "Only log `first_n` number of times.");
     AddAttr<std::string>("message", "A string message to print as a prefix.");
diff --git a/paddle/fluid/operators/proximal_adagrad_op.cc b/paddle/fluid/operators/proximal_adagrad_op.cc
index e057244c1e974edea1b9bbc76c0585c295495299..8d8075d76111928ec9855eb0b70fe6dbd90a979b 100644
--- a/paddle/fluid/operators/proximal_adagrad_op.cc
+++ b/paddle/fluid/operators/proximal_adagrad_op.cc
@@ -66,8 +66,7 @@ class ProximalAdagradOp : public framework::OperatorWithKernel {
 
 class ProximalAdagradOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  ProximalAdagradOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("Param",
              "(Tensor, default Tensor<float>) "
              "Input parameter that has to be updated.");
diff --git a/paddle/fluid/operators/proximal_gd_op.cc b/paddle/fluid/operators/proximal_gd_op.cc
index ed1472631870e5aee6b0e8b8f80bb5e6c84a3851..baf9cbcba2ed89f62afc9816e0ab9e0f112e6008 100644
--- a/paddle/fluid/operators/proximal_gd_op.cc
+++ b/paddle/fluid/operators/proximal_gd_op.cc
@@ -54,8 +54,7 @@ class ProximalGDOp : public framework::OperatorWithKernel {
 
 class ProximalGDOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  ProximalGDOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("Param",
              "(Tensor, default Tensor<float>) "
              "Input parameter value that has to be updated.");
diff --git a/paddle/fluid/operators/random_crop_op.cc b/paddle/fluid/operators/random_crop_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..528a6e4a1b68fe611d104f21bafe970762611a03
--- /dev/null
+++ b/paddle/fluid/operators/random_crop_op.cc
@@ -0,0 +1,80 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/operators/random_crop_op.h"
+
+namespace paddle {
+namespace operators {
+
+class RandomCropOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
+        ctx.device_context());
+  }
+};
+
+class RandomCropOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X", "A batch of instances to random crop.");
+    AddInput("Seed", "The random seed.");
+    AddOutput("Out", "The cropped instance batch.");
+    AddOutput("SeedOut", "The random seed after random cropping.")
+        .AsIntermediate();
+    AddAttr<std::vector<int>>("shape", "The shape of a cropped instance.");
+    AddComment(R"DOC(
+      This operator takes a batch of instance, and do random cropping on each instance.
+      It means that cropping positions differs on each instance, which is determined
+      by an uniform random generator. All cropped instances have the same shape, which 
+      is determined by the operator's attribute 'shape'.
+    )DOC");
+  }
+};
+
+class RandomCropOpInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext* ctx) const override {
+    auto seed_dim = ctx->GetInputDim("Seed");
+    PADDLE_ENFORCE(seed_dim.size() == 1 && seed_dim[0] == 1);
+    auto shape = ctx->Attrs().Get<std::vector<int>>("shape");
+    auto x_dim = ctx->GetInputDim("X");
+    PADDLE_ENFORCE_GT(x_dim.size(), static_cast<int64_t>(shape.size()));
+    auto out_dim = framework::vectorize2int(x_dim);
+    for (size_t i = 1; i <= shape.size(); ++i) {
+      size_t x_i = x_dim.size() - i;
+      size_t shape_i = shape.size() - i;
+      PADDLE_ENFORCE_GE(x_dim[x_i], shape[shape_i]);
+      out_dim[x_i] = shape[shape_i];
+    }
+    ctx->SetOutputDim("Out", framework::make_ddim(out_dim));
+    ctx->SetOutputDim("SeedOut", framework::make_ddim({1}));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace f = paddle::framework;
+REGISTER_OPERATOR(random_crop, ops::RandomCropOp, ops::RandomCropOpMaker,
+                  ops::RandomCropOpInferShape, f::EmptyGradOpMaker);
+
+template <typename T>
+using Kernel = ops::RandomCropKernel<paddle::platform::CPUDeviceContext, T>;
+REGISTER_OP_CPU_KERNEL(random_crop, Kernel<float>, Kernel<int>, Kernel<double>,
+                       Kernel<uint8_t>, Kernel<int16_t>);
diff --git a/paddle/fluid/operators/random_crop_op.cu b/paddle/fluid/operators/random_crop_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..6fc9bedc55b4d349ddf3d109c7f9049113235f0c
--- /dev/null
+++ b/paddle/fluid/operators/random_crop_op.cu
@@ -0,0 +1,21 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/random_crop_op.h"
+
+namespace ops = paddle::operators;
+template <typename T>
+using Kernel = ops::RandomCropKernel<paddle::platform::CUDADeviceContext, T>;
+REGISTER_OP_CUDA_KERNEL(random_crop, Kernel<float>, Kernel<int>, Kernel<double>,
+                        Kernel<uint8_t>, Kernel<int16_t>);
diff --git a/paddle/fluid/operators/random_crop_op.h b/paddle/fluid/operators/random_crop_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..f3261cbdc986b0cc724315c1eb92b8b84e18c742
--- /dev/null
+++ b/paddle/fluid/operators/random_crop_op.h
@@ -0,0 +1,181 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <vector>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/detail/safe_ref.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/for_range.h"
+#ifdef PADDLE_WITH_CUDA
+#include <thrust/random.h>
+#endif
+
+namespace paddle {
+namespace operators {
+
+template <typename DeviceContext>
+struct Random;
+
+template <>
+struct Random<platform::CPUDeviceContext> {
+  using Engine = std::minstd_rand;
+
+  template <typename T>
+  using UniformIntDist = std::uniform_int_distribution<T>;
+};
+
+#ifdef PADDLE_WITH_CUDA
+template <>
+struct Random<platform::CUDADeviceContext> {
+  using Engine = thrust::minstd_rand;
+
+  template <typename T>
+  using UniformIntDist = thrust::uniform_int_distribution<T>;
+};
+#endif
+
+template <typename T>
+HOSTDEVICE inline void StridedMemcpy(const T* x, const size_t* x_dims, T* out,
+                                     const size_t* out_dims, int i, int rank,
+                                     size_t prod_x_remain,
+                                     size_t prod_out_remain,
+                                     const size_t* offsets) {
+  size_t x_dim_i = x_dims[i];
+  size_t out_dim_i = out_dims[i];
+  size_t x_stride = prod_x_remain / x_dim_i;
+  size_t out_stride = prod_out_remain / out_dim_i;
+  size_t offset_i = offsets[i];
+
+  if (i == rank - 1) {
+    PADDLE_ASSERT(x_stride == 1 && out_stride == 1);
+    x += offset_i;
+    for (size_t j = 0; j < out_dim_i; ++j) {
+      *out++ = *x++;
+    }
+  } else {
+    x += offset_i * x_stride;
+    for (size_t j = 0; j < out_dim_i; ++j) {
+      StridedMemcpy<T>(x, x_dims, out, out_dims, i + 1, rank, x_stride,
+                       out_stride, offsets);
+      x += x_stride;
+      out += out_stride;
+    }
+  }
+}
+
+template <typename DeviceContext, typename T>
+struct RandomCropFunctor {
+  const T* x_;
+  T* out_;
+  size_t x_dims_[9];
+  size_t out_dims_[9];
+  int num_batchsize_dims_;
+  int rank_;
+  int64_t seed_;
+
+  size_t prod_batchsize_dims_;
+  size_t prod_x_ins_dims_;
+  size_t prod_out_ins_dims_;
+
+  RandomCropFunctor(const T* x, T* out, const framework::DDim& x_dims,
+                    const framework::DDim& out_dims, int num_batchsize_dims,
+                    int64_t seed)
+      : x_(x),
+        out_(out),
+        num_batchsize_dims_(num_batchsize_dims),
+        rank_(x_dims.size()),
+        seed_(seed) {
+    PADDLE_ENFORCE_EQ(x_dims.size(), out_dims.size());
+    PADDLE_ENFORCE_GT(rank_, num_batchsize_dims_);
+    prod_batchsize_dims_ = 1;
+    prod_x_ins_dims_ = 1;
+    prod_out_ins_dims_ = 1;
+    for (size_t i = 0; i < static_cast<size_t>(rank_); ++i) {
+      size_t x_dim_i = x_dims[i];
+      size_t out_dim_i = out_dims[i];
+      x_dims_[i] = x_dim_i;
+      out_dims_[i] = out_dim_i;
+      if (i < static_cast<size_t>(num_batchsize_dims_)) {
+        PADDLE_ENFORCE_EQ(x_dim_i, out_dim_i);
+        prod_batchsize_dims_ *= x_dim_i;
+      } else {
+        prod_x_ins_dims_ *= x_dim_i;
+        prod_out_ins_dims_ *= out_dim_i;
+      }
+    }
+  }
+
+  HOSTDEVICE void operator()(size_t ins_idx) {
+    typename Random<DeviceContext>::Engine engine(seed_);
+    engine.discard(ins_idx * (rank_ - num_batchsize_dims_));
+    size_t offsets[9];
+    for (int i = num_batchsize_dims_; i < rank_; ++i) {
+      typename Random<DeviceContext>::template UniformIntDist<size_t> dist(
+          0, x_dims_[i] - out_dims_[i]);
+      offsets[i - num_batchsize_dims_] = dist(engine);
+    }
+
+    const T* x = x_ + ins_idx * prod_x_ins_dims_;
+    T* out = out_ + ins_idx * prod_out_ins_dims_;
+
+    StridedMemcpy<T>(x, x_dims_ + num_batchsize_dims_, out,
+                     out_dims_ + num_batchsize_dims_, 0,
+                     rank_ - num_batchsize_dims_, prod_x_ins_dims_,
+                     prod_out_ins_dims_, offsets);
+  }
+};
+
+template <typename DeviceContext, typename T>
+class RandomCropKernel : public framework::OpKernel<T> {
+ public:
+  virtual void Compute(const framework::ExecutionContext& ctx) const {
+    auto& seed_tensor = detail::Ref(ctx.Input<framework::LoDTensor>("Seed"));
+    int64_t seed = 0;
+    if (platform::is_cpu_place(seed_tensor.place())) {
+      seed = *seed_tensor.data<int64_t>();
+    } else {
+      LOG(WARNING) << "It is slow to place seed in GPU memory. Please verify "
+                      "your program";
+      framework::LoDTensor cpu_seed;
+      framework::TensorCopySync(seed_tensor, platform::CPUPlace(), &cpu_seed);
+      seed = *cpu_seed.data<int64_t>();
+    }
+    auto shape = ctx.Attr<std::vector<int>>("shape");
+    auto& x = detail::Ref(ctx.Input<framework::LoDTensor>("X"));
+    auto& out = detail::Ref(ctx.Output<framework::LoDTensor>("Out"));
+
+    int num_batchsize_dims = x.dims().size() - shape.size();
+    RandomCropFunctor<DeviceContext, T> functor(
+        x.data<T>(), out.mutable_data<T>(ctx.GetPlace()), x.dims(), out.dims(),
+        num_batchsize_dims, seed);
+    platform::ForRange<DeviceContext> for_range(
+        ctx.template device_context<DeviceContext>(),
+        functor.prod_batchsize_dims_);
+
+    for_range(functor);
+
+    Random<platform::CPUDeviceContext>::Engine engine(seed);
+    engine.discard(functor.prod_batchsize_dims_ *
+                   (functor.rank_ - functor.num_batchsize_dims_));
+    *ctx.Output<framework::LoDTensor>("SeedOut")->mutable_data<int64_t>(
+        platform::CPUPlace()) = engine();
+  }
+};
+
+// TODO(fengjiayi): Backward of random crop op
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/rank_loss_op.cc b/paddle/fluid/operators/rank_loss_op.cc
index eb9ff8de3e4b37ef0bbf7477c1bb62856bdb6310..313cf01541dd88a0f4f8bf54fe4436984c2cbcf8 100644
--- a/paddle/fluid/operators/rank_loss_op.cc
+++ b/paddle/fluid/operators/rank_loss_op.cc
@@ -46,8 +46,7 @@ class RankLossOp : public framework::OperatorWithKernel {
 
 class RankLossOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  RankLossOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("Label",
              "(2-D Tensor with shape [batch_size x 1]) "
              "The label indicating A ranked higher than B or not.");
diff --git a/paddle/fluid/operators/read_op.cc b/paddle/fluid/operators/read_op.cc
index bf02b9958927580608b95d6b8ecfddc7231a02d4..72a27d43584d55cd0859c63577ae85ff0f5fdfa8 100644
--- a/paddle/fluid/operators/read_op.cc
+++ b/paddle/fluid/operators/read_op.cc
@@ -79,8 +79,7 @@ class ReadOp : public framework::OperatorBase {
 
 class ReadOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  ReadOpMaker(OpProto* op_proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(op_proto, op_checker) {
+  void Make() override {
     AddInput("Reader", "(ReaderHolder) The executed reader.");
     AddOutput("Out", "(LoDTensor) The output data.").AsDuplicable();
     AddComment(R"DOC(
diff --git a/paddle/fluid/operators/reader/CMakeLists.txt b/paddle/fluid/operators/reader/CMakeLists.txt
index 3106978eb0149b14849dfd1aaad8bbe76791f2f6..62532036f86bfb82465ccd9e0ec526299489932a 100644
--- a/paddle/fluid/operators/reader/CMakeLists.txt
+++ b/paddle/fluid/operators/reader/CMakeLists.txt
@@ -23,6 +23,7 @@ reader_library(create_recordio_file_reader_op SRCS create_recordio_file_reader_o
 reader_library(create_double_buffer_reader_op SRCS create_double_buffer_reader_op.cc)
 reader_library(create_multi_pass_reader_op SRCS create_multi_pass_reader_op.cc)
 reader_library(create_threaded_reader_op SRCS create_threaded_reader_op.cc)
+reader_library(create_custom_reader_op SRCS create_custom_reader_op.cc)
 
 cc_test(reader_blocking_queue_test SRCS reader_blocking_queue_test.cc)
 # Export local libraries to parent
diff --git a/paddle/fluid/operators/reader/create_batch_reader_op.cc b/paddle/fluid/operators/reader/create_batch_reader_op.cc
index 04c5872bef4600e30ba572a025cc5f0a5e9839ca..4cc7cbc6e89b0712faf9ad9c51480bce00da15f5 100644
--- a/paddle/fluid/operators/reader/create_batch_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_batch_reader_op.cc
@@ -52,9 +52,8 @@ class CreateBatchReaderOp : public framework::OperatorBase {
 };
 
 class CreateBatchReaderOpMaker : public DecoratedReaderMakerBase {
- public:
-  CreateBatchReaderOpMaker(OpProto* op_proto, OpAttrChecker* op_checker)
-      : DecoratedReaderMakerBase(op_proto, op_checker) {
+ protected:
+  void Apply() override {
     AddAttr<int>("batch_size",
                  "How many instances the batch reader yields each time.")
         .GreaterThan(0);
diff --git a/paddle/fluid/operators/reader/create_custom_reader_op.cc b/paddle/fluid/operators/reader/create_custom_reader_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..331224a59899b4a7d517ca4f7141fb5b8f4f5168
--- /dev/null
+++ b/paddle/fluid/operators/reader/create_custom_reader_op.cc
@@ -0,0 +1,187 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/executor.h"
+#include "paddle/fluid/operators/detail/safe_ref.h"
+#include "paddle/fluid/operators/reader/reader_op_registry.h"
+
+namespace paddle {
+namespace operators {
+namespace reader {
+
+class CustomReader : public framework::DecoratedReader {
+ public:
+  CustomReader(ReaderBase* reader, const framework::BlockDesc& sub_block,
+               const std::vector<std::string>& source_var_names,
+               const std::vector<std::string>& sink_var_names)
+      : DecoratedReader(reader),
+        program_(*sub_block.Program()),
+        sub_block_id_(sub_block.ID()),
+        exe_(framework::Executor(platform::CPUPlace())),
+        source_var_names_(source_var_names),
+        sink_var_names_(sink_var_names) {}
+
+  void ReadNext(std::vector<framework::LoDTensor>* out) override;
+
+ private:
+  const framework::ProgramDesc program_;
+  int sub_block_id_;
+  framework::Executor exe_;
+
+  std::vector<std::string> source_var_names_;
+  std::vector<std::string> sink_var_names_;
+};
+
+class CreateCustomReaderOp : public framework::OperatorBase {
+ public:
+  using framework::OperatorBase::OperatorBase;
+
+ private:
+  void RunImpl(const framework::Scope& scope,
+               const platform::Place& dev_place) const override {
+    auto* out = scope.FindVar(Output("Out"))
+                    ->template GetMutable<framework::ReaderHolder>();
+    auto* sub_block = Attr<framework::BlockDesc*>("sub_block");
+    if (out->Get() != nullptr) {
+      return;
+    }
+    const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
+                                        ->Get<framework::ReaderHolder>();
+    out->Reset(
+        new CustomReader(underlying_reader.Get(), *sub_block,
+                         Attr<std::vector<std::string>>("source_var_names"),
+                         Attr<std::vector<std::string>>("sink_var_names")));
+  }
+};
+
+class CreateCustomReaderOpMaker : public DecoratedReaderMakerBase {
+ protected:
+  void Apply() override {
+    AddAttr<framework::BlockDesc*>(
+        "sub_block", "The block to hold all preprocessing operators.");
+    AddAttr<std::vector<std::string>>(
+        "source_var_names",
+        "Source variables are starting points of data preprocessing. They hold "
+        "preprocessing's input tensors. Each source variable corresponds to "
+        "one of underlying reader's output datas.");
+    AddAttr<std::vector<std::string>>(
+        "sink_var_names",
+        "Sink variables are ending points of data preprocessing. They hold "
+        "preprocessing's output tensors. Each sink variable corresponds to "
+        "one of custom reader's output datas.");
+    AddComment(R"DOC(
+      CreateCustomReader Operator
+
+      A custom reader can be used for input data preprocessing. 
+      A custom reader holds its own sub-block, which will be executed in CPU 
+      in its 'ReadNext()' function. Users can configurate their own 
+      preprocessing pipelines by inserting operators into custom reader's 
+      sub-block.
+    )DOC");
+  }
+};
+
+class CustomReaderInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(!ctx->IsRuntime(),
+                   "'CustomReaderInferShape' should only be invoked during "
+                   "compile time.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "The output decorated reader should not be null.");
+    const auto* sub_block =
+        ctx->Attrs().Get<framework::BlockDesc*>("sub_block");
+    const auto sink_var_names =
+        ctx->Attrs().Get<std::vector<std::string>>("sink_var_names");
+    std::vector<std::vector<int64_t>> res_dims;
+    std::vector<int32_t> res_lod_levels;
+    for (const std::string& var_name : sink_var_names) {
+      auto* sink_var = sub_block->FindVar(var_name);
+      PADDLE_ENFORCE_NOT_NULL(sink_var);
+      res_dims.emplace_back(sink_var->GetShape());
+      res_lod_levels.push_back(sink_var->GetLoDLevel());
+    }
+    auto* out_reader =
+        boost::get<framework::VarDesc*>(ctx->GetOutputVarPtrs("Out")[0]);
+    out_reader->SetShapes(res_dims);
+    out_reader->SetLoDLevels(res_lod_levels);
+  }
+};
+
+class CustomReaderInferVarType : public framework::VarTypeInference {
+ public:
+  void operator()(const framework::OpDesc& op_desc,
+                  framework::BlockDesc* block) const override {
+    framework::VarDesc* out_reader = block->FindVar(op_desc.Output("Out")[0]);
+    PADDLE_ENFORCE_NOT_NULL(out_reader);
+    out_reader->SetType(framework::proto::VarType::READER);
+
+    auto sink_var_names =
+        boost::get<std::vector<std::string>>(op_desc.GetAttr("sink_var_names"));
+    const auto* sub_block =
+        boost::get<framework::BlockDesc*>(op_desc.GetAttr("sub_block"));
+    std::vector<framework::proto::VarType::Type> res_data_types;
+    for (const std::string& var_name : sink_var_names) {
+      framework::VarDesc* var = sub_block->FindVar(var_name);
+      PADDLE_ENFORCE_NOT_NULL(var);
+      res_data_types.emplace_back(var->GetDataType());
+    }
+    out_reader->SetDataTypes(res_data_types);
+  }
+};
+
+void CustomReader::ReadNext(std::vector<framework::LoDTensor>* out) {
+  out->clear();
+  std::vector<framework::LoDTensor> underlying_outs;
+  reader_->ReadNext(&underlying_outs);
+  if (underlying_outs.empty()) {
+    // There is not next data.
+    return;
+  }
+  PADDLE_ENFORCE(source_var_names_.size() == underlying_outs.size(),
+                 "The size of source_var_names(%d) and the size of "
+                 "underlying_outs(%d) are not consistent. Each feeding element "
+                 "must have its own source variable.",
+                 source_var_names_.size(), underlying_outs.size());
+  // The scope for CustomReader's sub-block should be independent and shouldn't
+  // be any other computation scope's child. Otherwise, data preprocessing and
+  // compution cannot be concurrent.
+  framework::Scope scope;
+  // 1. Copy LoDTensors from underlying reader's output to source variables.
+  for (size_t i = 0; i < source_var_names_.size(); ++i) {
+    framework::Variable* var = scope.Var(source_var_names_[i]);
+    framework::LoDTensor* tensor = var->GetMutable<framework::LoDTensor>();
+    tensor->ShareDataWith(underlying_outs[i]);
+    tensor->set_lod(underlying_outs[i].lod());
+  }
+  // 2. Run the sub-block.
+  exe_.Run(program_, &scope, sub_block_id_, false, true);
+  // 3. Copy LoDTensors from sink variables to out.
+  out->resize(sink_var_names_.size());
+  for (size_t i = 0; i < sink_var_names_.size(); ++i) {
+    const auto& tensor = detail::Ref(scope.FindVar(sink_var_names_[i]))
+                             .Get<framework::LoDTensor>();
+    framework::TensorCopySync(tensor, platform::CPUPlace(), &(*out)[i]);
+  }
+}
+
+}  // namespace reader
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators::reader;
+REGISTER_OPERATOR(create_custom_reader, ops::CreateCustomReaderOp,
+                  ops::CreateCustomReaderOpMaker, ops::CustomReaderInferShape,
+                  ops::CustomReaderInferVarType,
+                  paddle::framework::EmptyGradOpMaker)
diff --git a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
index e5efac461512a9a1869318d6547233589ca45a77..bc830a2b72e657f79f4c94e24428d38ff2b7c42e 100644
--- a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
@@ -113,14 +113,13 @@ class CreateDoubleBufferReaderOp : public framework::OperatorBase {
 };
 
 class CreateDoubleBufferReaderOpMaker : public DecoratedReaderMakerBase {
- public:
-  CreateDoubleBufferReaderOpMaker(OpProto* op_proto, OpAttrChecker* op_checker)
-      : DecoratedReaderMakerBase(op_proto, op_checker) {
+ protected:
+  void Apply() override {
     AddComment(R"DOC(
       CreateDoubleBufferReader Operator
 
       A double buffer reader takes another reader as its 'underlying reader'.
-      It launches another thread to execute the 'underlying reader' asynchronously, 
+      It launches another thread to execute the 'underlying reader' asynchronously,
       which prevents reading process from blocking subsequent training.
     )DOC");
     std::unordered_set<std::string> enum_range;
diff --git a/paddle/fluid/operators/reader/create_multi_pass_reader_op.cc b/paddle/fluid/operators/reader/create_multi_pass_reader_op.cc
index 0573345ba502b6a9af35710840d5acf7634f332f..249b0b7c6dbc8b8104bce95562e6e9b2a28c77f8 100644
--- a/paddle/fluid/operators/reader/create_multi_pass_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_multi_pass_reader_op.cc
@@ -65,20 +65,19 @@ class CreateMultiPassReaderOp : public framework::OperatorBase {
 };
 
 class CreateMultiPassReaderOpMaker : public DecoratedReaderMakerBase {
- public:
-  CreateMultiPassReaderOpMaker(OpProto* op_proto, OpAttrChecker* op_checker)
-      : DecoratedReaderMakerBase(op_proto, op_checker) {
+ protected:
+  void Apply() override {
     AddAttr<int>("pass_num", "The number of pass to run.").GreaterThan(0);
     AddComment(R"DOC(
       CreateMultiPassReader Operator
 
-      This operator creates a multi-pass reader. A multi-pass reader 
-      is used to yield data for several pass training continuously. 
+      This operator creates a multi-pass reader. A multi-pass reader
+      is used to yield data for several pass training continuously.
       It takes the number of passes to run as one of its attributes
-      ('pass_num'), and maintains a pass counter to record how many 
-      passes it has completed. When the underlying reader reaches the 
-      EOF, the multi-pass reader checks whether it has completed training 
-      of the given number of pass. If not, the underlying reader will 
+      ('pass_num'), and maintains a pass counter to record how many
+      passes it has completed. When the underlying reader reaches the
+      EOF, the multi-pass reader checks whether it has completed training
+      of the given number of pass. If not, the underlying reader will
       be re-initialized and starts a new pass automatically.
     )DOC");
   }
diff --git a/paddle/fluid/operators/reader/create_random_data_generator_op.cc b/paddle/fluid/operators/reader/create_random_data_generator_op.cc
index d1cb8e47da70cab784858caea7e791151fc104dd..5b7e8a063a034f0be056065826fca0fe807bc9a7 100644
--- a/paddle/fluid/operators/reader/create_random_data_generator_op.cc
+++ b/paddle/fluid/operators/reader/create_random_data_generator_op.cc
@@ -21,14 +21,15 @@ namespace reader {
 template <typename T>
 class RandomDataGenerator : public framework::ReaderBase {
  public:
-  RandomDataGenerator(const std::vector<framework::DDim>& shapes, float min,
-                      float max)
-      : framework::ReaderBase(), min_(min), max_(max), shapes_(shapes) {
-    PADDLE_ENFORCE_LE(
-        min, max, "'min' shouldn't be greater than 'max'.(%f vs %f)", min, max);
+  RandomDataGenerator(const std::vector<framework::DDim>& shapes, float low,
+                      float high)
+      : framework::ReaderBase(), low_(low), high_(high), shapes_(shapes) {
+    PADDLE_ENFORCE_LE(low, high,
+                      "'low' shouldn't be greater than 'high'.(%f vs %f)", low,
+                      high);
     unsigned int seed = std::random_device()();
     engine_.seed(seed);
-    dist_ = std::uniform_real_distribution<float>(min_, max_);
+    dist_ = std::uniform_real_distribution<float>(low_, high_);
   }
 
   void ReadNext(std::vector<framework::LoDTensor>* out) override {
@@ -53,8 +54,8 @@ class RandomDataGenerator : public framework::ReaderBase {
   void ReInit() override { return; }
 
  private:
-  float min_;
-  float max_;
+  float low_;
+  float high_;
   std::minstd_rand engine_;
   std::uniform_real_distribution<float> dist_;
   std::vector<framework::DDim> shapes_;
@@ -78,23 +79,22 @@ class CreateRandomDataGeneratorOp : public framework::OperatorBase {
     std::vector<framework::DDim> shapes = RestoreShapes(shape_concat, ranks);
     auto* out = scope.FindVar(Output("Out"))
                     ->template GetMutable<framework::ReaderHolder>();
-    out->Reset(new RandomDataGenerator<T>(shapes, Attr<float>("min"),
-                                          Attr<float>("max")));
+    out->Reset(new RandomDataGenerator<T>(shapes, Attr<float>("low"),
+                                          Attr<float>("high")));
   }
 };
 
 class CreateRandomDataGeneratorOpMaker : public FileReaderMakerBase {
- public:
-  CreateRandomDataGeneratorOpMaker(OpProto* op_proto, OpAttrChecker* op_checker)
-      : FileReaderMakerBase(op_proto, op_checker) {
-    AddAttr<float>("min", "The lower bound of reader's uniform distribution.");
-    AddAttr<float>("max", "The upper bound of reader's uniform distribution.");
+ protected:
+  void Apply() override {
+    AddAttr<float>("low", "The lower bound of reader's uniform distribution.");
+    AddAttr<float>("high", "The upper bound of reader's uniform distribution.");
     AddComment(R"DOC(
       CreateRandomDataGenerator Operator
 
       This Op creates a random reader.
       The reader generates random data instead of really reading from files.
-      Generated data follow an uniform distribution between 'min' and 'max'.
+      Generated data follow an uniform distribution between 'low' and 'high'.
     )DOC");
   }
 };
diff --git a/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc b/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc
index 2ae29725561769ebe6428002c9983246b8eec724..282ec3f36b98e7aa62d71fb04f72721a5464e21c 100644
--- a/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc
@@ -76,9 +76,8 @@ class CreateRecordIOReaderOp : public framework::OperatorBase {
 };
 
 class CreateRecordIOReaderOpMaker : public FileReaderMakerBase {
- public:
-  CreateRecordIOReaderOpMaker(OpProto* op_proto, OpAttrChecker* op_checker)
-      : FileReaderMakerBase(op_proto, op_checker) {
+ protected:
+  void Apply() override {
     AddAttr<std::string>("filename", "The filename of record io reader");
     AddComment(R"DOC(
       CreateRecordIOReader Operator
diff --git a/paddle/fluid/operators/reader/create_shuffle_reader_op.cc b/paddle/fluid/operators/reader/create_shuffle_reader_op.cc
index 13825d65913be95f4f444bd9d5271a036ec8b1e2..fd233be945932eee9f9a3c0c578a43d5b7cc83aa 100644
--- a/paddle/fluid/operators/reader/create_shuffle_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_shuffle_reader_op.cc
@@ -92,9 +92,8 @@ class CreateShuffleReaderOp : public framework::OperatorBase {
 };
 
 class CreateShuffleReaderOpMaker : public DecoratedReaderMakerBase {
- public:
-  CreateShuffleReaderOpMaker(OpProto* op_proto, OpAttrChecker* op_checker)
-      : DecoratedReaderMakerBase(op_proto, op_checker) {
+ protected:
+  void Apply() override {
     AddAttr<int>("buffer_size", "The shuffle buffer size.").GreaterThan(0);
     AddComment(R"DOC(
       CreateShuffleReader Operator
diff --git a/paddle/fluid/operators/reader/create_threaded_reader_op.cc b/paddle/fluid/operators/reader/create_threaded_reader_op.cc
index 1cb9bd36455a2287b8ba4fb4ca14a4c5338da098..1db70f3e9699dba604569c36dc35025dfe2c94fe 100644
--- a/paddle/fluid/operators/reader/create_threaded_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_threaded_reader_op.cc
@@ -53,17 +53,16 @@ class CreateThreadedReaderOp : public framework::OperatorBase {
 };
 
 class CreateThreadedReaderOpMaker : public DecoratedReaderMakerBase {
- public:
-  CreateThreadedReaderOpMaker(OpProto* op_proto, OpAttrChecker* op_checker)
-      : DecoratedReaderMakerBase(op_proto, op_checker) {
+ protected:
+  void Apply() override {
     AddComment(R"DOC(
       CreateThreadedReader Operator
 
-      This operator creates a threaded reader. A threaded reader's 
-      'ReadNext()' can be invoked by several threads at the same 
-      time. 
-      When the attribute 'safe_mode' is true, the threaded reader's 
-      'ReInit()' is disabled to avoid unexpected bugs in multi-thread 
+      This operator creates a threaded reader. A threaded reader's
+      'ReadNext()' can be invoked by several threads at the same
+      time.
+      When the attribute 'safe_mode' is true, the threaded reader's
+      'ReInit()' is disabled to avoid unexpected bugs in multi-thread
       environment.
     )DOC");
   }
diff --git a/paddle/fluid/operators/reader/open_files_op.cc b/paddle/fluid/operators/reader/open_files_op.cc
index 91ad7d56583446ee4686e74187de166f387125df..31e5d81e55ed9703eb3a9ef2595fa2a280f1a734 100644
--- a/paddle/fluid/operators/reader/open_files_op.cc
+++ b/paddle/fluid/operators/reader/open_files_op.cc
@@ -26,7 +26,11 @@ class MultiFileReader : public framework::ReaderBase {
   MultiFileReader(const std::vector<std::string>& file_names,
                   const std::vector<framework::DDim>& dims, size_t thread_num,
                   size_t buffer_size)
-      : file_names_(file_names), dims_(dims), buffer_size_(buffer_size) {
+      : buffer_size_(buffer_size) {
+    readers_.reserve(file_names.size());
+    for (const std::string& f_name : file_names) {
+      readers_.emplace_back(CreateReaderByFileName(f_name, dims));
+    }
     prefetchers_.resize(thread_num);
     StartNewScheduler();
   }
@@ -40,14 +44,13 @@ class MultiFileReader : public framework::ReaderBase {
   void StartNewScheduler();
   void EndScheduler();
   void ScheduleThreadFunc();
-  void PrefetchThreadFunc(std::string file_name, size_t thread_idx);
+  void PrefetchThreadFunc(size_t reader_idx, size_t thread_idx);
 
-  std::vector<std::string> file_names_;
-  std::vector<framework::DDim> dims_;
+  std::vector<std::unique_ptr<framework::ReaderBase>> readers_;
   std::thread scheduler_;
   std::vector<std::thread> prefetchers_;
   size_t buffer_size_;
-  reader::BlockingQueue<size_t>* waiting_file_idx_;
+  reader::BlockingQueue<size_t>* waiting_reader_idx_;
   reader::BlockingQueue<size_t>* available_thread_idx_;
   reader::BlockingQueue<std::vector<framework::LoDTensor>>* buffer_;
 };
@@ -65,15 +68,15 @@ void MultiFileReader::ReInit() {
 
 void MultiFileReader::StartNewScheduler() {
   size_t thread_num = prefetchers_.size();
-  waiting_file_idx_ = new reader::BlockingQueue<size_t>(file_names_.size());
+  waiting_reader_idx_ = new reader::BlockingQueue<size_t>(readers_.size());
   available_thread_idx_ = new reader::BlockingQueue<size_t>(thread_num);
   buffer_ = new reader::BlockingQueue<std::vector<framework::LoDTensor>>(
       buffer_size_);
 
-  for (size_t i = 0; i < file_names_.size(); ++i) {
-    waiting_file_idx_->Send(i);
+  for (size_t i = 0; i < readers_.size(); ++i) {
+    waiting_reader_idx_->Send(i);
   }
-  waiting_file_idx_->Close();
+  waiting_reader_idx_->Close();
   for (size_t i = 0; i < thread_num; ++i) {
     available_thread_idx_->Send(i);
   }
@@ -84,13 +87,13 @@ void MultiFileReader::StartNewScheduler() {
 void MultiFileReader::EndScheduler() {
   available_thread_idx_->Close();
   buffer_->Close();
-  waiting_file_idx_->Close();
+  waiting_reader_idx_->Close();
   if (scheduler_.joinable()) {
     scheduler_.join();
   }
   delete buffer_;
   delete available_thread_idx_;
-  delete waiting_file_idx_;
+  delete waiting_reader_idx_;
 }
 
 void MultiFileReader::ScheduleThreadFunc() {
@@ -102,12 +105,11 @@ void MultiFileReader::ScheduleThreadFunc() {
     if (prefetcher.joinable()) {
       prefetcher.join();
     }
-    size_t file_idx;
-    if (waiting_file_idx_->Receive(&file_idx)) {
+    size_t reader_idx;
+    if (waiting_reader_idx_->Receive(&reader_idx)) {
       // Still have files to read. Start a new prefetch thread.
-      std::string file_name = file_names_[file_idx];
-      prefetcher = std::thread([this, file_name, thread_idx] {
-        PrefetchThreadFunc(file_name, thread_idx);
+      prefetcher = std::thread([this, reader_idx, thread_idx] {
+        PrefetchThreadFunc(reader_idx, thread_idx);
       });
     } else {
       // No more file to read.
@@ -129,23 +131,22 @@ void MultiFileReader::ScheduleThreadFunc() {
   VLOG(5) << "MultiFileReader schedule thread terminates.";
 }
 
-void MultiFileReader::PrefetchThreadFunc(std::string file_name,
-                                         size_t thread_idx) {
-  VLOG(5) << "The prefetch thread of file '" << file_name << "' starts.";
-  std::unique_ptr<framework::ReaderBase> reader =
-      CreateReaderByFileName(file_name, dims_);
+void MultiFileReader::PrefetchThreadFunc(size_t reader_idx, size_t thread_idx) {
+  VLOG(5) << "The prefetch thread of file idx '" << reader_idx << "' starts.";
+  std::unique_ptr<framework::ReaderBase>& reader = readers_[reader_idx];
   while (true) {
     std::vector<framework::LoDTensor> ins;
     reader->ReadNext(&ins);
     if (ins.empty()) {
+      reader->ReInit();
       break;
     }
     try {
       buffer_->Send(std::move(ins));
     } catch (paddle::platform::EnforceNotMet e) {
       VLOG(5) << "WARNING: The buffer channel has been closed. The prefetch "
-                 "thread of file '"
-              << file_name << "' will terminate.";
+                 "thread of file idx '"
+              << reader_idx << "' will terminate.";
       break;
     }
   }
@@ -154,7 +155,8 @@ void MultiFileReader::PrefetchThreadFunc(std::string file_name,
     VLOG(5) << "WARNING: The available_thread_idx_ channel has been closed. "
                "Fail to send thread_idx.";
   }
-  VLOG(5) << "The prefetch thread of file '" << file_name << "' terminates.";
+  VLOG(5) << "The prefetch thread of file idx '" << reader_idx
+          << "' terminates.";
 }
 
 class OpenFilesOp : public framework::OperatorBase {
@@ -185,9 +187,8 @@ class OpenFilesOp : public framework::OperatorBase {
 };
 
 class OpenFilesOpMaker : public FileReaderMakerBase {
- public:
-  OpenFilesOpMaker(OpProto* op_proto, OpAttrChecker* op_checker)
-      : FileReaderMakerBase(op_proto, op_checker) {
+ protected:
+  void Apply() override {
     AddAttr<std::vector<std::string>>("file_names", "Files to be read.");
     AddAttr<int>("thread_num", "The maximal concurrent prefetch thread number.")
         .GreaterThan(0);
@@ -196,7 +197,7 @@ class OpenFilesOpMaker : public FileReaderMakerBase {
     AddComment(R"DOC(
       OpenFiles Operator
 
-      An OpenFilesOp creates a MultiFileReader, which is able to 
+      An OpenFilesOp creates a MultiFileReader, which is able to
       read data multi-threaded from multiple files.
     )DOC");
   }
diff --git a/paddle/fluid/operators/reader/reader_op_registry.cc b/paddle/fluid/operators/reader/reader_op_registry.cc
index 3ff4536819b128d9c593b97f4942a0292a3b6b36..612e1f5eca3a4836db1fd167fc6bb63400d20177 100644
--- a/paddle/fluid/operators/reader/reader_op_registry.cc
+++ b/paddle/fluid/operators/reader/reader_op_registry.cc
@@ -53,10 +53,7 @@ std::unique_ptr<framework::ReaderBase> CreateReaderByFileName(
   return std::unique_ptr<framework::ReaderBase>(reader);
 }
 
-FileReaderMakerBase::FileReaderMakerBase(
-    framework::OpProtoAndCheckerMaker::OpProto* op_proto,
-    framework::OpAttrChecker* op_checker)
-    : OpProtoAndCheckerMaker(op_proto, op_checker) {
+void FileReaderMakerBase::Make() {
   AddOutput("Out", "(ReaderHolder) The created random reader.").AsDuplicable();
   AddAttr<std::vector<int>>("shape_concat", "The concat of all data's shapes.");
   AddAttr<std::vector<int>>(
@@ -68,6 +65,7 @@ FileReaderMakerBase::FileReaderMakerBase(
       "It means the reader will generate two data each time,"
       "whose shapes are [2,3,4] and [5,6] respectively.");
   AddAttr<std::vector<int>>("lod_levels", "The LoD levels of each data.");
+  Apply();
 }
 
 void FileReaderInferShape::operator()(framework::InferShapeContext* ctx) const {
@@ -117,6 +115,7 @@ void DecoratedReaderInferShape::operator()(
       boost::get<framework::VarDesc*>(ctx->GetOutputVarPtrs("Out")[0]);
   out_reader->SetLoDLevels(in_reader->GetLoDLevels());
 }
+
 void DecoratedReaderInferVarType::operator()(
     const framework::OpDesc& op_desc, framework::BlockDesc* block) const {
   std::string in_reader_name = op_desc.Input("UnderlyingReader")[0];
@@ -127,13 +126,11 @@ void DecoratedReaderInferVarType::operator()(
   out_reader->SetDataTypes(in_reader->GetDataTypes());
 }
 
-DecoratedReaderMakerBase::DecoratedReaderMakerBase(
-    framework::OpProtoAndCheckerMaker::OpProto* op_proto,
-    framework::OpAttrChecker* op_checker)
-    : OpProtoAndCheckerMaker(op_proto, op_checker) {
+void DecoratedReaderMakerBase::Make() {
   AddInput("UnderlyingReader",
            "(ReaderHolder) The underlying reader for creating a batch reader.");
   AddOutput("Out", "(ReaderHolder) The created batch reader.");
+  Apply();
 }
 
 }  // namespace reader
diff --git a/paddle/fluid/operators/reader/reader_op_registry.h b/paddle/fluid/operators/reader/reader_op_registry.h
index ec25f55ef5c3bb691b1213328b996c080656bb7b..244bf15f068a47efc29ee54492cdbdeb10025020 100644
--- a/paddle/fluid/operators/reader/reader_op_registry.h
+++ b/paddle/fluid/operators/reader/reader_op_registry.h
@@ -47,7 +47,10 @@ extern std::vector<framework::DDim> RestoreShapes(
 
 class FileReaderMakerBase : public framework::OpProtoAndCheckerMaker {
  public:
-  FileReaderMakerBase(OpProto* op_proto, OpAttrChecker* op_checker);
+  void Make() final;
+
+ protected:
+  virtual void Apply() = 0;
 };
 
 class FileReaderInferShape : public framework::InferShapeBase {
@@ -76,7 +79,10 @@ class DecoratedReaderInferVarType : public framework::VarTypeInference {
 
 class DecoratedReaderMakerBase : public framework::OpProtoAndCheckerMaker {
  public:
-  DecoratedReaderMakerBase(OpProto* op_proto, OpAttrChecker* op_checker);
+  void Make() final;
+
+ protected:
+  virtual void Apply() = 0;
 };
 
 }  // namespace reader
diff --git a/paddle/fluid/operators/recurrent_op.cc b/paddle/fluid/operators/recurrent_op.cc
index 72c2905872c528a7ed05820744f4031799ad9e46..9c1cee7022a9b9a98f026f7602f0f7badc44a49b 100644
--- a/paddle/fluid/operators/recurrent_op.cc
+++ b/paddle/fluid/operators/recurrent_op.cc
@@ -508,8 +508,7 @@ class RecurrentGradOp : public RecurrentBase {
 
 class RecurrentOpProtoMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  RecurrentOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput(kInputs, "rnn inputs").AsDuplicable();
     AddInput(kInitialStates, "rnn initial states").AsDuplicable();
     AddInput(kParameters,
diff --git a/paddle/fluid/operators/recv_op.cc b/paddle/fluid/operators/recv_op.cc
index a4dcf704a63ae3bad6567ddb042ea23513bccff7..1ea1cc458b2a20017cc36457af81387a8c808642 100644
--- a/paddle/fluid/operators/recv_op.cc
+++ b/paddle/fluid/operators/recv_op.cc
@@ -21,6 +21,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 
 #include "paddle/fluid/operators/detail/grpc_client.h"
+#include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
 namespace operators {
@@ -36,25 +37,29 @@ class RecvOp : public framework::OperatorBase {
                const platform::Place& place) const override {
     auto outs = Outputs("Out");
     std::vector<std::string> epmap = Attr<std::vector<std::string>>("epmap");
+    int sync_mode = Attr<int>("sync_mode");
 
     platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
     auto& ctx = *pool.Get(place);
+    // For profiling
+    platform::RecordEvent record_event(Type(), &ctx);
+
+    detail::RPCClient* rpc_client =
+        detail::RPCClient::GetInstance<detail::GRPCClient>();
 
     for (size_t i = 0; i < outs.size(); i++) {
-      VLOG(3) << "getting " << outs[i];
-      client_.AsyncGetVariable(epmap[i], ctx, scope, outs[i]);
+      VLOG(3) << "getting " << outs[i] << " from " << epmap[i];
+      rpc_client->AsyncGetVar(epmap[i], ctx, scope, outs[i]);
+    }
+    if (sync_mode) {
+      rpc_client->Wait();
     }
-    PADDLE_ENFORCE(client_.Wait());
   }
-
- private:
-  mutable detail::RPCClient client_;
 };
 
 class RecvOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  RecvOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() {
     AddOutput("Out", "(Tensor) Variables to get from server.").AsDuplicable();
     AddComment(R"DOC(
 Recv operator
@@ -66,6 +71,10 @@ This operator can get variables from server side.
                                       "Server endpoints in the order of input "
                                       "variables for mapping")
         .SetDefault({});
+    AddAttr<int>("sync_mode",
+                 "(int, default 0)"
+                 "sync recv or async recv.")
+        .SetDefault(0);
   }
 };
 
diff --git a/paddle/fluid/operators/reduce_max_op.cc b/paddle/fluid/operators/reduce_max_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..95d3768e1fdf6947659c7b3a1c9d57fad741472a
--- /dev/null
+++ b/paddle/fluid/operators/reduce_max_op.cc
@@ -0,0 +1,34 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/reduce_min_max_op.h"
+
+REGISTER_REDUCE_OP(reduce_max);
+REGISTER_OP_CPU_KERNEL(
+    reduce_max, ops::ReduceKernel<paddle::platform::CPUDeviceContext, float,
+                                  ops::MaxFunctor>,
+    ops::ReduceKernel<paddle::platform::CPUDeviceContext, double,
+                      ops::MaxFunctor>,
+    ops::ReduceKernel<paddle::platform::CPUDeviceContext, int, ops::MaxFunctor>,
+    ops::ReduceKernel<paddle::platform::CPUDeviceContext, int64_t,
+                      ops::MaxFunctor>);
+REGISTER_OP_CPU_KERNEL(
+    reduce_max_grad, ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
+                                           float, ops::MaxOrMinGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, double,
+                          ops::MaxOrMinGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, int,
+                          ops::MaxOrMinGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, int64_t,
+                          ops::MaxOrMinGradFunctor>);
diff --git a/paddle/fluid/operators/reduce_max_op.cu b/paddle/fluid/operators/reduce_max_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..0d86b3127e42f7ee14ba57b1c762e8128a0f2d54
--- /dev/null
+++ b/paddle/fluid/operators/reduce_max_op.cu
@@ -0,0 +1,34 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/reduce_min_max_op.h"
+
+REGISTER_OP_CUDA_KERNEL(reduce_max,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          float, ops::MaxFunctor>,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          double, ops::MaxFunctor>,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          int, ops::MaxFunctor>,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          int64_t, ops::MaxFunctor>);
+REGISTER_OP_CUDA_KERNEL(
+    reduce_max_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
+                                           float, ops::MaxOrMinGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
+                          ops::MaxOrMinGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
+                          ops::MaxOrMinGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t,
+                          ops::MaxOrMinGradFunctor>);
diff --git a/paddle/fluid/operators/reduce_mean_op.cc b/paddle/fluid/operators/reduce_mean_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..fc258c2496340b47d24dc89f16f7419dbb4b0d95
--- /dev/null
+++ b/paddle/fluid/operators/reduce_mean_op.cc
@@ -0,0 +1,35 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/reduce_mean_op.h"
+
+REGISTER_REDUCE_OP(reduce_mean);
+REGISTER_OP_CPU_KERNEL(reduce_mean,
+                       ops::ReduceKernel<paddle::platform::CPUDeviceContext,
+                                         float, ops::MeanFunctor>,
+                       ops::ReduceKernel<paddle::platform::CPUDeviceContext,
+                                         double, ops::MeanFunctor>,
+                       ops::ReduceKernel<paddle::platform::CPUDeviceContext,
+                                         int, ops::MeanFunctor>,
+                       ops::ReduceKernel<paddle::platform::CPUDeviceContext,
+                                         int64_t, ops::MeanFunctor>);
+REGISTER_OP_CPU_KERNEL(reduce_mean_grad,
+                       ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
+                                             float, ops::MeanGradFunctor>,
+                       ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
+                                             double, ops::MeanGradFunctor>,
+                       ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
+                                             int, ops::MeanGradFunctor>,
+                       ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
+                                             int64_t, ops::MeanGradFunctor>);
diff --git a/paddle/fluid/operators/reduce_mean_op.cu b/paddle/fluid/operators/reduce_mean_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..960cb3235be7f4cc98b97d3b088ceaeb3d4a4209
--- /dev/null
+++ b/paddle/fluid/operators/reduce_mean_op.cu
@@ -0,0 +1,34 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/reduce_mean_op.h"
+
+REGISTER_OP_CUDA_KERNEL(reduce_mean,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          float, ops::MeanFunctor>,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          double, ops::MeanFunctor>,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          int, ops::MeanFunctor>,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          int64_t, ops::MeanFunctor>);
+REGISTER_OP_CUDA_KERNEL(
+    reduce_mean_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
+                                            float, ops::MeanGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
+                          ops::MeanGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
+                          ops::MeanGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t,
+                          ops::MeanGradFunctor>);
diff --git a/paddle/fluid/operators/reduce_mean_op.h b/paddle/fluid/operators/reduce_mean_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..1359679c4767d2032bf3e3a90849ad2a2ef3e829
--- /dev/null
+++ b/paddle/fluid/operators/reduce_mean_op.h
@@ -0,0 +1,39 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/operators/reduce_op.h"
+
+namespace paddle {
+namespace operators {
+
+struct MeanFunctor {
+  template <typename DeviceContext, typename X, typename Y, typename Dim>
+  void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
+    y->device(place) = x->mean(dim);
+  }
+};
+
+struct MeanGradFunctor {
+  template <typename DeviceContext, typename X, typename Y, typename DX,
+            typename DY, typename Dim>
+  void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy,
+                  const Dim& dim, int size) {
+    dx->device(place) = dy->broadcast(dim) / dx->constant(size);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/reduce_min_max_op.h b/paddle/fluid/operators/reduce_min_max_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..ec59f3e71c1c702655a3feed10935b2f5a29d8a8
--- /dev/null
+++ b/paddle/fluid/operators/reduce_min_max_op.h
@@ -0,0 +1,50 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+
+#include "paddle/fluid/operators/reduce_op.h"
+
+namespace paddle {
+namespace operators {
+
+struct MaxFunctor {
+  template <typename DeviceContext, typename X, typename Y, typename Dim>
+  void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
+    y->device(place) = x->maximum(dim);
+  }
+};
+
+struct MinFunctor {
+  template <typename DeviceContext, typename X, typename Y, typename Dim>
+  void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
+    y->device(place) = x->minimum(dim);
+  }
+};
+
+struct MaxOrMinGradFunctor {
+  template <typename DeviceContext, typename X, typename Y, typename DX,
+            typename DY, typename Dim>
+  void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy,
+                  const Dim& dim, int size) {
+    auto equals = (*x) == y->broadcast(dim);
+    auto ones = dx->constant(1);
+    auto zeros = dx->constant(0);
+    // If there are multiple minimum or maximum elements, the subgradient of
+    // each is the set [0, 1], and we pass gradient to all of them here.
+    dx->device(place) = dy->broadcast(dim) * equals.select(ones, zeros);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/reduce_min_op.cc b/paddle/fluid/operators/reduce_min_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..330a86d2e4237a10d8cf6fd40025540edf08d897
--- /dev/null
+++ b/paddle/fluid/operators/reduce_min_op.cc
@@ -0,0 +1,34 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/reduce_min_max_op.h"
+
+REGISTER_REDUCE_OP(reduce_min);
+REGISTER_OP_CPU_KERNEL(
+    reduce_min, ops::ReduceKernel<paddle::platform::CPUDeviceContext, float,
+                                  ops::MinFunctor>,
+    ops::ReduceKernel<paddle::platform::CPUDeviceContext, double,
+                      ops::MinFunctor>,
+    ops::ReduceKernel<paddle::platform::CPUDeviceContext, int, ops::MinFunctor>,
+    ops::ReduceKernel<paddle::platform::CPUDeviceContext, int64_t,
+                      ops::MinFunctor>);
+REGISTER_OP_CPU_KERNEL(
+    reduce_min_grad, ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
+                                           float, ops::MaxOrMinGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, double,
+                          ops::MaxOrMinGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, int,
+                          ops::MaxOrMinGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, int64_t,
+                          ops::MaxOrMinGradFunctor>);
diff --git a/paddle/fluid/operators/reduce_min_op.cu b/paddle/fluid/operators/reduce_min_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..da466f805eff4709dc23471baef03e94052ee6c1
--- /dev/null
+++ b/paddle/fluid/operators/reduce_min_op.cu
@@ -0,0 +1,34 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/reduce_min_max_op.h"
+
+REGISTER_OP_CUDA_KERNEL(reduce_min,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          float, ops::MinFunctor>,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          double, ops::MinFunctor>,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          int, ops::MinFunctor>,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          int64_t, ops::MinFunctor>);
+REGISTER_OP_CUDA_KERNEL(
+    reduce_min_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
+                                           float, ops::MaxOrMinGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
+                          ops::MaxOrMinGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
+                          ops::MaxOrMinGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t,
+                          ops::MaxOrMinGradFunctor>);
diff --git a/paddle/fluid/operators/reduce_op.cc b/paddle/fluid/operators/reduce_op.cc
deleted file mode 100644
index 093db966472cf100b2f1e4159ce20399cee1f481..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/reduce_op.cc
+++ /dev/null
@@ -1,234 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/reduce_op.h"
-
-#include <string>
-#include <vector>
-
-namespace paddle {
-namespace operators {
-
-using framework::Tensor;
-
-class ReduceOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of ReduceOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of ReduceOp should not be null.");
-    auto x_dims = ctx->GetInputDim("X");
-    auto x_rank = x_dims.size();
-    PADDLE_ENFORCE_LE(x_rank, 6, "Tensors with rank at most 6 are supported.");
-    int dim = ctx->Attrs().Get<int>("dim");
-    if (dim < 0) dim = x_rank + dim;
-    PADDLE_ENFORCE_LT(
-        dim, x_rank,
-        "The dim should be in the range [-rank(input), rank(input)).");
-    bool reduce_all = ctx->Attrs().Get<bool>("reduce_all");
-    bool keep_dim = ctx->Attrs().Get<bool>("keep_dim");
-    if (reduce_all) {
-      if (keep_dim)
-        ctx->SetOutputDim(
-            "Out", framework::make_ddim(std::vector<int64_t>(x_rank, 1)));
-      else
-        ctx->SetOutputDim("Out", {1});
-    } else {
-      auto dims_vector = vectorize(x_dims);
-      if (keep_dim || x_rank == 1) {
-        dims_vector[dim] = 1;
-      } else {
-        dims_vector.erase(dims_vector.begin() + dim);
-      }
-      auto out_dims = framework::make_ddim(dims_vector);
-      ctx->SetOutputDim("Out", out_dims);
-      if (dim != 0) {
-        // Only pass LoD when not reducing on the first dim.
-        ctx->ShareLoD("X", /*->*/ "Out");
-      }
-    }
-  }
-};
-
-class ReduceGradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
-                   "Input(Out@GRAD) should not be null.");
-    auto x_dims = ctx->GetInputDim("X");
-    auto x_rank = x_dims.size();
-    PADDLE_ENFORCE_LE(x_rank, 6, "Tensors with rank at most 6 are supported.");
-    int dim = ctx->Attrs().Get<int>("dim");
-    if (dim < 0) dim = x_rank + dim;
-    PADDLE_ENFORCE_LT(
-        dim, x_rank,
-        "The dim should be in the range [-rank(input), rank(input)).");
-    auto x_grad_name = framework::GradVarName("X");
-    if (ctx->HasOutput(x_grad_name)) {
-      ctx->SetOutputDim(x_grad_name, x_dims);
-      ctx->ShareLoD("X", /*->*/ x_grad_name);
-    }
-  }
-};
-
-class ReduceOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  ReduceOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X",
-             "(Tensor) The input tensor. Tensors with rank at most 6 are "
-             "supported.");
-    AddOutput("Out", "(Tensor) The result tensor.");
-    AddAttr<int>(
-        "dim",
-        "(int, default 0) The dimension to reduce. "
-        "Must be in the range [-rank(input), rank(input)). "
-        "If `dim < 0`, the dim to reduce is `rank + dim`. "
-        "Note that reducing on the first dim will make the LoD info lost.")
-        .SetDefault(0);
-    AddAttr<bool>("keep_dim",
-                  "(bool, default false) "
-                  "If true, retain the reduced dimension with length 1.")
-        .SetDefault(false);
-    AddAttr<bool>("reduce_all",
-                  "(bool, default false) "
-                  "If true, output a scalar reduced along all dimensions.")
-        .SetDefault(false);
-    comment_ = R"DOC(
-{ReduceOp} Operator.
-
-This operator computes the {reduce} of input tensor along the given dimension. 
-The result tensor has 1 fewer dimension than the input unless keep_dim is true.
-If reduce_all is true, just reduce along all dimensions and output a scalar.
-
-)DOC";
-    AddComment(comment_);
-  }
-
- protected:
-  std::string comment_;
-
-  void Replace(std::string *src, std::string from, std::string to) {
-    std::size_t len_from = std::strlen(from.c_str());
-    std::size_t len_to = std::strlen(to.c_str());
-    for (std::size_t pos = src->find(from); pos != std::string::npos;
-         pos = src->find(from, pos + len_to)) {
-      src->replace(pos, len_from, to);
-    }
-  }
-
-  void SetComment(std::string name, std::string op) {
-    Replace(&comment_, "{ReduceOp}", name);
-    Replace(&comment_, "{reduce}", op);
-  }
-};
-
-class ReduceSumOpMaker : public ReduceOpMaker {
- public:
-  ReduceSumOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : ReduceOpMaker(proto, op_checker) {
-    SetComment("ReduceSum", "sum");
-    AddComment(comment_);
-  }
-};
-
-class ReduceMeanOpMaker : public ReduceOpMaker {
- public:
-  ReduceMeanOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : ReduceOpMaker(proto, op_checker) {
-    SetComment("ReduceMean", "mean");
-    AddComment(comment_);
-  }
-};
-
-class ReduceMaxOpMaker : public ReduceOpMaker {
- public:
-  ReduceMaxOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : ReduceOpMaker(proto, op_checker) {
-    SetComment("ReduceMax", "max");
-    AddComment(comment_);
-  }
-};
-
-class ReduceMinOpMaker : public ReduceOpMaker {
- public:
-  ReduceMinOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : ReduceOpMaker(proto, op_checker) {
-    SetComment("ReduceMin", "min");
-    AddComment(comment_);
-  }
-};
-
-class ReduceProdOpMaker : public ReduceOpMaker {
- public:
-  ReduceProdOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : ReduceOpMaker(proto, op_checker) {
-    SetComment("ReduceProd", "production");
-    AddComment(comment_);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OPERATOR(reduce_sum, ops::ReduceOp, ops::ReduceSumOpMaker,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
-REGISTER_OPERATOR(reduce_sum_grad, ops::ReduceGradOp);
-
-REGISTER_OPERATOR(reduce_mean, ops::ReduceOp, ops::ReduceMeanOpMaker,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
-REGISTER_OPERATOR(reduce_mean_grad, ops::ReduceGradOp);
-
-REGISTER_OPERATOR(reduce_max, ops::ReduceOp, ops::ReduceMaxOpMaker,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
-REGISTER_OPERATOR(reduce_max_grad, ops::ReduceGradOp);
-
-REGISTER_OPERATOR(reduce_min, ops::ReduceOp, ops::ReduceMinOpMaker,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
-REGISTER_OPERATOR(reduce_min_grad, ops::ReduceGradOp);
-
-REGISTER_OPERATOR(reduce_prod, ops::ReduceOp, ops::ReduceProdOpMaker,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
-REGISTER_OPERATOR(reduce_prod_grad, ops::ReduceGradOp);
-
-#define REGISTER_REDUCE_CPU_KERNEL(reduce_type, functor, grad_functor)         \
-  REGISTER_OP_CPU_KERNEL(reduce_type,                                          \
-                         ops::ReduceKernel<paddle::platform::CPUDeviceContext, \
-                                           float, ops::functor>,               \
-                         ops::ReduceKernel<paddle::platform::CPUDeviceContext, \
-                                           double, ops::functor>,              \
-                         ops::ReduceKernel<paddle::platform::CPUDeviceContext, \
-                                           int, ops::functor>,                 \
-                         ops::ReduceKernel<paddle::platform::CPUDeviceContext, \
-                                           int64_t, ops::functor>);            \
-  REGISTER_OP_CPU_KERNEL(                                                      \
-      reduce_type##_grad,                                                      \
-      ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, float,         \
-                            ops::grad_functor>,                                \
-      ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, double,        \
-                            ops::grad_functor>,                                \
-      ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, int,           \
-                            ops::grad_functor>,                                \
-      ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, int64_t,       \
-                            ops::grad_functor>);
-
-FOR_EACH_KERNEL_FUNCTOR(REGISTER_REDUCE_CPU_KERNEL);
diff --git a/paddle/fluid/operators/reduce_op.cu b/paddle/fluid/operators/reduce_op.cu
deleted file mode 100644
index ae29587f55847315b1d84f1344677e753fe01a9b..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/reduce_op.cu
+++ /dev/null
@@ -1,41 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#define EIGEN_USE_GPU
-#include "paddle/fluid/operators/reduce_op.h"
-
-namespace ops = paddle::operators;
-
-#define REGISTER_REDUCE_GPU_KERNEL(reduce_type, functor, grad_functor)    \
-  REGISTER_OP_CUDA_KERNEL(                                                \
-      reduce_type, ops::ReduceKernel<paddle::platform::CUDADeviceContext, \
-                                     float, ops::functor>,                \
-      ops::ReduceKernel<paddle::platform::CUDADeviceContext, double,      \
-                        ops::functor>,                                    \
-      ops::ReduceKernel<paddle::platform::CUDADeviceContext, int,         \
-                        ops::functor>,                                    \
-      ops::ReduceKernel<paddle::platform::CUDADeviceContext, int64_t,     \
-                        ops::functor>);                                   \
-  REGISTER_OP_CUDA_KERNEL(                                                \
-      reduce_type##_grad,                                                 \
-      ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, float,   \
-                            ops::grad_functor>,                           \
-      ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,  \
-                            ops::grad_functor>,                           \
-      ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,     \
-                            ops::grad_functor>,                           \
-      ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t, \
-                            ops::grad_functor>);
-
-FOR_EACH_KERNEL_FUNCTOR(REGISTER_REDUCE_GPU_KERNEL);
diff --git a/paddle/fluid/operators/reduce_op.h b/paddle/fluid/operators/reduce_op.h
index e42b4bfe42df05346020d4f48519fecf39aa37d2..72b6cf1773d5bcc42e40e72111179d454d2bb4a9 100644
--- a/paddle/fluid/operators/reduce_op.h
+++ b/paddle/fluid/operators/reduce_op.h
@@ -14,111 +14,36 @@ limitations under the License. */
 
 #pragma once
 
-#include "glog/logging.h"
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
+#include <algorithm>
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/operators/reduce_op_function.h"
 
 namespace paddle {
 namespace operators {
 
-using Tensor = framework::Tensor;
-using DDim = framework::DDim;
-template <typename T, size_t D, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenScalar = framework::EigenScalar<T, MajorType, IndexType>;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
-
-struct SumFunctor {
-  template <typename DeviceContext, typename X, typename Y, typename Dim>
-  void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
-    y->device(place) = x->sum(dim);
-  }
-};
-
-struct SumGradFunctor {
-  template <typename DeviceContext, typename X, typename Y, typename DX,
-            typename DY, typename Dim>
-  void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy,
-                  const Dim& dim, int size) {
-    dx->device(place) = dy->broadcast(dim);
-  }
-};
-
-struct MeanFunctor {
-  template <typename DeviceContext, typename X, typename Y, typename Dim>
-  void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
-    y->device(place) = x->mean(dim);
-  }
-};
-
-struct MeanGradFunctor {
-  template <typename DeviceContext, typename X, typename Y, typename DX,
-            typename DY, typename Dim>
-  void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy,
-                  const Dim& dim, int size) {
-    dx->device(place) = dy->broadcast(dim) / dx->constant(size);
-  }
-};
-
-struct MaxFunctor {
-  template <typename DeviceContext, typename X, typename Y, typename Dim>
-  void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
-    y->device(place) = x->maximum(dim);
-  }
-};
-
-struct MinFunctor {
-  template <typename DeviceContext, typename X, typename Y, typename Dim>
-  void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
-    y->device(place) = x->minimum(dim);
-  }
-};
-
-struct MaxOrMinGradFunctor {
-  template <typename DeviceContext, typename X, typename Y, typename DX,
-            typename DY, typename Dim>
-  void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy,
-                  const Dim& dim, int size) {
-    auto equals = (*x) == y->broadcast(dim);
-    auto ones = dx->constant(1);
-    auto zeros = dx->constant(0);
-    // If there are multiple minimum or maximum elements, the subgradient of
-    // each is the set [0, 1], and we pass gradient to all of them here.
-    dx->device(place) = dy->broadcast(dim) * equals.select(ones, zeros);
-  }
-};
-
-struct ProdFunctor {
-  template <typename DeviceContext, typename X, typename Y, typename Dim>
-  void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
-    y->device(place) = x->prod(dim);
-  }
-};
-
-struct ProdGradFunctor {
-  template <typename DeviceContext, typename X, typename Y, typename DX,
-            typename DY, typename Dim>
-  void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy,
-                  const Dim& dim, int size) {
-    dx->device(place) = dy->broadcast(dim) * y->broadcast(dim) * x->inverse();
+#define HANDLE_DIM(NDIM, RDIM)                                            \
+  if (ndim == NDIM && rdim == RDIM) {                                     \
+    ReduceFunctor<DeviceContext, T, NDIM, RDIM, Functor>(                 \
+        context.template device_context<DeviceContext>(), *input, output, \
+        dims, keep_dim);                                                  \
   }
-};
 
 template <typename DeviceContext, typename T, typename Functor>
 class ReduceKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     bool reduce_all = context.Attr<bool>("reduce_all");
+    auto* input = context.Input<Tensor>("X");
+    auto* output = context.Output<Tensor>("Out");
+    output->mutable_data<T>(context.GetPlace());
+
+    auto dims = context.Attr<std::vector<int>>("dim");
+    bool keep_dim = context.Attr<bool>("keep_dim");
+
     if (reduce_all) {
       // Flatten and reduce 1-D tensor
-      auto* input = context.Input<Tensor>("X");
-      auto* output = context.Output<Tensor>("Out");
-      output->mutable_data<T>(context.GetPlace());
       auto x = EigenVector<T>::Flatten(*input);
       auto out = EigenScalar<T>::From(*output);
       auto& place =
@@ -127,61 +52,25 @@ class ReduceKernel : public framework::OpKernel<T> {
       Functor functor;
       functor(place, &x, &out, reduce_dim);
     } else {
-      int rank = context.Input<Tensor>("X")->dims().size();
-      switch (rank) {
-        case 1:
-          ReduceCompute<1>(context);
-          break;
-        case 2:
-          ReduceCompute<2>(context);
-          break;
-        case 3:
-          ReduceCompute<3>(context);
-          break;
-        case 4:
-          ReduceCompute<4>(context);
-          break;
-        case 5:
-          ReduceCompute<5>(context);
-          break;
-        case 6:
-          ReduceCompute<6>(context);
-          break;
-      }
-    }
-  }
-
- private:
-  template <size_t D>
-  void ReduceCompute(const framework::ExecutionContext& context) const {
-    auto* input = context.Input<Tensor>("X");
-    auto* output = context.Output<Tensor>("Out");
-    output->mutable_data<T>(context.GetPlace());
-
-    auto x = EigenTensor<T, D>::From(*input);
-    auto x_rank = static_cast<int>(x.dimensions().size());
-    int dim = static_cast<int>(context.Attr<int>("dim"));
-    if (dim < 0) dim = x_rank + dim;
-    auto reduce_dim = Eigen::array<int, 1>({{dim}});
-    // construct the squeezed output tensor
-    bool keep_dim = context.Attr<bool>("keep_dim");
-    DDim dims = output->dims();
-    auto dims_vector = vectorize(dims);
-    if (keep_dim && x_rank > 1) {
-      dims_vector.erase(dims_vector.begin() + dim);
-      dims = framework::make_ddim(dims_vector);
-    }
-
-    auto& place =
-        *context.template device_context<DeviceContext>().eigen_device();
-    Functor functor;
-
-    if (D == 1) {
-      auto out = EigenScalar<T>::From(*output);
-      functor(place, &x, &out, reduce_dim);
-    } else {
-      auto out = EigenTensor<T, (D - 1)>::From(*output, dims);
-      functor(place, &x, &out, reduce_dim);
+      int ndim = input->dims().size();
+      int rdim = dims.size();
+      // comments for accelerating compiling temporarily.
+      //      HANDLE_DIM(6, 5);
+      //      HANDLE_DIM(6, 4);
+      //      HANDLE_DIM(6, 3);
+      //      HANDLE_DIM(6, 2);
+      //      HANDLE_DIM(6, 1);
+      //      HANDLE_DIM(5, 4);
+      //      HANDLE_DIM(5, 3);
+      //      HANDLE_DIM(5, 2);
+      //      HANDLE_DIM(5, 1);
+      HANDLE_DIM(4, 3);
+      HANDLE_DIM(4, 2);
+      HANDLE_DIM(4, 1);
+      HANDLE_DIM(3, 2);
+      HANDLE_DIM(3, 1);
+      HANDLE_DIM(2, 1);
+      HANDLE_DIM(1, 1);
     }
   }
 };
@@ -191,12 +80,15 @@ class ReduceGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     bool reduce_all = context.Attr<bool>("reduce_all");
+    auto dims = context.Attr<std::vector<int>>("dim");
+
+    auto* input0 = context.Input<Tensor>("X");
+    auto* input1 = context.Input<Tensor>("Out");
+    auto* input2 = context.Input<Tensor>(framework::GradVarName("Out"));
+    auto* output = context.Output<Tensor>(framework::GradVarName("X"));
+    output->mutable_data<T>(context.GetPlace());
+
     if (reduce_all) {
-      auto* input0 = context.Input<Tensor>("X");
-      auto* input1 = context.Input<Tensor>("Out");
-      auto* input2 = context.Input<Tensor>(framework::GradVarName("Out"));
-      auto* output = context.Output<Tensor>(framework::GradVarName("X"));
-      output->mutable_data<T>(context.GetPlace());
       auto x = EigenVector<T>::Flatten(*input0);
       auto x_reduce = EigenVector<T>::From(*input1);
       auto x_reduce_grad = EigenVector<T>::From(*input2);
@@ -209,66 +101,172 @@ class ReduceGradKernel : public framework::OpKernel<T> {
       functor(place, &x, &x_reduce, &x_grad, &x_reduce_grad, broadcast_dim,
               broadcast_dim[0]);
     } else {
-      int rank = context.Input<Tensor>("X")->dims().size();
+      int rank = input0->dims().size();
       switch (rank) {
         case 1:
-          ReduceGradCompute<1>(context);
+          ReduceGradFunctor<DeviceContext, T, 1, Functor>(
+              context.template device_context<DeviceContext>(), *input0,
+              *input1, *input2, output, dims);
           break;
         case 2:
-          ReduceGradCompute<2>(context);
+          ReduceGradFunctor<DeviceContext, T, 2, Functor>(
+              context.template device_context<DeviceContext>(), *input0,
+              *input1, *input2, output, dims);
           break;
         case 3:
-          ReduceGradCompute<3>(context);
+          ReduceGradFunctor<DeviceContext, T, 3, Functor>(
+              context.template device_context<DeviceContext>(), *input0,
+              *input1, *input2, output, dims);
           break;
         case 4:
-          ReduceGradCompute<4>(context);
+          ReduceGradFunctor<DeviceContext, T, 4, Functor>(
+              context.template device_context<DeviceContext>(), *input0,
+              *input1, *input2, output, dims);
           break;
         case 5:
-          ReduceGradCompute<5>(context);
+          ReduceGradFunctor<DeviceContext, T, 5, Functor>(
+              context.template device_context<DeviceContext>(), *input0,
+              *input1, *input2, output, dims);
           break;
         case 6:
-          ReduceGradCompute<6>(context);
+          ReduceGradFunctor<DeviceContext, T, 6, Functor>(
+              context.template device_context<DeviceContext>(), *input0,
+              *input1, *input2, output, dims);
           break;
       }
     }
   }
+};
 
- private:
-  template <size_t D>
-  void ReduceGradCompute(const framework::ExecutionContext& context) const {
-    auto* input0 = context.Input<Tensor>("X");
-    auto* input1 = context.Input<Tensor>("Out");
-    auto* input2 = context.Input<Tensor>(framework::GradVarName("Out"));
-    auto* output = context.Output<Tensor>(framework::GradVarName("X"));
+class ReduceOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
 
-    output->mutable_data<T>(context.GetPlace());
-    auto x = EigenTensor<T, D>::From(*input0);
-    auto x_grad = EigenTensor<T, D>::From(*output);
-    auto x_rank = static_cast<int>(x.dimensions().size());
-    int dim = static_cast<int>(context.Attr<int>("dim"));
-    if (dim < 0) dim = x_rank + dim;
-    DDim dims = input0->dims();
-    dims[dim] = 1;
-    auto x_reduce = EigenTensor<T, D>::From(*input1, dims);
-    auto x_reduce_grad = EigenTensor<T, D>::From(*input2, dims);
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of ReduceOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of ReduceOp should not be null.");
+    auto x_dims = ctx->GetInputDim("X");
+    auto x_rank = x_dims.size();
+    PADDLE_ENFORCE_LE(x_rank, 6, "Tensors with rank at most 6 are supported.");
+    auto dims = ctx->Attrs().Get<std::vector<int>>("dim");
+    for (size_t i = 0; i < dims.size(); ++i) {
+      if (dims[i] < 0) dims[i] = x_rank + dims[i];
+      PADDLE_ENFORCE_LT(
+          dims[i], x_rank,
+          "The dim should be in the range [-rank(input), rank(input)).");
+    }
+    sort(dims.begin(), dims.end());
+    bool reduce_all = ctx->Attrs().Get<bool>("reduce_all");
+    bool keep_dim = ctx->Attrs().Get<bool>("keep_dim");
+    if (reduce_all) {
+      if (keep_dim)
+        ctx->SetOutputDim(
+            "Out", framework::make_ddim(std::vector<int64_t>(x_rank, 1)));
+      else
+        ctx->SetOutputDim("Out", {1});
+    } else {
+      auto dims_vector = vectorize(x_dims);
+      if (keep_dim) {
+        for (size_t i = 0; i < dims.size(); ++i) {
+          dims_vector[dims[i]] = 1;
+        }
+      } else {
+        const int kDelFlag = -2;
+        for (size_t i = 0; i < dims.size(); ++i) {
+          dims_vector[dims[i]] = kDelFlag;
+        }
+        dims_vector.erase(
+            remove(dims_vector.begin(), dims_vector.end(), kDelFlag),
+            dims_vector.end());
+      }
+      auto out_dims = framework::make_ddim(dims_vector);
+      ctx->SetOutputDim("Out", out_dims);
+      if (dims[0] != 0) {
+        // Only pass LoD when not reducing on the first dim.
+        ctx->ShareLoD("X", /*->*/ "Out");
+      }
+    }
+  }
+};
 
-    Eigen::array<int, D> broadcast_dim;
-    for (size_t i = 0; i < D; ++i) broadcast_dim[i] = 1;
-    broadcast_dim[dim] = input0->dims()[dim];
-    auto& place =
-        *context.template device_context<DeviceContext>().eigen_device();
-    Functor functor;
-    functor(place, &x, &x_reduce, &x_grad, &x_reduce_grad, broadcast_dim,
-            broadcast_dim[dim]);
+class ReduceGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) should not be null.");
+    auto x_dims = ctx->GetInputDim("X");
+    auto x_rank = x_dims.size();
+    PADDLE_ENFORCE_LE(x_rank, 6, "Tensors with rank at most 6 are supported.");
+    auto dims = ctx->Attrs().Get<std::vector<int>>("dim");
+    for (size_t i = 0; i < dims.size(); ++i) {
+      if (dims[i] < 0) dims[i] = x_rank + dims[i];
+      PADDLE_ENFORCE_LT(
+          dims[i], x_rank,
+          "The dim should be in the range [-rank(input), rank(input)).");
+    }
+    sort(dims.begin(), dims.end());
+    auto x_grad_name = framework::GradVarName("X");
+    if (ctx->HasOutput(x_grad_name)) {
+      ctx->SetOutputDim(x_grad_name, x_dims);
+      ctx->ShareLoD("X", /*->*/ x_grad_name);
+    }
+  }
+};
+
+class ReduceOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() final {
+    AddInput("X",
+             "(Tensor) The input tensor. Tensors with rank at most 6 are "
+             "supported.");
+    AddOutput("Out", "(Tensor) The result tensor.");
+    AddAttr<std::vector<int>>(
+        "dim",
+        "(list<int>, default {0}) The dimensions to reduce. "
+        "Must be in the range [-rank(input), rank(input)). "
+        "If `dim[i] < 0`, the dims[i] to reduce is `rank + dims[i]`. "
+        "Note that reducing on the first dim will make the LoD info lost.")
+        .SetDefault({0});
+    AddAttr<bool>("keep_dim",
+                  "(bool, default false) "
+                  "If true, retain the reduced dimension with length 1.")
+        .SetDefault(false);
+    AddAttr<bool>("reduce_all",
+                  "(bool, default false) "
+                  "If true, output a scalar reduced along all dimensions.")
+        .SetDefault(false);
+    AddComment(string::Sprintf(R"DOC(
+%s Operator.
+
+This operator computes the %s of input tensor along the given dimension.
+The result tensor has 1 fewer dimension than the input unless keep_dim is true.
+If reduce_all is true, just reduce along all dimensions and output a scalar.
+
+)DOC",
+                               GetOpType(), GetName()));
   }
+
+ protected:
+  virtual std::string GetName() const = 0;
+  virtual std::string GetOpType() const = 0;
 };
 
 }  // namespace operators
 }  // namespace paddle
 
-#define FOR_EACH_KERNEL_FUNCTOR(__macro)                \
-  __macro(reduce_sum, SumFunctor, SumGradFunctor);      \
-  __macro(reduce_mean, MeanFunctor, MeanGradFunctor);   \
-  __macro(reduce_max, MaxFunctor, MaxOrMinGradFunctor); \
-  __macro(reduce_min, MinFunctor, MaxOrMinGradFunctor); \
-  __macro(reduce_prod, ProdFunctor, ProdGradFunctor);
+namespace ops = paddle::operators;
+
+#define REGISTER_REDUCE_OP(op_name)                                      \
+  class __##op_name##Maker__ : public ops::ReduceOpMaker {               \
+   protected:                                                            \
+    virtual std::string GetName() const { return #op_name; }             \
+    virtual std::string GetOpType() const { return "Reduce " #op_name; } \
+  };                                                                     \
+  REGISTER_OPERATOR(op_name, ops::ReduceOp, __##op_name##Maker__,        \
+                    paddle::framework::DefaultGradOpDescMaker<true>);    \
+  REGISTER_OPERATOR(op_name##_grad, ops::ReduceGradOp)
diff --git a/paddle/fluid/operators/reduce_op_function.h b/paddle/fluid/operators/reduce_op_function.h
new file mode 100644
index 0000000000000000000000000000000000000000..3da27bc8ac8d448471b9ff3779ac6aca59fac523
--- /dev/null
+++ b/paddle/fluid/operators/reduce_op_function.h
@@ -0,0 +1,109 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <vector>
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using DDim = framework::DDim;
+template <typename T, size_t D, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenScalar = framework::EigenScalar<T, MajorType, IndexType>;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
+
+template <typename DeviceContext, typename T, size_t D, size_t R_D,
+          typename Functor>
+void ReduceFunctor(const DeviceContext& context, const framework::Tensor& input,
+                   framework::Tensor* output, const std::vector<int>& dims,
+                   bool keep_dim) {
+  auto x = EigenTensor<T, D>::From(input);
+  auto x_rank = static_cast<int>(x.dimensions().size());
+  auto reduce_dim = Eigen::array<int, R_D>();
+  std::vector<int> dims_ref = dims;
+  for (size_t i = 0; i < dims_ref.size(); ++i) {
+    if (dims_ref[i] < 0) dims_ref[i] = x_rank + dims_ref[i];
+    reduce_dim[i] = dims_ref[i];
+  }
+  // construct the squeezed output tensor
+  DDim out_dims = output->dims();
+  if (keep_dim && x_rank > 1) {
+    const int kDelFlag = -2;
+    auto dims_vector = framework::vectorize(out_dims);
+    for (size_t i = 0; i < dims_ref.size(); ++i) {
+      dims_vector[dims_ref[i]] = kDelFlag;
+    }
+    dims_vector.erase(remove(dims_vector.begin(), dims_vector.end(), kDelFlag),
+                      dims_vector.end());
+    out_dims = framework::make_ddim(dims_vector);
+  }
+  auto& place = *context.eigen_device();
+  Functor functor;
+
+  if (D == 1) {
+    auto out = EigenScalar<T>::From(*output);
+    functor(place, &x, &out, reduce_dim);
+  } else {
+    auto out = EigenTensor<T, (D - R_D)>::From(*output, out_dims);
+    functor(place, &x, &out, reduce_dim);
+  }
+}
+
+template <typename DeviceContext, typename T, size_t D, typename Functor>
+void ReduceGradFunctor(const DeviceContext& context,
+                       const framework::Tensor& input0,
+                       const framework::Tensor& input1,
+                       const framework::Tensor& input2,
+                       framework::Tensor* output,
+                       const std::vector<int>& dims) {
+  auto x = EigenTensor<T, D>::From(input0);
+  auto x_grad = EigenTensor<T, D>::From(*output);
+  auto x_rank = static_cast<int>(x.dimensions().size());
+  auto x_dims = input0.dims();
+  auto reduced_dims_v = framework::vectorize(x_dims);
+  std::vector<int> dims_ref = dims;
+  Eigen::array<int, D> broadcast_dim;
+  for (size_t i = 0; i < D; ++i) broadcast_dim[i] = 1;
+
+  int broad_cats_times = 1;
+  for (size_t i = 0; i < dims_ref.size(); ++i) {
+    if (dims_ref[i] < 0) {
+      dims_ref[i] = x_rank + dims_ref[i];
+    }
+    reduced_dims_v[dims_ref[i]] = 1;
+    broadcast_dim[dims_ref[i]] = x_dims[dims_ref[i]];
+    broad_cats_times *= x_dims[dims_ref[i]];
+  }
+  auto reduced_dims = framework::make_ddim(reduced_dims_v);
+  auto x_reduce = EigenTensor<T, D>::From(input1, reduced_dims);
+  auto x_reduce_grad = EigenTensor<T, D>::From(input2, reduced_dims);
+
+  auto& place = *context.eigen_device();
+
+  Functor functor;
+  functor(place, &x, &x_reduce, &x_grad, &x_reduce_grad, broadcast_dim,
+          broad_cats_times);
+}
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/reduce_prod_op.cc b/paddle/fluid/operators/reduce_prod_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..713728b99757a6f3bb128f665d5576ac64eef8ec
--- /dev/null
+++ b/paddle/fluid/operators/reduce_prod_op.cc
@@ -0,0 +1,35 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/reduce_prod_op.h"
+
+REGISTER_REDUCE_OP(reduce_prod);
+REGISTER_OP_CPU_KERNEL(reduce_prod,
+                       ops::ReduceKernel<paddle::platform::CPUDeviceContext,
+                                         float, ops::ProdFunctor>,
+                       ops::ReduceKernel<paddle::platform::CPUDeviceContext,
+                                         double, ops::ProdFunctor>,
+                       ops::ReduceKernel<paddle::platform::CPUDeviceContext,
+                                         int, ops::ProdFunctor>,
+                       ops::ReduceKernel<paddle::platform::CPUDeviceContext,
+                                         int64_t, ops::ProdFunctor>);
+REGISTER_OP_CPU_KERNEL(reduce_prod_grad,
+                       ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
+                                             float, ops::ProdGradFunctor>,
+                       ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
+                                             double, ops::ProdGradFunctor>,
+                       ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
+                                             int, ops::ProdGradFunctor>,
+                       ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
+                                             int64_t, ops::ProdGradFunctor>);
diff --git a/paddle/fluid/operators/reduce_prod_op.cu b/paddle/fluid/operators/reduce_prod_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..d62e677d92cffecf629d1684026b0c7bcfec29e3
--- /dev/null
+++ b/paddle/fluid/operators/reduce_prod_op.cu
@@ -0,0 +1,34 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/reduce_prod_op.h"
+
+REGISTER_OP_CUDA_KERNEL(reduce_prod,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          float, ops::ProdFunctor>,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          double, ops::ProdFunctor>,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          int, ops::ProdFunctor>,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          int64_t, ops::ProdFunctor>);
+REGISTER_OP_CUDA_KERNEL(
+    reduce_prod_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
+                                            float, ops::ProdGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
+                          ops::ProdGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
+                          ops::ProdGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t,
+                          ops::ProdGradFunctor>);
diff --git a/paddle/fluid/operators/reduce_prod_op.h b/paddle/fluid/operators/reduce_prod_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..97748113e092719aceed9d806ca6242077111532
--- /dev/null
+++ b/paddle/fluid/operators/reduce_prod_op.h
@@ -0,0 +1,39 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/operators/reduce_op.h"
+
+namespace paddle {
+namespace operators {
+
+struct ProdFunctor {
+  template <typename DeviceContext, typename X, typename Y, typename Dim>
+  void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
+    y->device(place) = x->prod(dim);
+  }
+};
+
+struct ProdGradFunctor {
+  template <typename DeviceContext, typename X, typename Y, typename DX,
+            typename DY, typename Dim>
+  void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy,
+                  const Dim& dim, int size) {
+    dx->device(place) = dy->broadcast(dim) * y->broadcast(dim) * x->inverse();
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/reduce_sum_op.cc b/paddle/fluid/operators/reduce_sum_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c5b5398787b44e658b0f8390162df0e6c3006651
--- /dev/null
+++ b/paddle/fluid/operators/reduce_sum_op.cc
@@ -0,0 +1,34 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/reduce_sum_op.h"
+
+REGISTER_REDUCE_OP(reduce_sum);
+REGISTER_OP_CPU_KERNEL(
+    reduce_sum, ops::ReduceKernel<paddle::platform::CPUDeviceContext, float,
+                                  ops::SumFunctor>,
+    ops::ReduceKernel<paddle::platform::CPUDeviceContext, double,
+                      ops::SumFunctor>,
+    ops::ReduceKernel<paddle::platform::CPUDeviceContext, int, ops::SumFunctor>,
+    ops::ReduceKernel<paddle::platform::CPUDeviceContext, int64_t,
+                      ops::SumFunctor>);
+REGISTER_OP_CPU_KERNEL(reduce_sum_grad,
+                       ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
+                                             float, ops::SumGradFunctor>,
+                       ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
+                                             double, ops::SumGradFunctor>,
+                       ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
+                                             int, ops::SumGradFunctor>,
+                       ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
+                                             int64_t, ops::SumGradFunctor>);
diff --git a/paddle/fluid/operators/reduce_sum_op.cu b/paddle/fluid/operators/reduce_sum_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..f2e16955a50dc6a7feda9fbaf968c929ef3d8a4f
--- /dev/null
+++ b/paddle/fluid/operators/reduce_sum_op.cu
@@ -0,0 +1,34 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/reduce_sum_op.h"
+
+REGISTER_OP_CUDA_KERNEL(reduce_sum,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          float, ops::SumFunctor>,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          double, ops::SumFunctor>,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          int, ops::SumFunctor>,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          int64_t, ops::SumFunctor>);
+REGISTER_OP_CUDA_KERNEL(
+    reduce_sum_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
+                                           float, ops::SumGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
+                          ops::SumGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
+                          ops::SumGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t,
+                          ops::SumGradFunctor>);
diff --git a/paddle/fluid/operators/reduce_sum_op.h b/paddle/fluid/operators/reduce_sum_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..e67d7e1da5f0244d2dee346873692a80cbad2fc4
--- /dev/null
+++ b/paddle/fluid/operators/reduce_sum_op.h
@@ -0,0 +1,39 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/operators/reduce_op.h"
+
+namespace paddle {
+namespace operators {
+
+struct SumFunctor {
+  template <typename DeviceContext, typename X, typename Y, typename Dim>
+  void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
+    y->device(place) = x->sum(dim);
+  }
+};
+
+struct SumGradFunctor {
+  template <typename DeviceContext, typename X, typename Y, typename DX,
+            typename DY, typename Dim>
+  void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy,
+                  const Dim& dim, int size) {
+    dx->device(place) = dy->broadcast(dim);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/reorder_lod_tensor_by_rank_op.cc b/paddle/fluid/operators/reorder_lod_tensor_by_rank_op.cc
index 5c3e1f5678df0270c837ed407d1e6cc662276880..e4f4fe358e0e8cd2080525227f14a3d40f3c1411 100644
--- a/paddle/fluid/operators/reorder_lod_tensor_by_rank_op.cc
+++ b/paddle/fluid/operators/reorder_lod_tensor_by_rank_op.cc
@@ -23,9 +23,7 @@ namespace operators {
 class ReorderLoDTensorByRankTableOpProtoMaker
     : public framework::OpProtoAndCheckerMaker {
  public:
-  ReorderLoDTensorByRankTableOpProtoMaker(OpProto *proto,
-                                          OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X",
              "(LoDTensor), the input lod tensor to be reordered according to "
              "Input(RankTable).");
diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc
index 5e5ccc3ded95d57dfed37c1ac9c7eae61d36b8c0..7f743f577fbcdaf6f62e01031e25ef09a842c2e9 100644
--- a/paddle/fluid/operators/reshape_op.cc
+++ b/paddle/fluid/operators/reshape_op.cc
@@ -22,8 +22,7 @@ namespace operators {
 
 class ReshapeOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  ReshapeOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X", "(Tensor). The input tensor of reshape operator.");
     AddInput("Shape",
              "(Tensor<int32>, optional). If provided, reshape according to "
diff --git a/paddle/fluid/operators/reshape_op.h b/paddle/fluid/operators/reshape_op.h
index ccd7063fe69e0f21b4d2a821bb70902b39c9b9de..3dd8c7c11eca241e747bfa129962032d882ce44c 100644
--- a/paddle/fluid/operators/reshape_op.h
+++ b/paddle/fluid/operators/reshape_op.h
@@ -92,14 +92,16 @@ class ReshapeOp : public framework::OperatorWithKernel {
     }
 
     if (unk_dim_idx != -1) {
-      output_shape[unk_dim_idx] = -in_size / capacity;
-      // in_size < 0 and is un-determinate in compile time, skip the check,
-      // for example, in_dims = [-1, 8, 1, 1], shape = [-1, 3, 8],
-      // capacity = -24, in_size = -8, output_shape[0] = 0
-      // the following check will fail.
       if (in_size > 0) {
+        // in_size < 0 and is un-determinate in compile time, skip the check,
+        // for example, in_dims = [-1, 8, 1, 1], shape = [-1, 3, 8],
+        // capacity = -24, in_size = -8, output_shape[0] = 0
+        // the following check will fail.
+        output_shape[unk_dim_idx] = -in_size / capacity;
         PADDLE_ENFORCE_EQ(output_shape[unk_dim_idx] * capacity, -in_size,
                           "Invalid shape is given.");
+      } else {
+        output_shape[unk_dim_idx] = -1;
       }
     } else {
       PADDLE_ENFORCE_EQ(capacity, in_size, "Invalid shape is given.");
@@ -122,7 +124,10 @@ class ReshapeKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext &ctx) const {
     auto *out = ctx.Output<framework::LoDTensor>("Out");
     auto *in = ctx.Input<framework::LoDTensor>("X");
-    auto *shape_tensor = ctx.Input<framework::LoDTensor>("Shape");
+
+    auto *shape_tensor = ctx.HasInput("Shape")
+                             ? ctx.Input<framework::LoDTensor>("Shape")
+                             : nullptr;
 
     framework::DDim out_dims = out->dims();
 
diff --git a/paddle/fluid/operators/reverse_op.cc b/paddle/fluid/operators/reverse_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a20f7d231fa9ea313581ac0629a87fa5f4a88ce5
--- /dev/null
+++ b/paddle/fluid/operators/reverse_op.cc
@@ -0,0 +1,107 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/reverse_op.h"
+#include <vector>
+
+namespace paddle {
+namespace operators {
+
+class ReverseOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null");
+    const auto& x_dims = ctx->GetInputDim("X");
+    const auto& axis = ctx->Attrs().Get<std::vector<int>>("axis");
+    PADDLE_ENFORCE(!axis.empty(), "'axis' can not be empty.");
+    for (int a : axis) {
+      PADDLE_ENFORCE_LT(a, x_dims.size(),
+                        "The axis must be less than input tensor's rank.");
+    }
+    ctx->SetOutputDim("Out", x_dims);
+  }
+};
+
+class ReverseOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X", "The LoDTensor to be flipped.");
+    AddOutput("Out", "The LoDTensor after flipping.");
+    AddAttr<std::vector<int>>(
+        "axis", "The axises that along which order of elements is reversed.");
+    AddComment(R"DOC(
+      Reverse Operator.
+
+      Reverse the order of elements in the input LoDTensor along given axises.
+
+      Case 1:
+        Given
+            X = [[1, 2, 3, 4, 5]
+                 [6, 7, 8, 9, 10]
+                 [11, 12, 13, 14, 15]],
+        and
+            axis = [0],
+        we get:
+            Out = [[11, 12, 13, 14, 15]
+                   [6, 7, 8, 9, 10]
+                   [1, 2, 3, 4, 5]].
+        
+      Case 2:
+        Given
+            X = [[[1, 2, 3, 4]
+                  [5, 6, 7, 8]]
+                 [[9, 10, 11, 12]
+                  [13, 14, 15, 16]]],
+        and
+            axis = [0, 2],
+        we get:
+            Out = [[[12, 11, 10, 9]
+                    [16, 15, 14, 13]]
+                   [[4, 3, 2, 1]
+                    [8, 7, 6, 5]]],
+    )DOC");
+  }
+};
+
+class ReverseGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto* grad_op = new framework::OpDesc();
+    grad_op->SetType("reverse");
+    grad_op->SetInput("X", OutputGrad("Out"));
+    grad_op->SetOutput("Out", InputGrad("X"));
+    grad_op->SetAttr("axis", GetAttr("axis"));
+    return std::unique_ptr<framework::OpDesc>(grad_op);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(reverse, ops::ReverseOp, ops::ReverseOpMaker,
+                  ops::ReverseGradMaker);
+REGISTER_OPERATOR(reverse_grad, ops::ReverseOp);
+REGISTER_OP_CPU_KERNEL(
+    reverse, ops::ReverseKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::ReverseKernel<paddle::platform::CPUDeviceContext, uint8_t>,
+    ops::ReverseKernel<paddle::platform::CPUDeviceContext, int64_t>,
+    ops::ReverseKernel<paddle::platform::CPUDeviceContext, bool>,
+    ops::ReverseKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::ReverseKernel<paddle::platform::CPUDeviceContext, double>)
diff --git a/paddle/fluid/operators/reverse_op.cu b/paddle/fluid/operators/reverse_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..635c41529b38f2dd287b00ed2e5659e11f619e78
--- /dev/null
+++ b/paddle/fluid/operators/reverse_op.cu
@@ -0,0 +1,24 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/reverse_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    reverse, ops::ReverseKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::ReverseKernel<paddle::platform::CUDADeviceContext, uint8_t>,
+    ops::ReverseKernel<paddle::platform::CUDADeviceContext, int64_t>,
+    ops::ReverseKernel<paddle::platform::CUDADeviceContext, bool>,
+    ops::ReverseKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::ReverseKernel<paddle::platform::CUDADeviceContext, double>)
diff --git a/paddle/fluid/operators/reverse_op.h b/paddle/fluid/operators/reverse_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..9063cd59bba5c6307b55a500455908a5fd278390
--- /dev/null
+++ b/paddle/fluid/operators/reverse_op.h
@@ -0,0 +1,87 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <vector>
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+template <typename DeviceContext, typename T, int Rank>
+struct ReverseFunctor {
+  void operator()(const DeviceContext& context, const framework::LoDTensor& in,
+                  framework::LoDTensor* out, const std::vector<int>& axis) {
+    Eigen::array<bool, Rank> reverse_axis;
+    for (int i = 0; i < Rank; ++i) {
+      reverse_axis[i] = false;
+    }
+    for (int a : axis) {
+      reverse_axis[a] = true;
+    }
+
+    auto in_eigen = framework::EigenTensor<T, Rank>::From(in);
+    auto out_eigen = framework::EigenTensor<T, Rank>::From(*out);
+    auto* dev = context.eigen_device();
+
+    out_eigen.device(*dev) = in_eigen.reverse(reverse_axis);
+  }
+};
+
+template <typename DeviceContext, typename T>
+class ReverseKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* x = context.Input<framework::LoDTensor>("X");
+    auto* out = context.Output<framework::LoDTensor>("Out");
+    out->mutable_data<T>(context.GetPlace());
+    const auto& axis = context.Attr<std::vector<int>>("axis");
+    int rank = x->dims().size();
+    auto& dev_ctx = context.template device_context<DeviceContext>();
+
+    switch (rank) {
+      case 1:
+        ReverseFunctor<DeviceContext, T, 1> functor1;
+        functor1(dev_ctx, *x, out, axis);
+        break;
+      case 2:
+        ReverseFunctor<DeviceContext, T, 2> functor2;
+        functor2(dev_ctx, *x, out, axis);
+        break;
+      case 3:
+        ReverseFunctor<DeviceContext, T, 3> functor3;
+        functor3(dev_ctx, *x, out, axis);
+        break;
+      case 4:
+        ReverseFunctor<DeviceContext, T, 4> functor4;
+        functor4(dev_ctx, *x, out, axis);
+        break;
+      case 5:
+        ReverseFunctor<DeviceContext, T, 5> functor5;
+        functor5(dev_ctx, *x, out, axis);
+        break;
+      case 6:
+        ReverseFunctor<DeviceContext, T, 6> functor6;
+        functor6(dev_ctx, *x, out, axis);
+        break;
+      default:
+        PADDLE_THROW(
+            "Reserve operator doesn't supports tensors whose ranks are greater "
+            "than 6.");
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/rmsprop_op.cc b/paddle/fluid/operators/rmsprop_op.cc
index a8855b3ccd1686c75953e762ce42cc27b26202e6..919ebe48ca38040274bd2052b95ef96eccff4db6 100644
--- a/paddle/fluid/operators/rmsprop_op.cc
+++ b/paddle/fluid/operators/rmsprop_op.cc
@@ -63,8 +63,7 @@ class RmspropOp : public framework::OperatorWithKernel {
 
 class RmspropOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  RmspropOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("Param",
              "(Tensor, default Tensor<float>) "
              "Input parameter value that has to be updated.");
diff --git a/paddle/fluid/operators/rnn_memory_helper_op.cc b/paddle/fluid/operators/rnn_memory_helper_op.cc
index 70f205d887ef710aeed02905713200ce32988987..23e5fc1112d0b1e634d0ab288721cbba57b3ffe5 100644
--- a/paddle/fluid/operators/rnn_memory_helper_op.cc
+++ b/paddle/fluid/operators/rnn_memory_helper_op.cc
@@ -59,8 +59,7 @@ class RNNMemoryHelperOpShapeInference : public framework::InferShapeBase {
 
 class RNNMemoryHelperOpInfoMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  RNNMemoryHelperOpInfoMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X", "");
     AddOutput("Out", "");
     AddAttr<int>("dtype",
@@ -117,8 +116,7 @@ class RNNMemoryHelperGradOp : public framework::OperatorBase {
 class RNNMemoryHelperGradOpInfoMaker
     : public framework::OpProtoAndCheckerMaker {
  public:
-  RNNMemoryHelperGradOpInfoMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput(framework::GradVarName("Out"), "");
     AddInput("X", "");
     AddInput("Out", "");
diff --git a/paddle/fluid/operators/roi_pool_op.cc b/paddle/fluid/operators/roi_pool_op.cc
index 397e49ef20ac45515a852f466d693f358ef5461b..293abb0ea4f1ac03c3889ce2937ef8fa0845db73 100644
--- a/paddle/fluid/operators/roi_pool_op.cc
+++ b/paddle/fluid/operators/roi_pool_op.cc
@@ -98,8 +98,7 @@ class ROIPoolGradOp : public framework::OperatorWithKernel {
 
 class ROIPoolOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  ROIPoolOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X",
              "(Tensor), "
              "the input of ROIPoolOp. "
diff --git a/paddle/fluid/operators/roi_pool_op.cu b/paddle/fluid/operators/roi_pool_op.cu
index f905d690f984a20622c5fbcbcc813d888dfb19d9..50450b62f7b1c0b2b5abf01a43581a0e2d2cd01e 100644
--- a/paddle/fluid/operators/roi_pool_op.cu
+++ b/paddle/fluid/operators/roi_pool_op.cu
@@ -38,10 +38,10 @@ __global__ void GPUROIPoolForward(
   int index = blockIdx.x * blockDim.x + threadIdx.x;
   int offset = blockDim.x * gridDim.x;
   for (size_t i = index; i < nthreads; i += offset) {
-    int pw = index % pooled_width;
-    int ph = (index / pooled_width) % pooled_height;
-    int c = (index / pooled_width / pooled_height) % channels;
-    int n = index / pooled_width / pooled_height / channels;
+    int pw = i % pooled_width;
+    int ph = (i / pooled_width) % pooled_height;
+    int c = (i / pooled_width / pooled_height) % channels;
+    int n = i / pooled_width / pooled_height / channels;
 
     const int64_t* offset_input_rois = input_rois + n * kROISize;
     int roi_batch_ind = roi_batch_id_data[n];
@@ -52,14 +52,19 @@ __global__ void GPUROIPoolForward(
 
     int roi_width = max(roi_end_w - roi_start_w + 1, 1);
     int roi_height = max(roi_end_h - roi_start_h + 1, 1);
-    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
-    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
-
-    int hstart = static_cast<int>(floor(static_cast<T>(ph) * bin_size_h));
-    int wstart = static_cast<int>(floor(static_cast<T>(pw) * bin_size_w));
-    int hend = static_cast<int>(ceil(static_cast<T>(ph + 1) * bin_size_h));
-    int wend = static_cast<int>(ceil(static_cast<T>(pw + 1) * bin_size_w));
 
+    int hstart = static_cast<int>(floor(static_cast<double>(ph) *
+                                        static_cast<double>(roi_height) /
+                                        static_cast<double>(pooled_height)));
+    int wstart = static_cast<int>(floor(static_cast<double>(pw) *
+                                        static_cast<double>(roi_width) /
+                                        static_cast<double>(pooled_width)));
+    int hend = static_cast<int>(ceil(static_cast<double>(ph + 1) *
+                                     static_cast<double>(roi_height) /
+                                     static_cast<double>(pooled_height)));
+    int wend = static_cast<int>(ceil(static_cast<double>(pw + 1) *
+                                     static_cast<double>(roi_width) /
+                                     static_cast<double>(pooled_width)));
     hstart = min(max(hstart + roi_start_h, 0), height);
     hend = min(max(hend + roi_start_h, 0), height);
     wstart = min(max(wstart + roi_start_w, 0), width);
@@ -79,9 +84,9 @@ __global__ void GPUROIPoolForward(
         }
       }
     }
-    output_data[index] = maxval;
+    output_data[i] = maxval;
     if (argmax_data) {
-      argmax_data[index] = maxidx;
+      argmax_data[i] = maxidx;
     }
   }
 }
@@ -96,10 +101,10 @@ __global__ void GPUROIPoolBackward(
   int index = blockIdx.x * blockDim.x + threadIdx.x;
   int offset = blockDim.x * gridDim.x;
   for (int i = index; i < nthreads; i += offset) {
-    int pw = index % pooled_width;
-    int ph = (index / pooled_width) % pooled_height;
-    int c = (index / pooled_width / pooled_height) % channels;
-    int n = index / pooled_width / pooled_height / channels;
+    int pw = i % pooled_width;
+    int ph = (i / pooled_width) % pooled_height;
+    int c = (i / pooled_width / pooled_height) % channels;
+    int n = i / pooled_width / pooled_height / channels;
 
     int roi_batch_ind = roi_batch_id_data[n];
     int input_offset = (roi_batch_ind * channels + c) * height * width;
@@ -138,6 +143,7 @@ class GPUROIPoolOpKernel : public framework::OpKernel<T> {
     int width = in_dims[3];
 
     int rois_num = rois->dims()[0];
+
     if (rois_num == 0) return;
 
     int output_size = out->numel();
diff --git a/paddle/fluid/operators/row_conv_op.cc b/paddle/fluid/operators/row_conv_op.cc
index 23f720da0b68cd2fd4c9b51182bf82f72078a906..20f140f962c3aac364a1239a663d5f340bbeb6b2 100644
--- a/paddle/fluid/operators/row_conv_op.cc
+++ b/paddle/fluid/operators/row_conv_op.cc
@@ -76,8 +76,7 @@ class RowConvGradOp : public framework::OperatorWithKernel {
 
 class RowConvOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  RowConvOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X",
              "(LoDTensor), the input(X) is a LodTensor, which supports "
              "variable time-length input sequences. The underlying tensor "
diff --git a/paddle/fluid/operators/save_combine_op.cc b/paddle/fluid/operators/save_combine_op.cc
index 94703393bfa53124d16e34ae4373773eece5f11f..cfee9207083b46f7c27354f22e82a7d3c38a027c 100644
--- a/paddle/fluid/operators/save_combine_op.cc
+++ b/paddle/fluid/operators/save_combine_op.cc
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <numeric>
 #include <sstream>
 #include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/data_type_transform.h"
 #include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
@@ -69,6 +70,7 @@ class SaveCombineOp : public framework::OperatorBase {
                const platform::Place &place) const override {
     auto filename = Attr<std::string>("file_path");
     auto overwrite = Attr<bool>("overwrite");
+    auto save_as_fp16 = Attr<bool>("save_as_fp16");
 
     bool is_present = FileExists(filename);
     if (is_present && !overwrite) {
@@ -100,8 +102,24 @@ class SaveCombineOp : public framework::OperatorBase {
                      inp_var_names[i]);
 
       auto &tensor = var->Get<framework::LoDTensor>();
-      // Serialize tensor
-      framework::SerializeToStream(fout, tensor, dev_ctx);
+      // Serialize tensors one by one
+
+      // Check types to see if a fp16 transformation is required
+      auto in_dtype = framework::ToDataType(tensor.type());
+      auto out_dtype =
+          save_as_fp16 ? framework::proto::VarType::FP16 : in_dtype;
+
+      if (in_dtype != out_dtype) {
+        auto in_kernel_type = framework::OpKernelType(in_dtype, place);
+        auto out_kernel_type = framework::OpKernelType(out_dtype, place);
+        framework::LoDTensor out;
+        // copy LoD info to the new tensor
+        out.set_lod(tensor.lod());
+        framework::TransDataType(in_kernel_type, out_kernel_type, tensor, &out);
+        framework::SerializeToStream(fout, out, dev_ctx);
+      } else {
+        framework::SerializeToStream(fout, tensor, dev_ctx);
+      }
     }
     fout.close();
   }
@@ -109,8 +127,7 @@ class SaveCombineOp : public framework::OperatorBase {
 
 class SaveCombineOpProtoMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  SaveCombineOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput(
         "X",
         "(vector) Input LoDTensors that need to be saved together in a file.")
@@ -125,6 +142,12 @@ to a file on disk.
                   "(boolean, default true)"
                   "Overwrite the output file if it exists.")
         .SetDefault(true);
+    AddAttr<bool>("save_as_fp16",
+                  "(boolean, default false)"
+                  "If true, the tensor will be converted to float16 data "
+                  "type and then saved. Otherwise, the tensor will be "
+                  "directly saved without data type conversion.")
+        .SetDefault(false);
     AddAttr<std::string>(
         "file_path",
         "(string)"
diff --git a/paddle/fluid/operators/save_load_combine_op_test.cc b/paddle/fluid/operators/save_load_combine_op_test.cc
index 2773c32a0a10269e28c24e12527711e3c5b8f869..4743e0d9499b111d8baa921dbb245431713fd7a8 100644
--- a/paddle/fluid/operators/save_load_combine_op_test.cc
+++ b/paddle/fluid/operators/save_load_combine_op_test.cc
@@ -17,15 +17,17 @@ limitations under the License. */
 #include <vector>
 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/platform/float16.h"
 
 USE_NO_KERNEL_OP(save_combine);
 USE_NO_KERNEL_OP(load_combine);
 
-int* CreateForSaveCombineOp(int x, int y, const std::vector<int>& lod_info,
-                            std::string var_name,
-                            const paddle::platform::CPUPlace& place,
-                            paddle::framework::Scope* scope,
-                            paddle::framework::LoD* expect_lod) {
+template <typename T, typename U>
+T* CreateForSaveCombineOp(int x, int y, const std::vector<int>& lod_info,
+                          std::string var_name,
+                          const paddle::platform::CPUPlace& place,
+                          paddle::framework::Scope* scope,
+                          paddle::framework::LoD* expect_lod) {
   auto var = scope->Var(var_name);
   auto tensor = var->GetMutable<paddle::framework::LoDTensor>();
   tensor->Resize({x, y});
@@ -34,9 +36,10 @@ int* CreateForSaveCombineOp(int x, int y, const std::vector<int>& lod_info,
     (*expect_lod)[0].push_back(lod_info[i]);
   }
   tensor->set_lod(*expect_lod);
-  int* expect = tensor->mutable_data<int>(place);
+  T* expect = tensor->mutable_data<T>(place);
   for (int64_t i = 0; i < tensor->numel(); ++i) {
-    expect[i] = static_cast<int>(i);
+    expect[i] = static_cast<T>(
+        static_cast<U>(i));  // For FP16, we intend to do float(float16(i))
   }
   return expect;
 }
@@ -48,18 +51,20 @@ paddle::framework::LoDTensor* GeneratePlaceholderBeforeLoad(
   return target;
 }
 
-int* GetValuesAfterLoadCombineOp(paddle::framework::LoDTensor* target,
-                                 const paddle::framework::Scope& scope,
-                                 paddle::framework::LoD* actual_lod) {
-  int* actual = target->data<int>();
+template <typename T>
+T* GetValuesAfterLoadCombineOp(paddle::framework::LoDTensor* target,
+                               const paddle::framework::Scope& scope,
+                               paddle::framework::LoD* actual_lod) {
+  T* actual = target->data<T>();
   *actual_lod = target->lod();
   return actual;
 }
 
-void CheckValues(int* expect, int* actual, paddle::framework::LoD expect_lod,
-                 paddle::framework::LoD actual_lod, const int& numel) {
-  for (int64_t i = 0; i < numel; ++i) {
-    EXPECT_EQ(expect[i], actual[i]);
+template <typename T, typename U>
+void CheckValues(T* expect, U* actual, const paddle::framework::LoD& expect_lod,
+                 const paddle::framework::LoD& actual_lod, const int& numel) {
+  for (int i = 0; i < numel; ++i) {
+    EXPECT_EQ(expect[i], static_cast<T>(actual[i]));
   }
   EXPECT_EQ(expect_lod.size(), actual_lod.size());
   for (size_t i = 0; i < expect_lod.size(); ++i) {
@@ -78,26 +83,26 @@ TEST(SaveLoadCombineOp, CPU) {
   std::vector<int> lod1 = {0, 1, 2, 3, 10};
   int numel1 = 100;
   paddle::framework::LoD expect_lod1;
-  int* expect1 = CreateForSaveCombineOp(10, 10, lod1, "test_var1", place,
-                                        &scope, &expect_lod1);
+  int* expect1 = CreateForSaveCombineOp<int, int>(10, 10, lod1, "test_var1",
+                                                  place, &scope, &expect_lod1);
 
   std::vector<int> lod2 = {0, 2, 5, 10};
   int numel2 = 200;
   paddle::framework::LoD expect_lod2;
-  int* expect2 = CreateForSaveCombineOp(10, 20, lod2, "test_var2", place,
-                                        &scope, &expect_lod2);
+  int* expect2 = CreateForSaveCombineOp<int, int>(10, 20, lod2, "test_var2",
+                                                  place, &scope, &expect_lod2);
 
   std::vector<int> lod3 = {0, 2, 3, 20};
   int numel3 = 4000;
   paddle::framework::LoD expect_lod3;
-  int* expect3 = CreateForSaveCombineOp(20, 200, lod3, "test_var3", place,
-                                        &scope, &expect_lod3);
+  int* expect3 = CreateForSaveCombineOp<int, int>(20, 200, lod3, "test_var3",
+                                                  place, &scope, &expect_lod3);
 
   std::vector<int> lod4 = {0, 1, 20};
   int numel4 = 1000;
   paddle::framework::LoD expect_lod4;
-  int* expect4 = CreateForSaveCombineOp(20, 50, lod4, "test_var4", place,
-                                        &scope, &expect_lod4);
+  int* expect4 = CreateForSaveCombineOp<int, int>(20, 50, lod4, "test_var4",
+                                                  place, &scope, &expect_lod4);
 
   // Set attributes
   std::string filename = "check_tensor.ls";
@@ -123,15 +128,176 @@ TEST(SaveLoadCombineOp, CPU) {
   load_combine_op->Run(scope, place);
 
   paddle::framework::LoD actual_lod1, actual_lod2, actual_lod3, actual_lod4;
-  int* actual1 = GetValuesAfterLoadCombineOp(target1, scope, &actual_lod1);
-  int* actual2 = GetValuesAfterLoadCombineOp(target2, scope, &actual_lod2);
-  int* actual3 = GetValuesAfterLoadCombineOp(target3, scope, &actual_lod3);
-  int* actual4 = GetValuesAfterLoadCombineOp(target4, scope, &actual_lod4);
-
-  CheckValues(expect1, actual1, expect_lod1, actual_lod1, numel1);
-  CheckValues(expect2, actual2, expect_lod2, actual_lod2, numel2);
-  CheckValues(expect3, actual3, expect_lod3, actual_lod3, numel3);
-  CheckValues(expect4, actual4, expect_lod4, actual_lod4, numel4);
+  int* actual1 = GetValuesAfterLoadCombineOp<int>(target1, scope, &actual_lod1);
+  int* actual2 = GetValuesAfterLoadCombineOp<int>(target2, scope, &actual_lod2);
+  int* actual3 = GetValuesAfterLoadCombineOp<int>(target3, scope, &actual_lod3);
+  int* actual4 = GetValuesAfterLoadCombineOp<int>(target4, scope, &actual_lod4);
+
+  CheckValues<int, int>(expect1, actual1, expect_lod1, actual_lod1, numel1);
+  CheckValues<int, int>(expect2, actual2, expect_lod2, actual_lod2, numel2);
+  CheckValues<int, int>(expect3, actual3, expect_lod3, actual_lod3, numel3);
+  CheckValues<int, int>(expect4, actual4, expect_lod4, actual_lod4, numel4);
+}
+
+// FP16 version of SaveLoadCombineOp Test, only altering the saving aspect
+// to save as FP16.
+TEST(SaveCombineFP16Op, CPU) {
+  paddle::framework::Scope scope;
+  paddle::platform::CPUPlace place;
+
+  std::vector<int> lod1 = {0, 1, 2, 3, 10};
+  int numel1 = 100;
+  paddle::framework::LoD expect_lod1;
+  float* expect1 = CreateForSaveCombineOp<float, paddle::platform::float16>(
+      10, 10, lod1, "test_var1", place, &scope, &expect_lod1);
+
+  std::vector<int> lod2 = {0, 2, 5, 10};
+  int numel2 = 200;
+  paddle::framework::LoD expect_lod2;
+  float* expect2 = CreateForSaveCombineOp<float, paddle::platform::float16>(
+      10, 20, lod2, "test_var2", place, &scope, &expect_lod2);
+
+  std::vector<int> lod3 = {0, 20};
+  int numel3 = 4000;
+  paddle::framework::LoD expect_lod3;
+  float* expect3 = CreateForSaveCombineOp<float, paddle::platform::float16>(
+      20, 200, lod3, "test_var3", place, &scope, &expect_lod3);
+
+  std::vector<int> lod4 = {0, 1, 20};
+  int numel4 = 1000;
+  paddle::framework::LoD expect_lod4;
+  float* expect4 = CreateForSaveCombineOp<float, paddle::platform::float16>(
+      20, 50, lod4, "test_var4", place, &scope, &expect_lod4);
+
+  // Set attributes
+  std::string filename = "check_tensor_fp16_save.ls";
+  paddle::framework::AttributeMap attrs;
+  attrs.insert({"file_path", std::string(filename)});
+  attrs.insert({"save_as_fp16", true});
+
+  // Run the save_combine_op
+  auto save_combine_op = paddle::framework::OpRegistry::CreateOp(
+      "save_combine",
+      {{"X", {"test_var1", "test_var2", "test_var3", "test_var4"}}}, {}, attrs);
+  save_combine_op->Run(scope, place);
+
+  // Set up output vars
+  auto target1 = GeneratePlaceholderBeforeLoad("out_var1", &scope);
+  auto target2 = GeneratePlaceholderBeforeLoad("out_var2", &scope);
+  auto target3 = GeneratePlaceholderBeforeLoad("out_var3", &scope);
+  auto target4 = GeneratePlaceholderBeforeLoad("out_var4", &scope);
+
+  // Run the load_combine_op
+  auto load_combine_op = paddle::framework::OpRegistry::CreateOp(
+      "load_combine", {},
+      {{"Out", {"out_var1", "out_var2", "out_var3", "out_var4"}}}, attrs);
+  load_combine_op->Run(scope, place);
+
+  paddle::framework::LoD actual_lod1, actual_lod2, actual_lod3, actual_lod4;
+  paddle::platform::float16* actual1 =
+      GetValuesAfterLoadCombineOp<paddle::platform::float16>(target1, scope,
+                                                             &actual_lod1);
+  paddle::platform::float16* actual2 =
+      GetValuesAfterLoadCombineOp<paddle::platform::float16>(target2, scope,
+                                                             &actual_lod2);
+  paddle::platform::float16* actual3 =
+      GetValuesAfterLoadCombineOp<paddle::platform::float16>(target3, scope,
+                                                             &actual_lod3);
+  paddle::platform::float16* actual4 =
+      GetValuesAfterLoadCombineOp<paddle::platform::float16>(target4, scope,
+                                                             &actual_lod4);
+
+  CheckValues<float, paddle::platform::float16>(expect1, actual1, expect_lod1,
+                                                actual_lod1, numel1);
+  CheckValues<float, paddle::platform::float16>(expect2, actual2, expect_lod2,
+                                                actual_lod2, numel2);
+  CheckValues<float, paddle::platform::float16>(expect3, actual3, expect_lod3,
+                                                actual_lod3, numel3);
+  CheckValues<float, paddle::platform::float16>(expect4, actual4, expect_lod4,
+                                                actual_lod4, numel4);
+}
+
+// FP16 version of SaveLoadCombineOp Test, only altering the loading aspect
+// to load tensors with FP16 precision.
+TEST(LoadCombineFP16Op, CPU) {
+  paddle::framework::Scope scope;
+  paddle::platform::CPUPlace place;
+
+  std::vector<int> lod1 = {0, 1, 2, 3, 10};
+  int numel1 = 100;
+  paddle::framework::LoD expect_lod1;
+  float* expect1 = CreateForSaveCombineOp<float, paddle::platform::float16>(
+      10, 10, lod1, "test_var1", place, &scope, &expect_lod1);
+
+  std::vector<int> lod2 = {0, 2, 5, 10};
+  int numel2 = 200;
+  paddle::framework::LoD expect_lod2;
+  float* expect2 = CreateForSaveCombineOp<float, paddle::platform::float16>(
+      10, 20, lod2, "test_var2", place, &scope, &expect_lod2);
+
+  std::vector<int> lod3 = {0, 20};
+  int numel3 = 4000;
+  paddle::framework::LoD expect_lod3;
+  float* expect3 = CreateForSaveCombineOp<float, paddle::platform::float16>(
+      20, 200, lod3, "test_var3", place, &scope, &expect_lod3);
+
+  std::vector<int> lod4 = {0, 1, 20};
+  int numel4 = 1000;
+  paddle::framework::LoD expect_lod4;
+  float* expect4 = CreateForSaveCombineOp<float, paddle::platform::float16>(
+      20, 50, lod4, "test_var4", place, &scope, &expect_lod4);
+
+  // Set attributes
+  std::string filename = "check_tensor_fp16_load.ls";
+  paddle::framework::AttributeMap attrs;
+  attrs.insert({"file_path", std::string(filename)});
+
+  // Run the save_combine_op
+  auto save_combine_op = paddle::framework::OpRegistry::CreateOp(
+      "save_combine",
+      {{"X", {"test_var1", "test_var2", "test_var3", "test_var4"}}}, {}, attrs);
+  save_combine_op->Run(scope, place);
+
+  // Set up output vars
+  auto load_var1 = scope.Var("out_var1");
+  auto load_var2 = scope.Var("out_var2");
+  auto load_var3 = scope.Var("out_var3");
+  auto load_var4 = scope.Var("out_var4");
+
+  attrs.insert({"load_as_fp16", true});
+  // Run the load_combine_op
+  auto load_combine_op = paddle::framework::OpRegistry::CreateOp(
+      "load_combine", {},
+      {{"Out", {"out_var1", "out_var2", "out_var3", "out_var4"}}}, attrs);
+  load_combine_op->Run(scope, place);
+
+  auto* target1 = load_var1->GetMutable<paddle::framework::LoDTensor>();
+  auto* target2 = load_var2->GetMutable<paddle::framework::LoDTensor>();
+  auto* target3 = load_var3->GetMutable<paddle::framework::LoDTensor>();
+  auto* target4 = load_var4->GetMutable<paddle::framework::LoDTensor>();
+
+  paddle::framework::LoD actual_lod1, actual_lod2, actual_lod3, actual_lod4;
+  paddle::platform::float16* actual1 =
+      GetValuesAfterLoadCombineOp<paddle::platform::float16>(target1, scope,
+                                                             &actual_lod1);
+  paddle::platform::float16* actual2 =
+      GetValuesAfterLoadCombineOp<paddle::platform::float16>(target2, scope,
+                                                             &actual_lod2);
+  paddle::platform::float16* actual3 =
+      GetValuesAfterLoadCombineOp<paddle::platform::float16>(target3, scope,
+                                                             &actual_lod3);
+  paddle::platform::float16* actual4 =
+      GetValuesAfterLoadCombineOp<paddle::platform::float16>(target4, scope,
+                                                             &actual_lod4);
+
+  CheckValues<float, paddle::platform::float16>(expect1, actual1, expect_lod1,
+                                                actual_lod1, numel1);
+  CheckValues<float, paddle::platform::float16>(expect2, actual2, expect_lod2,
+                                                actual_lod2, numel2);
+  CheckValues<float, paddle::platform::float16>(expect3, actual3, expect_lod3,
+                                                actual_lod3, numel3);
+  CheckValues<float, paddle::platform::float16>(expect4, actual4, expect_lod4,
+                                                actual_lod4, numel4);
 }
 
 // Test with original SaveLoadTest
@@ -141,7 +307,7 @@ TEST(SaveLoadTestWithCombineOp, CPU) {
 
   auto var = scope.Var("test_var");
   auto tensor = var->GetMutable<paddle::framework::LoDTensor>();
-  tensor->Resize({3, 10});
+  tensor->Resize({3, 4000});
   paddle::framework::LoD expect_lod;
   expect_lod.resize(1);
   expect_lod[0].push_back(0);
diff --git a/paddle/fluid/operators/save_load_op_test.cc b/paddle/fluid/operators/save_load_op_test.cc
index 74385ee47543e3f4887081c2225212996d3df3f1..c4fcc61af4b75e6dc7d5c31e20c5fff358637af5 100644
--- a/paddle/fluid/operators/save_load_op_test.cc
+++ b/paddle/fluid/operators/save_load_op_test.cc
@@ -63,14 +63,21 @@ TEST(SaveLoadOp, CPU) {
   }
 }
 
-TEST(SaveLoadFP16Op, CPU) {
+TEST(SaveFP16Op, CPU) {
   paddle::framework::Scope scope;
   paddle::platform::CPUPlace place;
 
   auto var = scope.Var("test_var");
   auto tensor = var->GetMutable<paddle::framework::LoDTensor>();
   tensor->Resize({3, 10});
+  paddle::framework::LoD expect_lod;
+  expect_lod.resize(1);
+  expect_lod[0].push_back(0);
+  expect_lod[0].push_back(1);
+  expect_lod[0].push_back(2);
+  expect_lod[0].push_back(3);
 
+  tensor->set_lod(expect_lod);
   float* expect = tensor->mutable_data<float>(place);
   for (int64_t i = 0; i < tensor->numel(); ++i) {
     expect[i] = static_cast<float>(paddle::platform::float16(i));
@@ -93,4 +100,60 @@ TEST(SaveLoadFP16Op, CPU) {
   for (int64_t i = 0; i < tensor->numel(); ++i) {
     EXPECT_EQ(expect[i], static_cast<float>(actual[i]));
   }
+  auto& actual_lod = target->lod();
+  EXPECT_EQ(expect_lod.size(), actual_lod.size());
+  for (size_t i = 0; i < expect_lod.size(); ++i) {
+    for (size_t j = 0; j < expect_lod[i].size(); ++j) {
+      EXPECT_EQ(expect_lod[i][j], actual_lod[i][j]);
+    }
+  }
+}
+
+TEST(LoadFP16Op, CPU) {
+  paddle::framework::Scope scope;
+  paddle::platform::CPUPlace place;
+
+  auto var = scope.Var("test_var");
+  auto tensor = var->GetMutable<paddle::framework::LoDTensor>();
+  tensor->Resize({3, 10});
+
+  paddle::framework::LoD expect_lod;
+  expect_lod.resize(1);
+  expect_lod[0].push_back(0);
+  expect_lod[0].push_back(1);
+  expect_lod[0].push_back(2);
+  expect_lod[0].push_back(3);
+
+  tensor->set_lod(expect_lod);
+  float* expect = tensor->mutable_data<float>(place);
+  for (int64_t i = 0; i < tensor->numel(); ++i) {
+    expect[i] = static_cast<float>(paddle::platform::float16(i));
+  }
+
+  paddle::framework::AttributeMap attrs;
+  attrs.insert({"file_path", std::string("tensor.save")});
+  attrs.insert({"load_as_fp16", true});
+
+  auto save_op = paddle::framework::OpRegistry::CreateOp(
+      "save", {{"X", {"test_var"}}}, {}, attrs);
+  save_op->Run(scope, place);
+
+  auto load_var = scope.Var("out_var");
+  auto load_op = paddle::framework::OpRegistry::CreateOp(
+      "load", {}, {{"Out", {"out_var"}}}, attrs);
+  load_op->Run(scope, place);
+
+  auto target = load_var->Get<paddle::framework::LoDTensor>();
+  paddle::platform::float16* actual = target.data<paddle::platform::float16>();
+  for (int64_t i = 0; i < tensor->numel(); ++i) {
+    EXPECT_EQ(expect[i], static_cast<float>(actual[i]));
+  }
+
+  auto& actual_lod = target.lod();
+  EXPECT_EQ(expect_lod.size(), actual_lod.size());
+  for (size_t i = 0; i < expect_lod.size(); ++i) {
+    for (size_t j = 0; j < expect_lod[i].size(); ++j) {
+      EXPECT_EQ(expect_lod[i][j], actual_lod[i][j]);
+    }
+  }
 }
diff --git a/paddle/fluid/operators/save_op.cc b/paddle/fluid/operators/save_op.cc
index dcc1b9ec204e9e273b8fd2b12f2423fc989ba502..e6d27e2dedd7668b93bd8ddc330a897d1c6fa732 100644
--- a/paddle/fluid/operators/save_op.cc
+++ b/paddle/fluid/operators/save_op.cc
@@ -117,8 +117,7 @@ class SaveOp : public framework::OperatorBase {
 
 class SaveOpProtoMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  SaveOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X", "(Tensor ) Input tensor to be saved");
     AddComment(R"DOC(
 Save operator
diff --git a/paddle/fluid/operators/scale_op.cc b/paddle/fluid/operators/scale_op.cc
index 7dcf33c989c3bcd905da8017ee36ec8ce8032911..4687e21e7155fc7309fb28c881c0d47152df9ad5 100644
--- a/paddle/fluid/operators/scale_op.cc
+++ b/paddle/fluid/operators/scale_op.cc
@@ -37,8 +37,7 @@ class ScaleOp : public framework::OperatorWithKernel {
 
 class ScaleOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  ScaleOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X", "(Tensor) Input tensor of scale operator.");
     AddOutput("Out", "(Tensor) Output tensor of scale operator.");
     AddComment(R"DOC(
diff --git a/paddle/fluid/operators/scatter_op.cc b/paddle/fluid/operators/scatter_op.cc
index 95b12455ea4996f00bab8a353ccd425b2c37aed1..bf5e0d864495ce3a651a31c9d5a7664fe9eb2396 100644
--- a/paddle/fluid/operators/scatter_op.cc
+++ b/paddle/fluid/operators/scatter_op.cc
@@ -78,8 +78,7 @@ class ScatterGradOp : public framework::OperatorWithKernel {
 
 class ScatterOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  ScatterOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X", "The source input of scatter op");
     AddInput("Ids", "The index input of scatter op where X will be updated");
     AddInput("Updates", "The updated value of updates op");
diff --git a/paddle/fluid/operators/select_op.cc b/paddle/fluid/operators/select_op.cc
index 876d8acf0d880a7ef806514014d297f98e04c53d..e71841d4d1815d50cd9800910c9db34e121beffc 100644
--- a/paddle/fluid/operators/select_op.cc
+++ b/paddle/fluid/operators/select_op.cc
@@ -380,8 +380,7 @@ class SelectOp : public framework::OperatorBase {
 
 class SelectOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  SelectOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput(kX,
              "A set of variables, which are required by operators inside the "
              "cases of Select Op")
diff --git a/paddle/fluid/operators/send_barrier_op.cc b/paddle/fluid/operators/send_barrier_op.cc
index 12b844daaa33162b86b7daffa2e4c49785701662..511ad753876dd26a4d6bc8c2c727c7c9253ce59c 100644
--- a/paddle/fluid/operators/send_barrier_op.cc
+++ b/paddle/fluid/operators/send_barrier_op.cc
@@ -21,6 +21,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 
 #include "paddle/fluid/operators/detail/grpc_client.h"
+#include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
 namespace operators {
@@ -36,32 +37,33 @@ class SendBarrierOp : public framework::OperatorBase {
   void RunImpl(const framework::Scope& scope,
                const platform::Place& place) const override {
     std::vector<std::string> eps = Attr<std::vector<std::string>>("endpoints");
+    bool sync_mode = Attr<bool>("sync_mode");
 
-    auto client_var_name = Output("RPCClient");
-    PADDLE_ENFORCE_NOT_NULL(scope.FindVar(client_var_name),
-                            "Can not find variable '%s' in the scope.",
-                            client_var_name);
-    auto* client_var = scope.FindVar(client_var_name);
-    detail::RPCClient* rpc_client = client_var->GetMutable<detail::RPCClient>();
+    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+    auto& ctx = *pool.Get(place);
+    // For profiling
+    platform::RecordEvent record_event(Type(), &ctx);
 
-    // need to wait before sending send_barrier message
-    PADDLE_ENFORCE(rpc_client->Wait());
+    detail::RPCClient* rpc_client =
+        detail::RPCClient::GetInstance<detail::GRPCClient>();
+
+    VLOG(3) << "SendBarrierOp sync_mode:" << sync_mode;
 
-    for (auto& ep : eps) {
-      VLOG(3) << "send barrier, ep: " << ep;
-      rpc_client->AsyncSendBatchBarrier(ep);
+    // need to wait before sending send_barrier message
+    rpc_client->Wait();
+    if (sync_mode) {
+      for (auto& ep : eps) {
+        VLOG(3) << "send barrier, ep: " << ep;
+        rpc_client->AsyncSendBatchBarrier(ep);
+      }
+      rpc_client->Wait();
     }
-    PADDLE_ENFORCE(rpc_client->Wait());
   }
 };
 
 class SendBarrierOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  SendBarrierOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddOutput("RPCClient",
-              "(RPCClient) The RPC client object which is"
-              "initialized at most once.");
+  void Make() {
     AddComment(R"DOC(
 SendBarrier operator
 
@@ -73,17 +75,7 @@ the Parameter Server would knew all variables have been sent.
                                       "(string vector, default 127.0.0.1:6164)"
                                       "Server endpoints to send variables to.")
         .SetDefault({"127.0.0.1:6164"});
-  }
-};
-
-class SendBarrierOpVarTypeInference : public framework::VarTypeInference {
- public:
-  void operator()(const framework::OpDesc& op_desc,
-                  framework::BlockDesc* block) const override {
-    auto out_var_name = op_desc.Output("RPCClient").front();
-    auto& out_var = block->FindRecursiveOrCreateVar(out_var_name);
-    auto var_type = framework::proto::VarType::RAW;
-    out_var.SetType(var_type);
+    AddAttr<bool>("sync_mode", "work in sync_mode or not").SetDefault(true);
   }
 };
 
@@ -99,5 +91,4 @@ namespace ops = paddle::operators;
 
 REGISTER_OPERATOR(send_barrier, ops::SendBarrierOp,
                   paddle::framework::EmptyGradOpMaker, ops::SendBarrierOpMaker,
-                  ops::SendBarrierOpVarTypeInference,
                   ops::SendBarrierOpShapeInference);
diff --git a/paddle/fluid/operators/send_op.cc b/paddle/fluid/operators/send_op.cc
index e4386b640a298cd216bb60104653f20c4a96e7dc..9697579707f1a510ba7db8a1a9616b59918b971b 100644
--- a/paddle/fluid/operators/send_op.cc
+++ b/paddle/fluid/operators/send_op.cc
@@ -49,57 +49,49 @@ class SendOp : public framework::OperatorBase {
     // For profiling
     platform::RecordEvent record_event(Type(), &ctx);
 
-    auto client_var_name = Output("RPCClient");
-    PADDLE_ENFORCE_NOT_NULL(scope.FindVar(client_var_name),
-                            "Can not find variable '%s' in the scope.",
-                            client_var_name);
-    auto* client_var = scope.FindVar(client_var_name);
-    detail::RPCClient* rpc_client = client_var->GetMutable<detail::RPCClient>();
+    detail::RPCClient* rpc_client =
+        detail::RPCClient::GetInstance<detail::GRPCClient>();
 
     for (size_t i = 0; i < ins.size(); i++) {
       if (NeedSend(scope, ins[i])) {
         VLOG(3) << "sending " << ins[i] << " to " << epmap[i];
-        rpc_client->AsyncSendVariable(epmap[i], ctx, scope, ins[i]);
+        rpc_client->AsyncSendVar(epmap[i], ctx, scope, ins[i]);
       } else {
         VLOG(3) << "don't send no-initialied variable: " << ins[i];
       }
     }
-    PADDLE_ENFORCE(rpc_client->Wait());
+    rpc_client->Wait();
 
     if (sync_mode) {
       for (auto& ep : endpoints) {
         VLOG(3) << "batch barrier, ep: " << ep;
         rpc_client->AsyncSendBatchBarrier(ep);
       }
-      PADDLE_ENFORCE(rpc_client->Wait());
+      rpc_client->Wait();
     }
 
     if (outs.size() > 0) {
       for (size_t i = 0; i < outs.size(); i++) {
         VLOG(2) << "getting " << outs[i] << " from " << epmap[i];
-        rpc_client->AsyncGetVariable(epmap[i], ctx, scope, outs[i]);
+        rpc_client->AsyncGetVar(epmap[i], ctx, scope, outs[i]);
       }
-      PADDLE_ENFORCE(rpc_client->Wait());
+      rpc_client->Wait();
       // tell pservers that current trainer have called fetch
       for (auto& ep : endpoints) {
         VLOG(2) << "send fetch barrier, ep: " << ep;
         rpc_client->AsyncSendFetchBarrier(ep);
       }
-      PADDLE_ENFORCE(rpc_client->Wait());
+      rpc_client->Wait();
     }
   }
 };
 
 class SendOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  SendOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() {
     AddInput("X", "(Tensor) Input tensor to be sent").AsDuplicable();
     AddOutput("Out", "(Tensor) Output tensor to be received from server")
         .AsDuplicable();
-    AddOutput("RPCClient",
-              "(RPCClient) The RPC client object which is"
-              "initialized at most once.");
     AddComment(R"DOC(
 Send operator
 
@@ -120,17 +112,6 @@ This operator will send tensor to recv_op at the parameter server.
   }
 };
 
-class SendOpVarTypeInference : public framework::VarTypeInference {
- public:
-  void operator()(const framework::OpDesc& op_desc,
-                  framework::BlockDesc* block) const override {
-    auto out_var_name = op_desc.Output("RPCClient").front();
-    auto& out_var = block->FindRecursiveOrCreateVar(out_var_name);
-    auto var_type = framework::proto::VarType::RAW;
-    out_var.SetType(var_type);
-  }
-};
-
 class SendOpShapeInference : public framework::InferShapeBase {
  public:
   void operator()(framework::InferShapeContext* ctx) const override {}
@@ -142,5 +123,4 @@ class SendOpShapeInference : public framework::InferShapeBase {
 namespace ops = paddle::operators;
 
 REGISTER_OPERATOR(send, ops::SendOp, paddle::framework::EmptyGradOpMaker,
-                  ops::SendOpMaker, ops::SendOpVarTypeInference,
-                  ops::SendOpShapeInference);
+                  ops::SendOpMaker, ops::SendOpShapeInference);
diff --git a/paddle/fluid/operators/send_recv_op_test.cc b/paddle/fluid/operators/send_recv_op_test.cc
index eb51f301bfe2a97c65dd1fec23ff5a44f3843b05..e550552b195b768d68ec64e9c3b5889b56ca719f 100644
--- a/paddle/fluid/operators/send_recv_op_test.cc
+++ b/paddle/fluid/operators/send_recv_op_test.cc
@@ -92,12 +92,16 @@ void InitSelectedRowsInScope(const p::CPUPlace &place, f::Scope *scope) {
 
 void AddOp(const std::string &type, const f::VariableNameMap &inputs,
            const f::VariableNameMap &outputs, f::AttributeMap attrs,
-           f::BlockDesc *block) {
+           f::BlockDesc *block, bool is_sparse) {
   // insert output
   for (auto kv : outputs) {
     for (auto v : kv.second) {
       auto var = block->Var(v);
       var->SetDataType(f::proto::VarType::FP32);
+      var->SetPersistable(true);
+      if (is_sparse) {
+        var->SetType(f::proto::VarType::SELECTED_ROWS);
+      }
     }
   }
 
@@ -128,7 +132,8 @@ void StartServerNet(bool is_sparse, std::atomic<bool> *initialized) {
   auto *optimize_block = program.AppendBlock(root_block);
   auto *prefetch_block = program.AppendBlock(root_block);
   // X for server side tensors, RX for received tensors, must be of same shape.
-  AddOp("sum", {{"X", {"x0", "x1"}}}, {{"Out", {"Out"}}}, {}, optimize_block);
+  AddOp("sum", {{"X", {"x0", "x1"}}}, {{"Out", {"Out"}}}, {}, optimize_block,
+        is_sparse);
   f::AttributeMap attrs;
   attrs.insert({"endpoint", std::string("127.0.0.1:0")});
   attrs.insert({"Fanin", 1});
@@ -151,6 +156,7 @@ TEST(SendRecvOp, CPUDense) {
   std::thread server_thread(StartServerNet, false, &initialized);
   while (!initialized) {
   }
+
   static_cast<paddle::operators::ListenAndServOp *>(listen_and_serv_op.get())
       ->WaitServerReady();
 
@@ -170,9 +176,10 @@ TEST(SendRecvOp, CPUDense) {
   std::string endpoint = paddle::string::Sprintf("127.0.0.1:%d", selected_port);
   attrs.insert({"endpoints", std::vector<std::string>({endpoint})});
   attrs.insert({"epmap", std::vector<std::string>({endpoint})});
-  auto send_op = f::OpRegistry::CreateOp(
-      "send", {{"X", {"x1"}}},
-      {{"Out", {"Out"}}, {"RPCClient", {"RPC_CLIENT_VAR"}}}, attrs);
+  const f::VariableNameMap &inputs = {{"X", {"x1"}}};
+  const f::VariableNameMap &outputs = {{"Out", {"Out"}}};
+
+  auto send_op = f::OpRegistry::CreateOp("send", inputs, outputs, attrs);
   send_op->Run(scope, place);
 
   auto in_var = scope.Var("x1");
@@ -215,9 +222,8 @@ TEST(SendRecvOp, CPUSparse) {
   std::string endpoint = paddle::string::Sprintf("127.0.0.1:%d", selected_port);
   attrs.insert({"endpoints", std::vector<std::string>({endpoint})});
   attrs.insert({"epmap", std::vector<std::string>({endpoint})});
-  auto send_op = f::OpRegistry::CreateOp(
-      "send", {{"X", {"x1"}}},
-      {{"Out", {"Out"}}, {"RPCClient", {"RPC_CLIENT_VAR"}}}, attrs);
+  auto send_op = f::OpRegistry::CreateOp("send", {{"X", {"x1"}}},
+                                         {{"Out", {"Out"}}}, attrs);
   send_op->Run(scope, place);
 
   auto x0 = scope.Var("x0")->GetMutable<f::SelectedRows>();
diff --git a/paddle/fluid/operators/send_recv_util.h b/paddle/fluid/operators/send_recv_util.h
index 113513eb6b327773ab4a1c062fb8a3f06fddfbca..deab005149027caffa962783df944fad7110382f 100644
--- a/paddle/fluid/operators/send_recv_util.h
+++ b/paddle/fluid/operators/send_recv_util.h
@@ -20,6 +20,9 @@ namespace operators {
 
 inline bool NeedSend(const framework::Scope& scope,
                      const std::string& varname) {
+  // dummy variable is only used in parallel executor to represent
+  // some dependency relationship, we don't need to send/recv it.
+  if (varname == "dummy") return false;
   auto* var = scope.FindVar(varname);
   PADDLE_ENFORCE_NOT_NULL(var, "Can not find variable '%s' in the send side.",
                           varname);
diff --git a/paddle/fluid/operators/send_vars_op.cc b/paddle/fluid/operators/send_vars_op.cc
index 56b3713d6af28d0787e114a672a503e86cbd85fd..564e40461f8f894cffab11e26cc538b7964b6f19 100644
--- a/paddle/fluid/operators/send_vars_op.cc
+++ b/paddle/fluid/operators/send_vars_op.cc
@@ -20,6 +20,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/detail/grpc_client.h"
 #include "paddle/fluid/operators/send_recv_util.h"
+#include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
 namespace operators {
@@ -41,19 +42,18 @@ class SendVarsOp : public framework::OperatorBase {
     platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
     auto& ctx = *pool.Get(place);
 
-    auto client_var_name = Output("RPCClient");
-    PADDLE_ENFORCE_NOT_NULL(scope.FindVar(client_var_name),
-                            "Can not find variable '%s' in the scope.",
-                            client_var_name);
-    auto* client_var = scope.FindVar(client_var_name);
-    detail::RPCClient* rpc_client = client_var->GetMutable<detail::RPCClient>();
+    // For profiling
+    platform::RecordEvent record_event(Type(), &ctx);
+
+    detail::RPCClient* rpc_client =
+        detail::RPCClient::GetInstance<detail::GRPCClient>();
 
     for (size_t i = 0; i < ins.size(); i++) {
       if (NeedSend(scope, ins[i])) {
         VLOG(3) << "sending " << ins[i] << " to " << epmap[i];
         // TODO(Yancey1989): we need to use an IO threadpool which has
         // a larger number of threads than the computing threadpool.
-        rpc_client->AsyncSendVariable(epmap[i], ctx, scope, ins[i]);
+        rpc_client->AsyncSendVar(epmap[i], ctx, scope, ins[i]);
       } else {
         VLOG(3) << "don't send no-initialied variable: " << ins[i];
       }
@@ -66,13 +66,9 @@ class SendVarsOp : public framework::OperatorBase {
 
 class SendVarsOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  SendVarsOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() {
     AddInput("X", "(Tensor, SelectedRows) Input variables to be sent")
         .AsDuplicable();
-    AddOutput("RPCClient",
-              "(RPCClient) The RPC client object which will be"
-              "initialized at most once.");
     AddComment(R"DOC(
 Send operator
 
@@ -90,17 +86,6 @@ This operator will send variables to listen_and_serve op at the parameter server
   }
 };
 
-class SendVarsOpVarTypeInference : public framework::VarTypeInference {
- public:
-  void operator()(const framework::OpDesc& op_desc,
-                  framework::BlockDesc* block) const override {
-    auto out_var_name = op_desc.Output("RPCClient").front();
-    auto& out_var = block->FindRecursiveOrCreateVar(out_var_name);
-    auto var_type = framework::proto::VarType::RAW;
-    out_var.SetType(var_type);
-  }
-};
-
 class SendVarsOpShapeInference : public framework::InferShapeBase {
  public:
   void operator()(framework::InferShapeContext* ctx) const override {}
@@ -113,5 +98,4 @@ namespace ops = paddle::operators;
 
 REGISTER_OPERATOR(send_vars, ops::SendVarsOp,
                   paddle::framework::EmptyGradOpMaker, ops::SendVarsOpMaker,
-                  ops::SendVarsOpVarTypeInference,
                   ops::SendVarsOpShapeInference);
diff --git a/paddle/fluid/operators/sequence_concat_op.cc b/paddle/fluid/operators/sequence_concat_op.cc
index 3c21903e3a08dcfb55c6c07370a117d0ad633e69..077b9a5f7d935a39706ef3c2b710522bf1b713ed 100644
--- a/paddle/fluid/operators/sequence_concat_op.cc
+++ b/paddle/fluid/operators/sequence_concat_op.cc
@@ -43,8 +43,7 @@ class SequenceConcatOp : public framework::OperatorWithKernel {
 
 class SequenceConcatOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  SequenceConcatOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X",
              "(LodTensorArray) Input is a vector of LoDTensor, "
              "each of which is a variable-length sequence or nested sequence.")
diff --git a/paddle/fluid/operators/sequence_conv_op.cc b/paddle/fluid/operators/sequence_conv_op.cc
index 94f4b49b0018fdbff6e67c3c081aa5706ccb2e66..ec6cb24350ae276724aae339590d40be1e9ea400 100644
--- a/paddle/fluid/operators/sequence_conv_op.cc
+++ b/paddle/fluid/operators/sequence_conv_op.cc
@@ -102,8 +102,7 @@ class SequenceConvGradOp : public framework::OperatorWithKernel {
 
 class SequenceConvOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  SequenceConvOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput(
         "X",
         "(LoDTensor) the input(X) is a LodTensor, which supports "
diff --git a/paddle/fluid/operators/sequence_erase_op.cc b/paddle/fluid/operators/sequence_erase_op.cc
index 73c0e89512972cda002bd902ee0c78b4b77d8502..1c86486157a02c3b78ed61e840fd8e452b9cb452 100644
--- a/paddle/fluid/operators/sequence_erase_op.cc
+++ b/paddle/fluid/operators/sequence_erase_op.cc
@@ -37,8 +37,7 @@ class SequenceEraseOp : public framework::OperatorWithKernel {
 
 class SequenceEraseOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  SequenceEraseOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X",
              "(2-D LoDTensor with the 2nd dim. equal to 1) "
              "Input LoDTensor of SequenceEraseOp.");
diff --git a/paddle/fluid/operators/sequence_expand_op.cc b/paddle/fluid/operators/sequence_expand_op.cc
index 84a35d7172a567a3f6505559fa45a32290288533..944c7f85e5f43679e1875fcce813382be2ba5526 100644
--- a/paddle/fluid/operators/sequence_expand_op.cc
+++ b/paddle/fluid/operators/sequence_expand_op.cc
@@ -94,8 +94,7 @@ class SequenceExpandOp : public framework::OperatorWithKernel {
 
 class SequenceExpandOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  SequenceExpandOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X",
              "(LoDTensor, default LoDTensor<float>) A 2-D LoDTensor whose lod "
              "level is at most 1.");
diff --git a/paddle/fluid/operators/sequence_pool_op.cc b/paddle/fluid/operators/sequence_pool_op.cc
index 933c8c26239d49221819a583f999389ed6fb6cb6..5c6fd13d42e43e3502a1cab85a56e019420c708d 100644
--- a/paddle/fluid/operators/sequence_pool_op.cc
+++ b/paddle/fluid/operators/sequence_pool_op.cc
@@ -38,8 +38,7 @@ class SequencePoolOp : public framework::OperatorWithKernel {
 
 class SequencePoolOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  SequencePoolOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X", "(LoDTensor) The variable-length input of SequencePoolOp");
     AddOutput("Out",
               "(Tensor) The output of SequencePoolOp does not contain LoD "
diff --git a/paddle/fluid/operators/sequence_reshape_op.cc b/paddle/fluid/operators/sequence_reshape_op.cc
index a2999650b8903f9d819a8e8011421349e098b219..ef5e6f3210234d59298fcf04c812390643c693d0 100644
--- a/paddle/fluid/operators/sequence_reshape_op.cc
+++ b/paddle/fluid/operators/sequence_reshape_op.cc
@@ -42,8 +42,7 @@ class SequenceReshapeOp : public framework::OperatorWithKernel {
 
 class SequenceReshapeOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  SequenceReshapeOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X",
              "(LoDTensor, default LoDTensor<float>) A 2-D LoDTensor with shape "
              "being [N, M].");
diff --git a/paddle/fluid/operators/sequence_slice_op.cc b/paddle/fluid/operators/sequence_slice_op.cc
index 7cd620af07fa9b5f8fcee3c0f88207ef2800c4a1..df9243dc04c584d70dfa6ca78d5fac8423796466 100644
--- a/paddle/fluid/operators/sequence_slice_op.cc
+++ b/paddle/fluid/operators/sequence_slice_op.cc
@@ -79,8 +79,7 @@ class SequenceSliceGradOp : public framework::OperatorWithKernel {
 
 class SequenceSliceOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  SequenceSliceOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X",
              "(LoDTensor), "
              "the input of SequenceSliceOp.");
diff --git a/paddle/fluid/operators/sequence_softmax_op.cc b/paddle/fluid/operators/sequence_softmax_op.cc
index a0d47c12ba606eb62bbbea4d5ea793ce915e8100..c44f8206eb5079fef969e3e527552512eebd0f1a 100644
--- a/paddle/fluid/operators/sequence_softmax_op.cc
+++ b/paddle/fluid/operators/sequence_softmax_op.cc
@@ -57,8 +57,7 @@ class SequenceSoftmaxOp : public framework::OperatorWithKernel {
 
 class SequenceSoftmaxOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  SequenceSoftmaxOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X",
              "(LoDTensor) 1-D or 2-D input LoDTensor with the 2-nd dimension "
              "of length 1.");
diff --git a/paddle/fluid/operators/sgd_op.cc b/paddle/fluid/operators/sgd_op.cc
index bd04c60ffa5c1e5eb8d2051ce495ab6c685b14b5..7a2bdeac09d61603f437ff10d58d0542bb3c3689 100644
--- a/paddle/fluid/operators/sgd_op.cc
+++ b/paddle/fluid/operators/sgd_op.cc
@@ -68,8 +68,7 @@ class SGDOpInferVarType : public framework::VarTypeInference {
 
 class SGDOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  SGDOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("Param", "(Tensor or SelectedRows) Input parameter");
     AddInput("LearningRate", "(Tensor) Learning rate of SGD");
     AddInput("Grad", "(Tensor or SelectedRows) Input gradient");
diff --git a/paddle/fluid/operators/sgd_op.h b/paddle/fluid/operators/sgd_op.h
index f3e88b0a0b05ef792b2cc8e880bdfddb6e6124d1..2685ce217ee0f0d3e89f3751e96218dcd19bead4 100644
--- a/paddle/fluid/operators/sgd_op.h
+++ b/paddle/fluid/operators/sgd_op.h
@@ -96,8 +96,12 @@ class SGDOpKernel : public framework::OpKernel<T> {
         return;
       }
 
-      size_t param_row_width = param.value().numel() / param.rows().size();
-      size_t grad_row_width = grad.value().numel() / grad.rows().size();
+      auto param_row_width = param.value().dims()[1];
+      auto grad_row_width = grad.value().dims()[1];
+      VLOG(4) << " param rows: " << param.rows().size()
+              << " param memory rows: " << param.value().dims()[0]
+              << " grad rows: " << grad.rows().size()
+              << " grad memory rows: " << grad.value().dims()[0];
       PADDLE_ENFORCE_EQ(param_row_width, grad_row_width,
                         "param_row should have the same size with grad_row");
 
@@ -110,7 +114,7 @@ class SGDOpKernel : public framework::OpKernel<T> {
         int64_t id_index = param.Index(grad.rows()[i]);
         PADDLE_ENFORCE_GE(id_index, static_cast<int64_t>(0),
                           "id should be in the table");
-        for (size_t j = 0; j < grad_row_width; j++) {
+        for (int64_t j = 0; j < grad_row_width; j++) {
           out_data[id_index * grad_row_width + j] -=
               lr[0] * grad_data[i * grad_row_width + j];
         }
diff --git a/paddle/fluid/operators/shape_op.cc b/paddle/fluid/operators/shape_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c75fce7959d1af51afd52af23fe657d10a2f3988
--- /dev/null
+++ b/paddle/fluid/operators/shape_op.cc
@@ -0,0 +1,54 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/shape_op.h"
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+class ShapeOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Input"),
+                   "Input (Input) of get_shape op should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output (Out) of get_shape op should not be null.");
+    auto in_dim = ctx->GetInputDim("Input");
+    ctx->SetOutputDim("Out", {in_dim.size()});
+  }
+};
+
+class ShapeOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("Input", "(Tensor), The input tensor.");
+    AddOutput("Out", "(Tensor), The shape of input tensor.");
+    AddComment(R"DOC(
+Shape Operator. 
+Get the shape of input tensor.
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(shape, ops::ShapeOp, ops::ShapeOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
+REGISTER_OP_CPU_KERNEL(shape, ops::ShapeKernel<int>, ops::ShapeKernel<int64_t>,
+                       ops::ShapeKernel<float>, ops::ShapeKernel<double>);
diff --git a/paddle/fluid/operators/shape_op.cu b/paddle/fluid/operators/shape_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..7736a2a1e13cfa5d445411b3efac7669a7bf23a2
--- /dev/null
+++ b/paddle/fluid/operators/shape_op.cu
@@ -0,0 +1,20 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/shape_op.h"
+
+REGISTER_OP_CUDA_KERNEL(shape, paddle::operators::ShapeKernel<int>,
+                        paddle::operators::ShapeKernel<int64_t>,
+                        paddle::operators::ShapeKernel<float>,
+                        paddle::operators::ShapeKernel<double>);
diff --git a/paddle/fluid/operators/shape_op.h b/paddle/fluid/operators/shape_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..3be86b66a538e7b38a5d59095fee7e7636364bce
--- /dev/null
+++ b/paddle/fluid/operators/shape_op.h
@@ -0,0 +1,38 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <algorithm>
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+template <typename T>
+class ShapeKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* in_t = ctx.Input<Tensor>("Input");
+    auto* out_t = ctx.Output<Tensor>("Out");
+    auto out_data = out_t->mutable_data<int64_t>(platform::CPUPlace());
+    auto in_dims = in_t->dims();
+    for (int i = 0; i < in_dims.size(); ++i) {
+      out_data[i] = in_dims[i];
+    }
+  }
+};
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/shrink_rnn_memory_op.cc b/paddle/fluid/operators/shrink_rnn_memory_op.cc
index a1871a8e7fb27d351f9d333966baa63c6f32ae01..8146c5f56104b7dec86b1c4491ed10fc2e94b58b 100644
--- a/paddle/fluid/operators/shrink_rnn_memory_op.cc
+++ b/paddle/fluid/operators/shrink_rnn_memory_op.cc
@@ -69,8 +69,7 @@ class ShrinkRNNMemoryOp : public ArrayOp {
 
 class ShrinkRNNMemoryOpProtoMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  ShrinkRNNMemoryOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X", "(LoDTensor) The RNN step memory to be shrinked.");
     AddInput("RankTable", "(LoDRankTable) The lod_rank_table of dynamic RNN.");
     AddInput("I",
diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc
index 5db77d0493fc0abaa0a696cb559c3ca0534d4101..135e2a6f7f877c9ef159a4542b834d5627649e81 100644
--- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc
+++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc
@@ -86,9 +86,7 @@ class SigmoidCrossEntropyWithLogitsGradOp
 class SigmoidCrossEntropyWithLogitsOpMaker
     : public framework::OpProtoAndCheckerMaker {
  public:
-  SigmoidCrossEntropyWithLogitsOpMaker(OpProto* proto,
-                                       OpAttrChecker* op_checker)
-      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X",
              "(Tensor, default Tensor<float>), a 2-D tensor with shape N x D, "
              "where N is the batch size and D is the number of classes. "
diff --git a/paddle/fluid/operators/sign_op.cc b/paddle/fluid/operators/sign_op.cc
index 8f8b7abd03212c12ca351e551621e63b4c7148c2..f3985dcc027f974e0213a73ea9a21e268d77615f 100644
--- a/paddle/fluid/operators/sign_op.cc
+++ b/paddle/fluid/operators/sign_op.cc
@@ -34,8 +34,7 @@ class SignOp : public framework::OperatorWithKernel {
 template <typename AttrType>
 class SignOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  SignOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X", "(Tensor) Input tensor of sign operator.");
     AddOutput("Out", "(Tensor) Output tensor of sign operator.");
     AddComment(R"DOC(
diff --git a/paddle/fluid/operators/smooth_l1_loss_op.cc b/paddle/fluid/operators/smooth_l1_loss_op.cc
index 322581fdef27b12a06704abc9c3b8772adf002f2..622420c1c33a62994c81ad9534c4fa37a4a1fa1a 100644
--- a/paddle/fluid/operators/smooth_l1_loss_op.cc
+++ b/paddle/fluid/operators/smooth_l1_loss_op.cc
@@ -46,8 +46,7 @@ class SmoothL1LossOp : public framework::OperatorWithKernel {
 
 class SmoothL1LossOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  SmoothL1LossOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X",
              "(Tensor, default Tensor<float>) A tensor with rank at least 2. "
              "The input value of smooth l1 loss op with shape "
@@ -106,7 +105,7 @@ class SmoothL1LossGradOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
   void InferShape(framework::InferShapeContext* ctx) const override {
-    auto in_dims = ctx->GetInputDim("X");
+    auto in_dims = ctx->GetInputDim("Diff");
     auto out_dims = ctx->GetInputDim(framework::GradVarName("Out"));
 
     PADDLE_ENFORCE_GE(out_dims.size(), 2,
@@ -128,12 +127,33 @@ class SmoothL1LossGradOp : public framework::OperatorWithKernel {
   }
 };
 
+class SmoothL1LossGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto* op = new framework::OpDesc();
+    op->SetType("smooth_l1_loss_grad");
+    op->SetInput("InsideWeight", Input("InsideWeight"));
+    op->SetInput("OutsideWeight", Input("OutsideWeight"));
+    op->SetInput("Diff", Output("Diff"));
+    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+
+    op->SetAttrMap(Attrs());
+
+    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    op->SetOutput(framework::GradVarName("Y"), InputGrad("Y"));
+    return std::unique_ptr<framework::OpDesc>(op);
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
 
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(smooth_l1_loss, ops::SmoothL1LossOp, ops::SmoothL1LossOpMaker,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
+                  ops::SmoothL1LossGradMaker);
 REGISTER_OPERATOR(smooth_l1_loss_grad, ops::SmoothL1LossGradOp);
 REGISTER_OP_CPU_KERNEL(
     smooth_l1_loss,
diff --git a/paddle/fluid/operators/softmax_mkldnn_op.cc b/paddle/fluid/operators/softmax_mkldnn_op.cc
index 71b541d98f6e0d3e12601c9988ca6ffb8bb7554d..14b57b11fefb2b726531cb164dbf479f8df26b24 100644
--- a/paddle/fluid/operators/softmax_mkldnn_op.cc
+++ b/paddle/fluid/operators/softmax_mkldnn_op.cc
@@ -53,25 +53,60 @@ class SoftmaxMKLDNNKernel : public paddle::framework::OpKernel<T> {
                    "Softmax input and output dimensions should match");
     // Same memory descriptor to be used for input and output
     memory::dims softmax_tz = {src_tz[0], src_tz[1]};
-    // Currently only supports NC data format
-    // TODO(jczaja-intel): support more formats
-    auto softmax_md =
-        MKLDNNMemDesc({softmax_tz}, memory::f32, memory::format::nc);
-    // Normalization is made after innermost dimension eg. C out of NC
-    auto softmax_desc = softmax_forward::desc(prop_kind::forward_scoring,
-                                              softmax_md, 1 /*dim: C*/);
-    // create memory primitives
-    auto softmax_src_memory =
-        memory({softmax_md, mkldnn_engine},
-               static_cast<void*>(const_cast<T*>(input_data)));
-    auto softmax_dst_memory =
-        memory({softmax_md, mkldnn_engine},
-               static_cast<void*>(const_cast<T*>(output_data)));
-    auto softmax_prim_desc =
-        softmax_forward::primitive_desc(softmax_desc, mkldnn_engine);
-    auto softmax = softmax_forward(softmax_prim_desc, softmax_src_memory,
-                                   softmax_dst_memory);
-    std::vector<primitive> pipeline{softmax};
+    // Generate keys for storing/retriving primitives for this operator
+    // TODO(jczaja): Each MKLDNN operator may have diffrent hashing function
+    auto gethash = [](memory::dims& operand_dims) {
+      return std::string(std::to_string(operand_dims[0]) + "-" +
+                         std::to_string(operand_dims[1]));
+    };
+    const std::string key = gethash(softmax_tz);
+    const std::string key_softmax_p = key + "@softmax_p";
+    const std::string key_softmax_src_mem_p = key + "@softmax_src_mem_p";
+    const std::string key_softmax_dst_mem_p = key + "@softmax_dst_mem_p";
+
+    std::shared_ptr<void> softmax_p = dev_ctx.GetBlob(key_softmax_p);
+    if (softmax_p == nullptr) {
+      // Currently only NC data format is supported
+      auto softmax_md =
+          MKLDNNMemDesc({softmax_tz}, memory::f32, memory::format::nc);
+      // Normalization is made after innermost dimension eg. C out of NC
+      auto softmax_desc = softmax_forward::desc(prop_kind::forward_scoring,
+                                                softmax_md, 1 /*dim: C*/);
+      // create memory primitives
+      auto softmax_src_memory_p = std::make_shared<memory>(
+          memory::primitive_desc{softmax_md, mkldnn_engine},
+          static_cast<void*>(const_cast<T*>(input_data)));
+      dev_ctx.SetBlob(key_softmax_src_mem_p, softmax_src_memory_p);
+      auto softmax_dst_memory_p = std::make_shared<memory>(
+          memory::primitive_desc{softmax_md, mkldnn_engine},
+          static_cast<void*>(output_data));
+      dev_ctx.SetBlob(key_softmax_dst_mem_p, softmax_dst_memory_p);
+
+      auto softmax_forward_pd =
+          std::make_shared<softmax_forward::primitive_desc>(softmax_desc,
+                                                            mkldnn_engine);
+      softmax_p = std::make_shared<softmax_forward>(
+          *(softmax_forward_pd.get()),
+          *(static_cast<memory*>(softmax_src_memory_p.get())),
+          *(static_cast<memory*>(softmax_dst_memory_p.get())));
+      dev_ctx.SetBlob(key_softmax_p, softmax_p);
+    } else {
+      // Primitives already exist
+      auto src_memory_p = std::static_pointer_cast<memory>(
+          dev_ctx.GetBlob(key_softmax_src_mem_p));
+      PADDLE_ENFORCE(src_memory_p != nullptr,
+                     "Fail to find softmax src mem_p in device context");
+      auto dst_memory_p = std::static_pointer_cast<memory>(
+          dev_ctx.GetBlob(key_softmax_dst_mem_p));
+      PADDLE_ENFORCE(dst_memory_p != nullptr,
+                     "Fail to find softmax dst mem_p in device context");
+      src_memory_p->set_data_handle(
+          reinterpret_cast<void*>(const_cast<T*>(input_data)));
+      dst_memory_p->set_data_handle(output_data);
+    }
+
+    std::vector<primitive> pipeline{
+        *(static_cast<softmax_forward::primitive*>(softmax_p.get()))};
     stream(stream::kind::eager).submit(pipeline).wait();
 
     const bool is_test = ctx.Attr<bool>("is_test");
diff --git a/paddle/fluid/operators/softmax_op.cc b/paddle/fluid/operators/softmax_op.cc
index aa7b192e327704c02a26c86cc208ebe8a5cd7ba5..c90a3be964a3a309a182d3620abec619c366dd84 100644
--- a/paddle/fluid/operators/softmax_op.cc
+++ b/paddle/fluid/operators/softmax_op.cc
@@ -49,6 +49,9 @@ class SoftmaxOp : public framework::OperatorWithKernel {
       const framework::ExecutionContext& ctx) const override {
     // choose cudnn kernel if the runtime supported.
     framework::LibraryType library_{framework::LibraryType::kPlain};
+    std::string data_format = ctx.Attr<std::string>("data_format");
+    framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
+
 #ifdef PADDLE_WITH_CUDA
     if (platform::CanCUDNNBeUsed(ctx)) {
       library_ = framework::LibraryType::kCUDNN;
@@ -58,6 +61,7 @@ class SoftmaxOp : public framework::OperatorWithKernel {
     if (library_ == framework::LibraryType::kPlain &&
         platform::CanMKLDNNBeUsed(ctx)) {
       library_ = framework::LibraryType::kMKLDNN;
+      layout_ = framework::DataLayout::kMKLDNN;
     }
 #endif
 
@@ -68,17 +72,14 @@ class SoftmaxOp : public framework::OperatorWithKernel {
                      "float16 can only be used on GPU place");
     }
 
-    std::string data_format = ctx.Attr<std::string>("data_format");
-    return framework::OpKernelType(input_data_type, ctx.GetPlace(),
-                                   framework::StringToDataLayout(data_format),
+    return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout_,
                                    library_);
   }
 };
 
 class SoftmaxOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  SoftmaxOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X",
              "The input tensor of softmax. "
              "2-D with shape [batch_size, input_feature_dimensions].");
@@ -143,6 +144,7 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel {
       const framework::ExecutionContext& ctx) const override {
     // choose cudnn kernel if the runtime supported.
     framework::LibraryType library_{framework::LibraryType::kPlain};
+
 #ifdef PADDLE_WITH_CUDA
     if (platform::CanCUDNNBeUsed(ctx)) {
       library_ = framework::LibraryType::kCUDNN;
diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cc b/paddle/fluid/operators/softmax_with_cross_entropy_op.cc
index 857e5733573497b56520daa7860f4feb4e01cda7..53cb716a979229c99fcbdc12f1aeab4e21b320f3 100644
--- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cc
+++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cc
@@ -20,8 +20,7 @@ namespace operators {
 class SoftmaxWithCrossEntropyOpMaker
     : public framework::OpProtoAndCheckerMaker {
  public:
-  SoftmaxWithCrossEntropyOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("Logits",
              "(Tensor, default: Tensor<float>), The unscaled log probabilities "
              "which is a 2-D tensor with shape [N x K]. N is the batch_size, "
diff --git a/paddle/fluid/operators/split_byref_op.cc b/paddle/fluid/operators/split_byref_op.cc
index 7413ce3e9ce60ed733bb4d27e9ec205e5f0a7e1b..bc998e1abbd7131a7497288cc9d66315a6fedc85 100644
--- a/paddle/fluid/operators/split_byref_op.cc
+++ b/paddle/fluid/operators/split_byref_op.cc
@@ -64,8 +64,7 @@ class SplitByrefOp : public framework::OperatorWithKernel {
 
 class SplitByrefOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  SplitByrefOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X", "(Tensor) Input tensor of the split operator.");
     AddOutput("Out", "(Tensor) Output tensors of the split operator.")
         .AsDuplicable();
diff --git a/paddle/fluid/operators/split_ids_op.cc b/paddle/fluid/operators/split_ids_op.cc
index a53cbc8ac5199061dafdc7f4cf560b9e4fc577ab..c867c46873ae7ddbdbda280351e4ab28235bcc08 100644
--- a/paddle/fluid/operators/split_ids_op.cc
+++ b/paddle/fluid/operators/split_ids_op.cc
@@ -19,8 +19,7 @@ namespace operators {
 
 class SplitIdsOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  SplitIdsOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("Ids", "(LoDTensor) the input ids with shape{batch_num, 1}");
     AddOutput("Out", "(LoDTensor) The outputs of the input Ids.")
         .AsDuplicable();
diff --git a/paddle/fluid/operators/split_lod_tensor_op.cc b/paddle/fluid/operators/split_lod_tensor_op.cc
index 3222cce239988b170501f2b99e9f1253036b7fbc..767449cde981e5925b7144ff1038560c67651f3e 100644
--- a/paddle/fluid/operators/split_lod_tensor_op.cc
+++ b/paddle/fluid/operators/split_lod_tensor_op.cc
@@ -125,8 +125,7 @@ class SplitLoDTensorOp : public framework::OperatorBase {
 
 class SplitLoDTensorOpProtoMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  SplitLoDTensorOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X", "The input LoDTensor");
     AddInput("Mask", "A bool column vector which mask the input");
     AddOutput("OutTrue", "True branch of input LoDTensor");
diff --git a/paddle/fluid/operators/split_op.cc b/paddle/fluid/operators/split_op.cc
index a4398df36bcc2d3b8bbe8949f27f5d6508861d95..5e2b2a994534c2fb1e053c067b36651d358b9da8 100644
--- a/paddle/fluid/operators/split_op.cc
+++ b/paddle/fluid/operators/split_op.cc
@@ -70,8 +70,7 @@ class SplitOp : public framework::OperatorWithKernel {
 
 class SplitOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  SplitOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X", "(Tensor) Input tensor of the split operator.");
     AddOutput("Out", "(Tensor) Output tensors of the split operator.")
         .AsDuplicable();
diff --git a/paddle/fluid/operators/split_selected_rows_op.cc b/paddle/fluid/operators/split_selected_rows_op.cc
index e1ce3d0c1bf11e9a623e4e9adc8f08f5069f4d94..76615a9405d7a8e3fa9dba8d01a956209e02ae8f 100644
--- a/paddle/fluid/operators/split_selected_rows_op.cc
+++ b/paddle/fluid/operators/split_selected_rows_op.cc
@@ -19,8 +19,7 @@ namespace operators {
 
 class SplitSelectedRowsOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  SplitSelectedRowsOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X", "The input SelectedRows.");
     AddOutput("Out", "The outputs of the input SelectedRows.").AsDuplicable();
     AddAttr<std::vector<int>>("height_sections",
diff --git a/paddle/fluid/operators/spp_op.cc b/paddle/fluid/operators/spp_op.cc
index 1cada95501a76da27081d533b451ce7f6a384a49..a2a96b72f09df86790ad1f90ead9189ff9bd581c 100644
--- a/paddle/fluid/operators/spp_op.cc
+++ b/paddle/fluid/operators/spp_op.cc
@@ -20,8 +20,7 @@ namespace operators {
 
 class SppOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  SppOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput(
         "X",
         "(Tensor) The input tensor of spp operator. "
diff --git a/paddle/fluid/operators/squared_l2_distance_op.cc b/paddle/fluid/operators/squared_l2_distance_op.cc
index c32f575b541d6a6441cc1b6e999496eacef421a5..42532a294b2ef9ffdb240fac8596278047daf7fe 100644
--- a/paddle/fluid/operators/squared_l2_distance_op.cc
+++ b/paddle/fluid/operators/squared_l2_distance_op.cc
@@ -56,8 +56,7 @@ class SquaredL2DistanceOp : public framework::OperatorWithKernel {
 
 class SquaredL2DistanceOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  SquaredL2DistanceOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X", "(Tensor) Input of SquaredL2DistanceOp.");
     AddInput("Y", "(Tensor) Target of SquaredL2DistanceOp.");
     AddOutput("sub_result",
diff --git a/paddle/fluid/operators/squared_l2_norm_op.cc b/paddle/fluid/operators/squared_l2_norm_op.cc
index 4ce51259da3530367d91b5da34f06fbe5d969fce..7bd82e0ce4add6d4434e1defaee43da178a6f309 100644
--- a/paddle/fluid/operators/squared_l2_norm_op.cc
+++ b/paddle/fluid/operators/squared_l2_norm_op.cc
@@ -48,8 +48,7 @@ class SquaredL2NormGradOp : public framework::OperatorWithKernel {
 
 class SquaredL2NormOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  SquaredL2NormOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X", "(Tensor) The input of squared_l2_norm op.");
     AddOutput("Out", "(Scalar) The output of squared_l2_norm op.");
     AddComment(R"DOC(
diff --git a/paddle/fluid/operators/sum_op.cc b/paddle/fluid/operators/sum_op.cc
index 108f26fafe7af76eaa613d77ed77748ee43ea234..bcc5e22d4a77349e7cde9a43b83f23d4c867d994 100644
--- a/paddle/fluid/operators/sum_op.cc
+++ b/paddle/fluid/operators/sum_op.cc
@@ -112,8 +112,7 @@ class SumOp : public framework::OperatorWithKernel {
 
 class SumOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  SumOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X", "(vector<Tensor>) The input tensors of sum operator.")
         .AsDuplicable();
     AddOutput("Out", "(Tensor) The output tensor of sum operator.");
diff --git a/paddle/fluid/operators/tensor_array_read_write_op.cc b/paddle/fluid/operators/tensor_array_read_write_op.cc
index 2636812c42985536e7ca3475c03bbd8d1638ece6..c703d11eeccf8418250f00c801f47418ee9c85ae 100644
--- a/paddle/fluid/operators/tensor_array_read_write_op.cc
+++ b/paddle/fluid/operators/tensor_array_read_write_op.cc
@@ -57,8 +57,7 @@ class WriteToArrayOp : public ArrayOp {
 
 class WriteToArrayOpProtoMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  WriteToArrayOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X", "(LoDTensor) the tensor will be written to tensor array");
     AddInput(
         "I",
@@ -148,8 +147,7 @@ class ReadFromArrayOp : public ArrayOp {
 
 class ReadFromArrayProtoMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  ReadFromArrayProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X", "(TensorArray) the array will be read from.");
     AddInput("I",
              "(Tensor) the subscript index in tensor array. The number of "
diff --git a/paddle/fluid/operators/tensorrt_engine_op.cc b/paddle/fluid/operators/tensorrt_engine_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4b1208c4376b48e25866fc510f3a6d2ea06e7610
--- /dev/null
+++ b/paddle/fluid/operators/tensorrt_engine_op.cc
@@ -0,0 +1,143 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#ifdef PADDLE_WITH_CUDA
+
+#include "paddle/fluid/operators/tensorrt_engine_op.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+#include "paddle/fluid/inference/tensorrt/engine.h"
+#include "paddle/fluid/inference/utils/singleton.h"
+
+namespace paddle {
+namespace operators {
+
+using inference::Singleton;
+using inference::tensorrt::TRT_EngineManager;
+
+using FluidDT = framework::proto::VarType_Type;
+using TRT_DT = nvinfer1::DataType;
+
+namespace {
+
+TRT_DT FluidDataType2TRT(FluidDT type) {
+  switch (type) {
+    case FluidDT::VarType_Type_FP32:
+      return TRT_DT::kFLOAT;
+    case FluidDT::VarType_Type_INT32:
+      return TRT_DT::kINT32;
+    default:
+      return TRT_DT::kINT32;
+  }
+  PADDLE_THROW("unkown type");
+  return TRT_DT::kINT32;
+}
+
+nvinfer1::Dims Vec2TRT_Dims(const std::vector<int64_t> &shape) {
+  PADDLE_ENFORCE_GT(shape.size(), 1UL,
+                    "TensorRT' tensor input requires at least 2 dimensions");
+  PADDLE_ENFORCE_LE(shape.size(), 4UL,
+                    "TensorRT' tensor input requires at most 4 dimensions");
+
+  switch (shape.size()) {
+    case 2:
+      return nvinfer1::Dims2(shape[0], shape[1]);
+    case 3:
+      return nvinfer1::Dims3(shape[0], shape[1], shape[2]);
+    case 4:
+      return nvinfer1::Dims4(shape[0], shape[1], shape[2], shape[3]);
+    default:
+      return nvinfer1::Dims();
+  }
+  return nvinfer1::Dims();
+}
+
+}  // namespace
+
+template <typename DeviceContext, typename T>
+void paddle::operators::TensorRTEngineKernel<DeviceContext, T>::Prepare(
+    const framework::ExecutionContext &context) const {
+  VLOG(4) << "Prepare engine";
+  // Get the ProgramDesc and pass to convert.
+  framework::proto::BlockDesc block_desc;
+  block_desc.ParseFromString(context.Attr<std::string>("subgraph"));
+  max_batch_ = context.Attr<int>("max_batch");
+  auto max_workspace = context.Attr<int>("max_workspace");
+  engine_ = Singleton<TRT_EngineManager>::Global().Create(
+      max_batch_, max_workspace, &stream_);
+  engine_->InitNetwork();
+
+  framework::BlockDesc block(nullptr /*programdesc*/, &block_desc);
+  // Add inputs
+  VLOG(4) << "declare inputs";
+  for (auto &input : context.Inputs("Xs")) {
+    VLOG(4) << "declare input " << input;
+    auto *var = block.FindVar(input);
+    PADDLE_ENFORCE_EQ(var->GetType(), FluidDT::VarType_Type_LOD_TENSOR,
+                      "TensorRT engine only takes LoDTensor as input");
+    auto shape = var->GetShape();
+    engine_->DeclareInput(
+        input, FluidDataType2TRT(
+                   var->Proto()->type().lod_tensor().tensor().data_type()),
+        Vec2TRT_Dims(var->GetShape()));
+  }
+
+  // TODO(Superjomn) parameters should be passed after analysised from outside.
+  inference::Singleton<inference::tensorrt::OpConverter>::Global().ConvertBlock(
+      block_desc, {}, context.scope(), engine_);
+
+  // Add outputs
+  VLOG(4) << "declare outputs";
+  for (auto &output : context.Outputs("Ys")) {
+    VLOG(4) << "declare output " << output;
+    engine_->DeclareOutput(output);
+  }
+
+  engine_->FreezeNetwork();
+}
+
+class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("Xs", "A list of inputs.").AsDuplicable();
+    AddOutput("Ys", "A list of outputs").AsDuplicable();
+    AddAttr<std::string>("subgraph", "the subgraph.");
+    AddAttr<int>("max_batch", "the maximum batch size.");
+    AddAttr<int>("max_workspace", "the maximum batch size.");
+    AddComment("TensorRT engine operator.");
+  }
+};
+
+class TensorRTEngineInferVarType : public framework::VarTypeInference {
+ public:
+  void operator()(const framework::OpDesc &op_desc,
+                  framework::BlockDesc *block) const override {}
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(tensorrt_engine, ops::TensorRTEngineOp,
+                  ops::TensorRTEngineOpMaker, ops::TensorRTEngineOpMaker);
+
+REGISTER_OP_CPU_KERNEL(
+    tensorrt_engine,
+    ops::TensorRTEngineKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::TensorRTEngineKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::TensorRTEngineKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::TensorRTEngineKernel<paddle::platform::CPUDeviceContext, int64_t>);
+
+#endif  // PADDLE_WITH_CUDA
diff --git a/paddle/fluid/operators/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt_engine_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..4b089601ff76eedd87bb3a52a38c4d22d4a94bf6
--- /dev/null
+++ b/paddle/fluid/operators/tensorrt_engine_op.h
@@ -0,0 +1,118 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+
+#ifdef PADDLE_WITH_CUDA
+
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/inference/analysis/helper.h"
+#include "paddle/fluid/inference/tensorrt/engine.h"
+
+namespace paddle {
+namespace operators {
+
+class TensorRTEngineOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext* ctx) const override {}
+
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    auto input0 = ctx.Inputs("Xs").front();
+    framework::OpKernelType kt = framework::OpKernelType(
+        framework::ToDataType(ctx.scope()
+                                  .FindVar(input0)
+                                  ->GetMutable<framework::LoDTensor>()
+                                  ->type()),
+        platform::CPUPlace());
+    return kt;
+  }
+};
+
+template <typename DeviceContext, typename T>
+class TensorRTEngineKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    if (!engine_) {
+      Prepare(context);
+    }
+    auto input_names = context.op().Inputs("Xs");
+    PADDLE_ENFORCE(!input_names.empty(), "should pass more than one inputs");
+    // Try to determine a batch_size
+    auto& tensor0 = inference::analysis::GetFromScope<framework::LoDTensor>(
+        context.scope(), input_names.front());
+    int batch_size = tensor0.dims()[0];
+    PADDLE_ENFORCE_LE(batch_size, max_batch_);
+
+    // Convert input tensor from fluid to engine.
+    for (const auto& x : context.Inputs("Xs")) {
+      // convert input and copy to TRT engine's buffer
+      auto& t = inference::analysis::GetFromScope<framework::LoDTensor>(
+          context.scope(), x);
+      if (platform::is_cpu_place(t.place())) {
+        engine_->SetInputFromCPU(x, static_cast<const void*>(t.data<void>()),
+                                 t.memory_size());
+      } else {
+        engine_->SetInputFromGPU(x, static_cast<const void*>(t.data<void>()),
+                                 t.memory_size());
+      }
+    }
+    // Execute the engine.
+    PADDLE_ENFORCE_GT(batch_size, 0);
+    engine_->Execute(batch_size);
+    // Convert output tensor from engine to fluid
+    for (const auto& y : context.Outputs("Ys")) {
+      // convert output and copy to fluid.
+      nvinfer1::ITensor* trt_t = engine_->GetITensor(y);
+      auto dims = trt_t->getDimensions();
+      // Use the output ITensor's dims to reshape the Fluid Tensor.
+      std::vector<int> ddim(dims.d, dims.d + dims.nbDims);
+
+      auto* fluid_v = context.scope().FindVar(y);
+      PADDLE_ENFORCE_NOT_NULL(fluid_v, "no output variable called %s", y);
+      auto* fluid_t = fluid_v->GetMutable<framework::LoDTensor>();
+      fluid_t->Resize(framework::make_ddim(ddim));
+      auto size = inference::analysis::AccuDims(dims.d, dims.nbDims);
+      if (platform::is_cpu_place(fluid_t->place())) {
+        // TODO(Superjomn) change this float to dtype size.
+        engine_->GetOutputInCPU(
+            y, fluid_t->mutable_data<float>(platform::CPUPlace()),
+            size * sizeof(float));
+      } else {
+        engine_->GetOutputInGPU(
+            y, fluid_t->mutable_data<float>(platform::CUDAPlace()),
+            size * sizeof(float));
+      }
+    }
+
+    cudaStreamSynchronize(stream_);
+  }
+
+ protected:
+  // Build the engine.
+  void Prepare(const framework::ExecutionContext& context) const;
+
+ private:
+  mutable cudaStream_t stream_;
+  mutable inference::tensorrt::TensorRTEngine* engine_{nullptr};
+  mutable int max_batch_{0};
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+#endif  // PADDLE_WITH_CUDA
diff --git a/paddle/fluid/operators/tensorrt_engine_op_test.cc b/paddle/fluid/operators/tensorrt_engine_op_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6f383de259b270038c32296b59007f6c7d895f12
--- /dev/null
+++ b/paddle/fluid/operators/tensorrt_engine_op_test.cc
@@ -0,0 +1,152 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "paddle/fluid/framework/block_desc.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_desc.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
+
+USE_CPU_ONLY_OP(tensorrt_engine);
+
+namespace paddle {
+namespace operators {
+
+namespace {
+void CreateCPUTensor(framework::Scope* scope, const std::string& name,
+                     const std::vector<int64_t>& shape) {
+  auto* var = scope->Var(name);
+  auto* tensor = var->GetMutable<framework::LoDTensor>();
+  auto dims = framework::make_ddim(shape);
+  tensor->Resize(dims);
+  platform::CPUPlace place;
+  platform::CPUDeviceContext ctx(place);
+  inference::tensorrt::RandomizeTensor(tensor, place, ctx);
+}
+
+void AddTensorToBlockDesc(framework::proto::BlockDesc* block,
+                          const std::string& name,
+                          const std::vector<int64_t>& shape) {
+  using framework::proto::VarType;
+  auto* var = block->add_vars();
+  framework::VarDesc desc(name);
+  desc.SetType(VarType::LOD_TENSOR);
+  desc.SetDataType(VarType::FP32);
+  desc.SetShape(shape);
+  *var = *desc.Proto();
+}
+
+template <typename T>
+void SetAttr(framework::proto::OpDesc* op, const std::string& name,
+             const T& data);
+
+template <>
+void SetAttr<std::string>(framework::proto::OpDesc* op, const std::string& name,
+                          const std::string& data) {
+  auto* attr = op->add_attrs();
+  attr->set_name(name);
+  attr->set_type(paddle::framework::proto::AttrType::STRING);
+  attr->set_s(data);
+}
+template <>
+void SetAttr<int>(framework::proto::OpDesc* op, const std::string& name,
+                  const int& data) {
+  auto* attr = op->add_attrs();
+  attr->set_name(name);
+  attr->set_type(paddle::framework::proto::AttrType::INT);
+  attr->set_i(data);
+}
+template <>
+void SetAttr<int64_t>(framework::proto::OpDesc* op, const std::string& name,
+                      const int64_t& data) {
+  auto* attr = op->add_attrs();
+  attr->set_name(name);
+  attr->set_type(paddle::framework::proto::AttrType::LONG);
+  attr->set_l(data);
+}
+
+}  // namespace
+
+TEST(TensorRTEngineOp, manual) {
+  framework::ProgramDesc program;
+  auto* block_ = program.Proto()->add_blocks();
+  block_->set_idx(0);
+  block_->set_parent_idx(-1);
+
+  LOG(INFO) << "create block desc";
+  framework::BlockDesc block_desc(&program, block_);
+  LOG(INFO) << "create mul op";
+  auto* mul = block_desc.AppendOp();
+  mul->SetType("mul");
+  mul->SetInput("X", std::vector<std::string>({"x"}));     // 2 x 4
+  mul->SetInput("Y", std::vector<std::string>({"y"}));     // 4 x 6
+  mul->SetOutput("Out", std::vector<std::string>({"z"}));  // 2 x 6
+
+  LOG(INFO) << "create fc op";
+  auto* fc = block_desc.AppendOp();
+  fc->SetType("mul");
+  fc->SetInput("X", std::vector<std::string>({"z"}));
+  fc->SetInput("Y", std::vector<std::string>({"y0"}));     // 6 x 8
+  fc->SetOutput("Out", std::vector<std::string>({"z0"}));  // 2 x 8
+
+  // Set inputs' variable shape in BlockDesc
+  AddTensorToBlockDesc(block_, "x", std::vector<int64_t>({2, 4}));
+  AddTensorToBlockDesc(block_, "y", std::vector<int64_t>({4, 6}));
+  AddTensorToBlockDesc(block_, "y0", std::vector<int64_t>({6, 8}));
+  AddTensorToBlockDesc(block_, "z", std::vector<int64_t>({2, 6}));
+
+  // It is wired, need to copy manually.
+  *block_->add_ops() = *mul->Proto();
+  *block_->add_ops() = *fc->Proto();
+
+  ASSERT_EQ(block_->ops_size(), 2);
+
+  LOG(INFO) << "create tensorrt desc";
+  framework::OpDesc engine_op_desc(nullptr);
+  engine_op_desc.SetType("tensorrt_engine");
+  engine_op_desc.SetInput("Xs", std::vector<std::string>({"x", "y", "y0"}));
+  engine_op_desc.SetOutput("Ys", std::vector<std::string>({"z0"}));
+  SetAttr<std::string>(engine_op_desc.Proto(), "subgraph",
+                       block_->SerializeAsString());
+  SetAttr<int>(engine_op_desc.Proto(), "max_batch", 30);
+  SetAttr<int>(engine_op_desc.Proto(), "max_workspace", 1 << 10);
+
+  LOG(INFO) << "create engine op";
+  auto engine_op = framework::OpRegistry::CreateOp(*engine_op_desc.Proto());
+
+  framework::Scope scope;
+  platform::CPUPlace place;
+  platform::CPUDeviceContext ctx(place);
+  // Prepare variables.
+  CreateCPUTensor(&scope, "x", std::vector<int64_t>({2, 4}));
+  CreateCPUTensor(&scope, "y", std::vector<int64_t>({4, 6}));
+  CreateCPUTensor(&scope, "z", std::vector<int64_t>({2, 6}));
+
+  CreateCPUTensor(&scope, "y0", std::vector<int64_t>({6, 8}));
+  CreateCPUTensor(&scope, "z0", std::vector<int64_t>({2, 8}));
+
+  // Execute them.
+  LOG(INFO) << "engine_op run";
+  engine_op->Run(scope, place);
+}
+
+}  // namespace operators
+}  // namespace paddle
+
+USE_TRT_CONVERTER(mul)
+USE_TRT_CONVERTER(fc)
diff --git a/paddle/fluid/operators/test_send_nccl_id.cc b/paddle/fluid/operators/test_send_nccl_id.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a01a75cbb070f7f835b93f5ddc62f5aad5a5e667
--- /dev/null
+++ b/paddle/fluid/operators/test_send_nccl_id.cc
@@ -0,0 +1,101 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <unistd.h>
+#include <string>
+#include <thread>  // NOLINT
+
+#include "gtest/gtest.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/operators/detail/grpc_client.h"
+#include "paddle/fluid/operators/detail/grpc_server.h"
+#include "paddle/fluid/operators/detail/request_handler_impl.h"
+#include "paddle/fluid/operators/listen_and_serv_op.h"
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/operators/math/selected_rows_functor.h"
+#include "paddle/fluid/platform/nccl_helper.h"
+#include "paddle/fluid/string/printf.h"
+
+USE_NO_KERNEL_OP(listen_and_serv);
+
+namespace f = paddle::framework;
+namespace p = paddle::platform;
+namespace m = paddle::operators::math;
+namespace detail = paddle::operators::detail;
+namespace string = paddle::string;
+
+std::unique_ptr<detail::AsyncGRPCServer> g_rpc_service;
+std::unique_ptr<detail::RequestHandler> g_req_handler;
+
+void StartServer() {
+  f::Scope scope;
+  p::CPUPlace place;
+  scope.Var(NCCL_ID_VARNAME);
+  p::DeviceContextPool& pool = p::DeviceContextPool::Instance();
+  auto& dev_ctx = *pool.Get(p::CPUPlace());
+
+  f::ProgramDesc empty_program;
+  f::Executor executor(dev_ctx.GetPlace());
+  g_req_handler->SetScope(&scope);
+  g_req_handler->SetDevCtx(&dev_ctx);
+  g_req_handler->SetProgram(&empty_program);
+  g_req_handler->SetExecutor(&executor);
+
+  g_rpc_service->RegisterRPC(detail::kRequestSend, g_req_handler.get());
+  g_req_handler->SetRPCServer(g_rpc_service.get());
+
+  std::thread server_thread(
+      std::bind(&detail::AsyncGRPCServer::StartServer, g_rpc_service.get()));
+
+  g_rpc_service->SetCond(detail::kRequestSend);
+  g_rpc_service->WaitBarrier(detail::kRequestSend);
+
+  LOG(INFO) << "got nccl id and stop server...";
+  g_rpc_service->ShutDown();
+  server_thread.join();
+}
+
+TEST(SendNcclId, GrpcServer) {
+  g_req_handler.reset(new detail::RequestSendHandler(true));
+  g_rpc_service.reset(new detail::AsyncGRPCServer("127.0.0.1:0", 1));
+
+  std::thread server_thread(StartServer);
+  g_rpc_service->WaitServerReady();
+
+  f::Scope scope;
+  p::CPUPlace place;
+  p::DeviceContextPool& pool = p::DeviceContextPool::Instance();
+  auto& dev_ctx = *pool.Get(p::CPUPlace());
+
+  auto var = scope.Var(NCCL_ID_VARNAME);
+  auto id = var->GetMutable<ncclUniqueId>();
+  p::dynload::ncclGetUniqueId(id);
+
+  int port = g_rpc_service->GetSelectedPort();
+
+  std::string ep = string::Sprintf("127.0.0.1:%d", port);
+  detail::RPCClient* client =
+      detail::RPCClient::GetInstance<detail::GRPCClient>();
+  LOG(INFO) << "connect to server" << ep;
+  client->AsyncSendVar(ep, dev_ctx, scope, NCCL_ID_VARNAME);
+  client->Wait();
+  client->AsyncSendBatchBarrier(ep);
+  client->Wait();
+
+  server_thread.join();
+  g_rpc_service.reset(nullptr);
+  g_req_handler.reset(nullptr);
+}
diff --git a/paddle/fluid/operators/top_k_op.cc b/paddle/fluid/operators/top_k_op.cc
index 942a5de3f90f20eabe691924a570b61509eccf76..c17d1afc309c65035063348d4934ea1783b018ed 100644
--- a/paddle/fluid/operators/top_k_op.cc
+++ b/paddle/fluid/operators/top_k_op.cc
@@ -48,8 +48,7 @@ class TopkOp : public framework::OperatorWithKernel {
 
 class TopkOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  TopkOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X", "(Tensor) The input of Topk op");
     AddOutput("Out", "(Tensor) The output tensor of Topk op");
     AddOutput("Indices", "(Tensor) The indices of Topk elements of input");
diff --git a/paddle/fluid/operators/top_k_op.h b/paddle/fluid/operators/top_k_op.h
index d44eeae8e6ff9ac87ab093d04e3f5427743f0c08..7ddb82ef6ff063868a4b9b603b8ab89700b9dd13 100644
--- a/paddle/fluid/operators/top_k_op.h
+++ b/paddle/fluid/operators/top_k_op.h
@@ -55,6 +55,9 @@ class TopkKernel : public framework::OpKernel<T> {
     // NOTE: eigen shape doesn't affect paddle tensor.
     eg_input.reshape(flat2dims);
 
+#ifdef PADDLE_WITH_MKLML
+#pragma omp parallel for
+#endif
     for (size_t i = 0; i < row; i++) {
       std::vector<std::pair<T, size_t>> vec;
       for (size_t j = 0; j < col; j++) {
diff --git a/paddle/fluid/operators/transpose_op.cc b/paddle/fluid/operators/transpose_op.cc
index 3555cb68cab97c0cf983f1173c3b4ca9307e4f7d..60556a564c25c08612447ebd47a4b432b8a12d29 100644
--- a/paddle/fluid/operators/transpose_op.cc
+++ b/paddle/fluid/operators/transpose_op.cc
@@ -56,8 +56,7 @@ class TransposeOp : public framework::OperatorWithKernel {
 
 class TransposeOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  TransposeOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput(
         "X",
         "(Tensor) The input tensor, tensors with rank up to 6 are supported.");
diff --git a/paddle/fluid/operators/uniform_random_batch_size_like_op.cc b/paddle/fluid/operators/uniform_random_batch_size_like_op.cc
index 00f00bb403db5e40939a1502b2219fb4d36d58e5..78fee77df8151221459b0afa0d6789bfe82cfda5 100644
--- a/paddle/fluid/operators/uniform_random_batch_size_like_op.cc
+++ b/paddle/fluid/operators/uniform_random_batch_size_like_op.cc
@@ -32,9 +32,8 @@ class UniformRandomBatchSizeLikeOp : public BatchSizeLikeOp {
 };
 
 class UniformRandomBatchSizeLikeOpMaker : public BatchSizeLikeOpMaker {
- public:
-  UniformRandomBatchSizeLikeOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : BatchSizeLikeOpMaker(proto, op_checker) {
+ protected:
+  void Apply() override {
     AddComment(R"DOC(
 Uniform random operator
 
diff --git a/paddle/fluid/operators/uniform_random_op.cc b/paddle/fluid/operators/uniform_random_op.cc
index 3b5cf68dd4f28d23e507058337fe55de9b88d3cd..137ea91caedabc3167146d91b063dbe9e2e2b931 100644
--- a/paddle/fluid/operators/uniform_random_op.cc
+++ b/paddle/fluid/operators/uniform_random_op.cc
@@ -85,8 +85,7 @@ class UniformRandomOp : public framework::OperatorWithKernel {
 
 class UniformRandomOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  UniformRandomOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddOutput("Out", "(Tensor) The output tensor of uniform random op");
     AddComment(R"DOC(
 Uniform random operator.
diff --git a/paddle/fluid/operators/unpool_op.cc b/paddle/fluid/operators/unpool_op.cc
index b3cd87efa21115565b32659cb35fee4b5bed2d4f..1d441b43b14ea194152095874645f8133c423efd 100644
--- a/paddle/fluid/operators/unpool_op.cc
+++ b/paddle/fluid/operators/unpool_op.cc
@@ -20,8 +20,7 @@ namespace operators {
 
 class Unpool2dOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  Unpool2dOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput(
         "X",
         "(Tensor) The input tensor of unpool operator. "
diff --git a/paddle/fluid/operators/warpctc_op.cc b/paddle/fluid/operators/warpctc_op.cc
index 6835a5dd6286ece20c4ce6f3e951ed4b0057012c..e06c8c962f45a4e91b7efed7431571f0fc6870a3 100644
--- a/paddle/fluid/operators/warpctc_op.cc
+++ b/paddle/fluid/operators/warpctc_op.cc
@@ -53,8 +53,7 @@ class WarpCTCOp : public framework::OperatorWithKernel {
 
 class WarpCTCOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  WarpCTCOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("Logits",
              "(LodTensor, default: LoDTensor<float>), the unscaled "
              "probabilities of variable-length sequences, which is a 2-D "
diff --git a/paddle/fluid/operators/warpctc_op.h b/paddle/fluid/operators/warpctc_op.h
index 705cc894c06b207f4e4e45fc771c04fa3cbdf6d5..ab70c1f0592d122ba248a101db487e64c0bdae6f 100644
--- a/paddle/fluid/operators/warpctc_op.h
+++ b/paddle/fluid/operators/warpctc_op.h
@@ -186,8 +186,7 @@ class WarpCTCKernel : public framework::OpKernel<T> {
 
     // warpctc accesses labels in CPU memory
     Tensor warpctc_label;
-    TensorCopy(*label, platform::CPUPlace(), ctx.device_context(),
-               &warpctc_label);
+    TensorCopySync(*label, platform::CPUPlace(), &warpctc_label);
     const int* warpctc_label_data = warpctc_label.data<int>();
     // warpctc stores loss in CPU memory
     Tensor warpctc_loss;
diff --git a/paddle/fluid/operators/while_op.cc b/paddle/fluid/operators/while_op.cc
index 710cc9fc2e716da2e4fd067562a34d312e48b1a1..175c3ac5d79f24e47d21417df8e3eaeb4d5b2335 100644
--- a/paddle/fluid/operators/while_op.cc
+++ b/paddle/fluid/operators/while_op.cc
@@ -68,8 +68,7 @@ class WhileOp : public framework::OperatorBase {
 
 class WhileOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  WhileOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput(kX,
              "A set of variables, which are required by operators inside the "
              "block of While Op.")
diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt
index 598fd4d419078a973647f2f8f20e8a12c8115a8b..b29035bafd34fa81dc6b59691142fe74439202b8 100644
--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
@@ -1,4 +1,4 @@
-proto_library(profiler_proto SRCS profiler.proto)
+proto_library(profiler_proto SRCS profiler.proto DEPS framework_proto)
 py_proto_compile(profiler_py_proto SRCS profiler.proto)
 
 add_custom_target(profiler_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
@@ -49,7 +49,7 @@ nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_
 nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda)
 nv_test(transform_test SRCS transform_test.cu DEPS memory place device_context)
 
-cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto ${GPU_CTX_DEPS})
+cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framework_proto ${GPU_CTX_DEPS})
 cc_library(profiler SRCS profiler.cc DEPS device_context device_tracer)
 cc_test(profiler_test SRCS profiler_test.cc DEPS profiler)
 
diff --git a/paddle/fluid/platform/assert.h b/paddle/fluid/platform/assert.h
index 123d3598f4f4753f70889e415aff0f41b7d212f7..2ce9b31bb81de867ff4ed6ee14afddecd95317b9 100644
--- a/paddle/fluid/platform/assert.h
+++ b/paddle/fluid/platform/assert.h
@@ -17,7 +17,7 @@ limitations under the License. */
 #define STRINGIFY(x) #x
 #define TOSTRING(x) STRINGIFY(x)
 
-#if defined(__APPLE__) && defined(__CUDA_ARCH__) && !defined(NDEBUG)
+#if defined(__CUDA_ARCH__)
 #include <stdio.h>
 #define PADDLE_ASSERT(e)                                           \
   do {                                                             \
@@ -38,6 +38,9 @@ limitations under the License. */
   } while (0)
 #else
 #include <assert.h>
-#define PADDLE_ASSERT(e) assert(e)
+// For cuda, the assertions can affect performance and it is therefore
+// recommended to disable them in production code
+// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#assertion
+#define PADDLE_ASSERT(e) assert((e))
 #define PADDLE_ASSERT_MSG(e, m) assert((e) && (m))
 #endif
diff --git a/paddle/fluid/platform/cuda_device_function.h b/paddle/fluid/platform/cuda_device_function.h
index e81c385727be5c2ba3f02bfbd86168cb4650dfda..ecec4178f2d9937920e52eb74bf9068b84e741a0 100644
--- a/paddle/fluid/platform/cuda_device_function.h
+++ b/paddle/fluid/platform/cuda_device_function.h
@@ -63,6 +63,7 @@ __device__ T reduceSum(T val, int tid, int len) {
     val += platform::CudaShuffleDownSync(mask, val, offset);
 
   if (tid < warpSize) shm[tid] = 0;
+  __syncthreads();
 
   if (tid % warpSize == 0) {
     shm[tid / warpSize] = val;
diff --git a/paddle/fluid/platform/cudnn_helper.h b/paddle/fluid/platform/cudnn_helper.h
index c0d399d078f73743836fc2a0c1d4b1e6b31ecd83..6ea4f8b7cba18ce7f803dbd9b15a7ae70c3055f2 100644
--- a/paddle/fluid/platform/cudnn_helper.h
+++ b/paddle/fluid/platform/cudnn_helper.h
@@ -22,6 +22,8 @@ limitations under the License. */
 #include "paddle/fluid/platform/float16.h"
 #include "paddle/fluid/platform/macros.h"
 
+DECLARE_bool(cudnn_deterministic);
+
 namespace paddle {
 namespace platform {
 
@@ -76,8 +78,44 @@ enum class DataLayout {  // Not use
 enum class PoolingMode {
   kMaximum,
   kAverage,
+  kMaximumDeterministic,
 };
 
+#if CUDNN_VERSION < 6000
+#pragma message "CUDNN version under 6.0 is supported at best effort."
+#pragma message "We strongly encourage you to move to 6.0 and above."
+#pragma message "This message is intended to annoy you enough to update."
+#pragma message \
+    "please see https://docs.nvidia.com/deeplearning/sdk/cudnn-release-notes/"
+
+inline cudnnPoolingMode_t GetPoolingMode(const PoolingMode& mode) {
+  switch (mode) {
+    case PoolingMode::kMaximumDeterministic:
+      return CUDNN_POOLING_MAX;
+    case PoolingMode::kAverage:
+      return CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING;
+    case PoolingMode::kMaximum:
+      return CUDNN_POOLING_MAX;
+    default:
+      PADDLE_THROW("Unexpected pooling mode.");
+  }
+}
+#else
+
+inline cudnnPoolingMode_t GetPoolingMode(const PoolingMode& mode) {
+  switch (mode) {
+    case PoolingMode::kMaximumDeterministic:
+      return CUDNN_POOLING_MAX_DETERMINISTIC;
+    case PoolingMode::kAverage:
+      return CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING;
+    case PoolingMode::kMaximum:
+      return CUDNN_POOLING_MAX;
+    default:
+      PADDLE_THROW("Unexpected pooling mode.");
+  }
+}
+#endif  // CUDNN_VERSION < 6000
+
 template <typename T>
 class CudnnDataType;
 
@@ -293,9 +331,7 @@ class ScopedPoolingDescriptor {
     PADDLE_ENFORCE_EQ(kernel.size(), pads.size());
     PADDLE_ENFORCE_EQ(kernel.size(), strides.size());
     PADDLE_ENFORCE(dynload::cudnnSetPoolingNdDescriptor(
-        desc_, (mode == PoolingMode::kMaximum
-                    ? CUDNN_POOLING_MAX
-                    : CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING),
+        desc_, (GetPoolingMode(mode)),
         CUDNN_PROPAGATE_NAN,  // Always propagate nans.
         kernel.size(), kernel.data(), pads.data(), strides.data()));
     return desc_;
diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index 1f733d71bdfb777d4a2f316a5fefc3c874879862..6c50ab2685c56bafe146c67fe2ef081ee4c55628 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -175,7 +175,6 @@ CUDADeviceContext::~CUDADeviceContext() {
 Place CUDADeviceContext::GetPlace() const { return place_; }
 
 void CUDADeviceContext::Wait() const {
-  std::lock_guard<std::recursive_mutex> guard(mutex_);
   PADDLE_ENFORCE(cudaStreamSynchronize(stream_));
   PADDLE_ENFORCE(cudaGetLastError());
 }
diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h
index a9c1984616bc731e0557f2cb89282423aa9c3bac..6b82d93237b6baa20703c5b54b56f5381dd858df 100644
--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
@@ -100,7 +100,6 @@ class CUDADeviceContext : public DeviceContext {
 
   template <typename Callback>
   void RecordEvent(cudaEvent_t ev, Callback callback) {
-    std::lock_guard<std::recursive_mutex> guard(mutex_);
     callback();
     PADDLE_ENFORCE(cudaEventRecord(ev, stream_));
   }
@@ -110,8 +109,6 @@ class CUDADeviceContext : public DeviceContext {
 
   std::unique_ptr<Eigen::GpuDevice> eigen_device_;
   std::unique_ptr<EigenCudaStreamDevice> eigen_stream_;
-
-  mutable std::recursive_mutex mutex_;
   cudaStream_t stream_;
   cudnnHandle_t cudnn_handle_;
   cublasHandle_t cublas_handle_;
diff --git a/paddle/fluid/platform/device_tracer.cc b/paddle/fluid/platform/device_tracer.cc
index 3870e69ba7fe6c304b7b5a7bbfe23c725f18e43a..b79768eba258835520daae28f1b2e18266f1e43e 100644
--- a/paddle/fluid/platform/device_tracer.cc
+++ b/paddle/fluid/platform/device_tracer.cc
@@ -247,7 +247,6 @@ class DeviceTracerImpl : public DeviceTracer {
   void Enable() {
     std::lock_guard<std::mutex> l(trace_mu_);
     if (enabled_) {
-      fprintf(stderr, "DeviceTracer already enabled\n");
       return;
     }
     EnableActivity();
diff --git a/paddle/fluid/platform/dynload/cublas.h b/paddle/fluid/platform/dynload/cublas.h
index 81acaff87d3c2025cf0d6185a1590b018bfbd83c..25bcda7eedc1ef42f75fb8fd1439f0c8f55015c3 100644
--- a/paddle/fluid/platform/dynload/cublas.h
+++ b/paddle/fluid/platform/dynload/cublas.h
@@ -45,7 +45,7 @@ extern void *cublas_dso_handle;
       std::call_once(cublas_dso_flag, []() {                                 \
         cublas_dso_handle = paddle::platform::dynload::GetCublasDsoHandle(); \
       });                                                                    \
-      void *p_##__name = dlsym(cublas_dso_handle, #__name);                  \
+      static void *p_##__name = dlsym(cublas_dso_handle, #__name);           \
       return reinterpret_cast<FUNC_TYPE>(p_##__name)(args...);               \
     }                                                                        \
   };                                                                         \
diff --git a/paddle/fluid/platform/dynload/cudnn.h b/paddle/fluid/platform/dynload/cudnn.h
index 34d83e395694f55eafca74d63ebf363169ab30e8..77e46fa768b62c277d7b4027de7173e39a5672b4 100644
--- a/paddle/fluid/platform/dynload/cudnn.h
+++ b/paddle/fluid/platform/dynload/cudnn.h
@@ -39,7 +39,7 @@ extern void EnforceCUDNNLoaded(const char* fn_name);
         cudnn_dso_handle = paddle::platform::dynload::GetCUDNNDsoHandle(); \
       });                                                                  \
       EnforceCUDNNLoaded(#__name);                                         \
-      void* p_##__name = dlsym(cudnn_dso_handle, #__name);                 \
+      static void* p_##__name = dlsym(cudnn_dso_handle, #__name);          \
       return reinterpret_cast<cudnn_func>(p_##__name)(args...);            \
     }                                                                      \
   };                                                                       \
diff --git a/paddle/fluid/platform/dynload/cupti.h b/paddle/fluid/platform/dynload/cupti.h
index e64de7c20fc9d145e51cfc4528e321b3c4ec86c8..2ad52bc7d328f1d05b1bf1dcd4bb39a7c67b8179 100644
--- a/paddle/fluid/platform/dynload/cupti.h
+++ b/paddle/fluid/platform/dynload/cupti.h
@@ -45,7 +45,7 @@ extern void *cupti_dso_handle;
       std::call_once(cupti_dso_flag, []() {                                \
         cupti_dso_handle = paddle::platform::dynload::GetCUPTIDsoHandle(); \
       });                                                                  \
-      void *p_##__name = dlsym(cupti_dso_handle, #__name);                 \
+      static void *p_##__name = dlsym(cupti_dso_handle, #__name);          \
       return reinterpret_cast<cuptiFunc>(p_##__name)(args...);             \
     }                                                                      \
   };                                                                       \
diff --git a/paddle/fluid/platform/dynload/curand.h b/paddle/fluid/platform/dynload/curand.h
index 46ad4379d5f9572d415ef1d747077217ae29391e..5b9e0820e0b319fe7a636a57a0029caf038b4db3 100644
--- a/paddle/fluid/platform/dynload/curand.h
+++ b/paddle/fluid/platform/dynload/curand.h
@@ -34,7 +34,7 @@ extern void *curand_dso_handle;
       std::call_once(curand_dso_flag, []() {                                 \
         curand_dso_handle = paddle::platform::dynload::GetCurandDsoHandle(); \
       });                                                                    \
-      void *p_##__name = dlsym(curand_dso_handle, #__name);                  \
+      static void *p_##__name = dlsym(curand_dso_handle, #__name);           \
       return reinterpret_cast<curandFunc>(p_##__name)(args...);              \
     }                                                                        \
   };                                                                         \
diff --git a/paddle/fluid/platform/dynload/nccl.h b/paddle/fluid/platform/dynload/nccl.h
index 37902ae20c5d9d64486232bbd468375c4a50a615..575516f81870fc9f7b92919ffc20a201cb5cbce8 100644
--- a/paddle/fluid/platform/dynload/nccl.h
+++ b/paddle/fluid/platform/dynload/nccl.h
@@ -37,7 +37,7 @@ extern void* nccl_dso_handle;
       std::call_once(nccl_dso_flag, []() {                               \
         nccl_dso_handle = paddle::platform::dynload::GetNCCLDsoHandle(); \
       });                                                                \
-      void* p_##__name = dlsym(nccl_dso_handle, #__name);                \
+      static void* p_##__name = dlsym(nccl_dso_handle, #__name);         \
       return reinterpret_cast<nccl_func>(p_##__name)(args...);           \
     }                                                                    \
   };                                                                     \
diff --git a/paddle/fluid/platform/dynload/tensorrt.h b/paddle/fluid/platform/dynload/tensorrt.h
index f584a49da0fefe0b064b5fb55b01ec132225ce5e..5d67658b94af75680a100e13eed7b6b052162e00 100644
--- a/paddle/fluid/platform/dynload/tensorrt.h
+++ b/paddle/fluid/platform/dynload/tensorrt.h
@@ -40,7 +40,7 @@ extern void* tensorrt_dso_handle;
             paddle::platform::dynload::GetTensorRtDsoHandle();          \
         PADDLE_ENFORCE(tensorrt_dso_handle, "load tensorrt so failed"); \
       });                                                               \
-      void* p_##__name = dlsym(tensorrt_dso_handle, #__name);           \
+      static void* p_##__name = dlsym(tensorrt_dso_handle, #__name);    \
       PADDLE_ENFORCE(p_##__name, "load %s failed", #__name);            \
       return reinterpret_cast<tensorrt_func>(p_##__name)(args...);      \
     }                                                                   \
diff --git a/paddle/fluid/platform/dynload/warpctc.h b/paddle/fluid/platform/dynload/warpctc.h
index 7c70649d21c547beb824576d4a8ecf6219a9bddf..d157c1fda789b98f06ad069d2a9c4f421ff82dcd 100644
--- a/paddle/fluid/platform/dynload/warpctc.h
+++ b/paddle/fluid/platform/dynload/warpctc.h
@@ -40,7 +40,7 @@ extern void* warpctc_dso_handle;
       std::call_once(warpctc_dso_flag, []() {                                  \
         warpctc_dso_handle = paddle::platform::dynload::GetWarpCTCDsoHandle(); \
       });                                                                      \
-      void* p_##_name = dlsym(warpctc_dso_handle, #__name);                    \
+      static void* p_##_name = dlsym(warpctc_dso_handle, #__name);             \
       return reinterpret_cast<warpctcFunc>(p_##_name)(args...);                \
     }                                                                          \
   };                                                                           \
diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h
index 23f1d615daab91f0e4b353bc7d9a3ca7f5cec5ae..de711b7d23ef01d57a62087c552ea090f01f0386 100644
--- a/paddle/fluid/platform/mkldnn_helper.h
+++ b/paddle/fluid/platform/mkldnn_helper.h
@@ -16,6 +16,7 @@ limitations under the License. */
 #include <mkldnn.h>
 #include <vector>
 #include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/platform/place.h"
 
 namespace paddle {
 namespace platform {
@@ -38,6 +39,11 @@ void* to_void_cast(const Type* t) {
   return static_cast<void*>(const_cast<Type*>(t));
 }
 
+template <typename Type>
+void* to_void_reinterpret_cast(const Type* t) {
+  return reinterpret_cast<void*>(const_cast<Type*>(t));
+}
+
 template <class Type>
 using tf_desc = typename Type::desc;
 
@@ -71,5 +77,27 @@ inline bool CanMKLDNNBeUsed(const framework::ExecutionContext& ctx) {
   return use_mkldnn && platform::is_cpu_place(ctx.GetPlace());
 }
 
+template <typename Type>
+mkldnn::memory::data_type MKLDNNGetDataType() {
+  return mkldnn::memory::data_undef;
+}
+
+template <>
+inline mkldnn::memory::data_type MKLDNNGetDataType<float>() {
+  return mkldnn::memory::f32;
+}
+
+inline void Reorder(const mkldnn::memory& src, const mkldnn::memory& dst) {
+  auto reorder_prim = mkldnn::reorder(src, dst);
+  std::vector<mkldnn::primitive> pipeline;
+  pipeline.push_back(reorder_prim);
+  mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
+}
+
+inline mkldnn::memory::format GetMKLDNNFormat(const mkldnn::memory memory) {
+  return static_cast<mkldnn::memory::format>(
+      memory.get_primitive_desc().desc().data.format);
+}
+
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/platform/nccl_helper.h b/paddle/fluid/platform/nccl_helper.h
index 0013597fd516d15c7d502370eec77e1a6a5dca88..6f8e3f22db54d166cf97cfdd3d009058207a7ca5 100644
--- a/paddle/fluid/platform/nccl_helper.h
+++ b/paddle/fluid/platform/nccl_helper.h
@@ -14,12 +14,16 @@
 
 #pragma once
 
+#include <stdio.h>
+#include <string>
 #include <thread>  // NOLINT
 #include <typeindex>
 #include <vector>
 #include "paddle/fluid/platform/dynload/nccl.h"
 #include "paddle/fluid/platform/enforce.h"
 
+#define NCCL_ID_VARNAME "NCCLID"
+
 namespace paddle {
 namespace platform {
 
@@ -50,7 +54,7 @@ class NCCLGroupGuard {
   }
 
   inline ~NCCLGroupGuard() {
-    PADDLE_ENFORCE(dynload::ncclGroupEnd());
+    CHECK_EQ(dynload::ncclGroupEnd(), ncclSuccess);
     NCCLMutex().unlock();
   }
 };
@@ -73,7 +77,9 @@ struct NCCLContextMap {
   std::unordered_map<int, NCCLContext> contexts_;
   std::vector<int> order_;
 
-  explicit NCCLContextMap(const std::vector<platform::Place> &places) {
+  explicit NCCLContextMap(const std::vector<platform::Place> &places,
+                          ncclUniqueId *nccl_id = nullptr,
+                          size_t num_trainers = 1, size_t trainer_id = 0) {
     PADDLE_ENFORCE(!places.empty());
     order_.reserve(places.size());
     for (auto &p : places) {
@@ -85,18 +91,34 @@ struct NCCLContextMap {
         order_.size(), contexts_.size(),
         "NCCL Context Map does not support contain two or more same device");
 
-    if (places.size() > 1) {
-      std::unique_ptr<ncclComm_t[]> comms(new ncclComm_t[order_.size()]);
+    if (places.size() <= 1) {
+      return;
+    }
+    std::unique_ptr<ncclComm_t[]> comms(new ncclComm_t[order_.size()]);
+    // if pass nccl_id here, can assume we are doing multi node training
+    if (nccl_id == nullptr) {
+      std::lock_guard<std::mutex> guard(NCCLGroupGuard::NCCLMutex());
+      PADDLE_ENFORCE(platform::dynload::ncclCommInitAll(
+          comms.get(), static_cast<int>(order_.size()), order_.data()));
+    } else {
+      PADDLE_ENFORCE_GT(num_trainers, 1);
+      // TODO(wuyi): need to ensure each node have same number of GPUs
       {
-        std::lock_guard<std::mutex> guard(NCCLGroupGuard::NCCLMutex());
-        PADDLE_ENFORCE(platform::dynload::ncclCommInitAll(
-            comms.get(), static_cast<int>(order_.size()), order_.data()));
-      }
-      int i = 0;
-      for (auto &dev_id : order_) {
-        contexts_.at(dev_id).comm_ = comms[i++];
+        int nranks = num_trainers * order_.size();
+        NCCLGroupGuard gurad;
+        for (auto &gpu_id : order_) {
+          int rank = trainer_id * order_.size() + gpu_id;
+          VLOG(3) << "init nccl rank: " << rank << " nranks: " << nranks;
+          PADDLE_ENFORCE(cudaSetDevice(gpu_id));
+          PADDLE_ENFORCE(platform::dynload::ncclCommInitRank(
+              comms.get() + gpu_id, nranks, *nccl_id, rank));
+        }
       }
     }
+    int i = 0;
+    for (auto &dev_id : order_) {
+      contexts_.at(dev_id).comm_ = comms[i++];
+    }
   }
 
   NCCLContextMap(const NCCLContextMap &other) = delete;
diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc
index cfddd8e8711f8005e0eff7ef7a2980f535b2f851..01de9d7041bf3eb40884e2a6295027cccfaebd2a 100644
--- a/paddle/fluid/platform/profiler.cc
+++ b/paddle/fluid/platform/profiler.cc
@@ -38,6 +38,7 @@ struct EventList;
 
 static int64_t profiler_lister_id = 0;
 static bool should_send_profile_state = false;
+std::mutex profiler_mu;
 
 // The profiler state, the initial value is ProfilerState::kDisabled
 static ProfilerState g_state = ProfilerState::kDisabled;
@@ -126,6 +127,7 @@ double Event::CpuElapsedMs(const Event& e) const {
 
 double Event::CudaElapsedMs(const Event& e) const {
 #ifdef PADDLE_WITH_CUDA
+  if (!has_cuda_) return 0.0;
   PADDLE_ENFORCE(e.has_cuda() && has_cuda());
   PADDLE_ENFORCE(e.device() == device());
   PADDLE_ENFORCE(cudaEventSynchronize(event_));
@@ -173,8 +175,9 @@ void PopEvent(const std::string& name, const DeviceContext* dev_ctx) {
 }
 
 RecordEvent::RecordEvent(const std::string& name, const DeviceContext* dev_ctx)
-    : start_ns_(PosixInNsec()) {
+    : is_enabled_(false), start_ns_(PosixInNsec()) {
   if (g_state == ProfilerState::kDisabled) return;
+  is_enabled_ = true;
   dev_ctx_ = dev_ctx;
   name_ = name;
   PushEvent(name_, dev_ctx_);
@@ -183,7 +186,7 @@ RecordEvent::RecordEvent(const std::string& name, const DeviceContext* dev_ctx)
 }
 
 RecordEvent::~RecordEvent() {
-  if (g_state == ProfilerState::kDisabled) return;
+  if (g_state == ProfilerState::kDisabled || !is_enabled_) return;
   DeviceTracer* tracer = GetDeviceTracer();
   if (tracer) {
     tracer->AddCPURecords(CurAnnotation(), start_ns_, PosixInNsec(),
@@ -193,14 +196,16 @@ RecordEvent::~RecordEvent() {
   PopEvent(name_, dev_ctx_);
 }
 
-RecordBlock::RecordBlock(int block_id) : start_ns_(PosixInNsec()) {
+RecordBlock::RecordBlock(int block_id)
+    : is_enabled_(false), start_ns_(PosixInNsec()) {
   if (g_state == ProfilerState::kDisabled) return;
+  is_enabled_ = true;
   SetCurBlock(block_id);
   name_ = string::Sprintf("block_%d", block_id);
 }
 
 RecordBlock::~RecordBlock() {
-  if (g_state == ProfilerState::kDisabled) return;
+  if (g_state == ProfilerState::kDisabled || !is_enabled_) return;
   DeviceTracer* tracer = GetDeviceTracer();
   if (tracer) {
     // We try to put all blocks at the same nested depth in the
@@ -225,6 +230,8 @@ void EnableProfiler(ProfilerState state) {
   PADDLE_ENFORCE(state != ProfilerState::kDisabled,
                  "Can't enbale profling, since the input state is ",
                  "ProfilerState::kDisabled");
+
+  std::lock_guard<std::mutex> l(profiler_mu);
   if (state == g_state) {
     return;
   }
@@ -292,7 +299,7 @@ void PrintProfiler(const std::vector<std::vector<EventItem>>& events_table,
   } else if (g_state == ProfilerState::kAll) {
     place = "All";
   } else {
-    PADDLE_THROW("Invalid profiler state");
+    PADDLE_THROW("Invalid profiler state", g_state);
   }
 
   std::cout << "Place: " << place << std::endl;
@@ -440,6 +447,7 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
 
 void DisableProfiler(EventSortingKey sorted_key,
                      const std::string& profile_path) {
+  std::lock_guard<std::mutex> l(profiler_mu);
   if (g_state == ProfilerState::kDisabled) return;
   // Mark the profiling stop.
   Mark("_stop_profiler_", nullptr);
@@ -463,7 +471,7 @@ void SetProfileListener() {
   std::mt19937 rng;
   rng.seed(std::random_device()());
   std::uniform_int_distribution<std::mt19937::result_type> dist6(
-      1, std::numeric_limits<int64_t>::max());
+      1, std::numeric_limits<int>::max());
   profiler_lister_id = dist6(rng);
 }
 int64_t ListenerId() { return profiler_lister_id; }
diff --git a/paddle/fluid/platform/profiler.h b/paddle/fluid/platform/profiler.h
index 61b98143e41abb9e47d2c717c7876f1bab7f5077..bf43925373a12cd9ff2155d68c42d0266ba4df60 100644
--- a/paddle/fluid/platform/profiler.h
+++ b/paddle/fluid/platform/profiler.h
@@ -74,6 +74,7 @@ struct RecordEvent {
 
   ~RecordEvent();
 
+  bool is_enabled_;
   uint64_t start_ns_;
   // The device context is used by Event to get the current cuda stream.
   const DeviceContext* dev_ctx_;
@@ -89,6 +90,7 @@ struct RecordBlock {
   ~RecordBlock();
 
  private:
+  bool is_enabled_;
   std::string name_;
   uint64_t start_ns_;
 };
@@ -114,6 +116,8 @@ void ResetProfiler();
 void DisableProfiler(EventSortingKey sorted_key,
                      const std::string& profile_path);
 
+const int kEnableProfiler = 1;
+const int kDisableProfiler = 2;
 // Test if the profiler is currently enabled.
 bool IsProfileEnabled();
 // Whether the trainer should send profiling state to PS.
diff --git a/paddle/fluid/pybind/const_value.cc b/paddle/fluid/pybind/const_value.cc
index 3f28e616494ad1322708ad6403aaf50b22d724e6..76aa7d2010682416f68e982e9b89da9813abb078 100644
--- a/paddle/fluid/pybind/const_value.cc
+++ b/paddle/fluid/pybind/const_value.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/pybind/const_value.h"
+#include <paddle/fluid/framework/op_proto_maker.h>
 #include "paddle/fluid/framework/operator.h"
 
 namespace paddle {
@@ -23,6 +24,22 @@ void BindConstValue(pybind11::module* m) {
   m->def("kTempVarName", [] { return framework::kTempVarName; });
   m->def("kGradVarSuffix", [] { return framework::kGradVarSuffix; });
   m->def("kZeroVarSuffix", [] { return framework::kZeroVarSuffix; });
+
+  auto op_proto_and_checker_maker =
+      m->def_submodule("op_proto_and_checker_maker");
+
+  pybind11::enum_<framework::OpRole>(op_proto_and_checker_maker, "OpRole")
+      .value("Forward", framework::OpRole::kForward)
+      .value("Backward", framework::OpRole::kBackward)
+      .value("Optimize", framework::OpRole::kOptimize)
+      .value("Loss", framework::OpRole::kLoss)
+      .value("RPC", framework::OpRole::kRPC);
+
+  op_proto_and_checker_maker.def(
+      "kOpRoleAttrName", framework::OpProtoAndCheckerMaker::OpRoleAttrName);
+  op_proto_and_checker_maker.def(
+      "kOpRoleVarAttrName",
+      framework::OpProtoAndCheckerMaker::OpRoleVarAttrName);
 }
 
 }  // namespace pybind
diff --git a/paddle/fluid/pybind/protobuf.cc b/paddle/fluid/pybind/protobuf.cc
index 6471eb3ab7bf05365c0bb2bf68bb74ef9044c527..bcf6d4dd3087060c016e53722cde80704ef2e834 100644
--- a/paddle/fluid/pybind/protobuf.cc
+++ b/paddle/fluid/pybind/protobuf.cc
@@ -238,6 +238,7 @@ void BindVarDsec(pybind11::module *m) {
 
   pybind11::enum_<pd::proto::VarType::Type>(var_desc, "VarType", "")
       .value("BOOL", pd::proto::VarType::BOOL)
+      .value("UINT8", pd::proto::VarType::UINT8)
       .value("INT16", pd::proto::VarType::INT16)
       .value("INT32", pd::proto::VarType::INT32)
       .value("INT64", pd::proto::VarType::INT64)
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index c925686f8382da1758fb7cdc048253290ef69513..669d1bdaa3ec194be817cdc5e1f8484770c70c68 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -117,6 +117,7 @@ PYBIND11_PLUGIN(core) {
       .def("set", PyCPUTensorSetFromArray<int64_t>)
       .def("set", PyCPUTensorSetFromArray<bool>)
       .def("set", PyCPUTensorSetFromArray<uint16_t>)
+      .def("set", PyCPUTensorSetFromArray<uint8_t>)
 #ifdef PADDLE_WITH_CUDA
       .def("set", PyCUDATensorSetFromArray<float>)
       .def("set", PyCUDATensorSetFromArray<int>)
@@ -124,12 +125,14 @@ PYBIND11_PLUGIN(core) {
       .def("set", PyCUDATensorSetFromArray<int64_t>)
       .def("set", PyCUDATensorSetFromArray<bool>)
       .def("set", PyCUDATensorSetFromArray<uint16_t>)
+      .def("set", PyCUDATensorSetFromArray<uint8_t>)
       .def("set", PyCUDAPinnedTensorSetFromArray<float>)
       .def("set", PyCUDAPinnedTensorSetFromArray<int>)
       .def("set", PyCUDAPinnedTensorSetFromArray<double>)
       .def("set", PyCUDAPinnedTensorSetFromArray<int64_t>)
       .def("set", PyCUDAPinnedTensorSetFromArray<bool>)
       .def("set", PyCUDAPinnedTensorSetFromArray<uint16_t>)
+      .def("set", PyCUDAPinnedTensorSetFromArray<uint8_t>)
 #endif
       .def("shape", [](Tensor &self) { return vectorize(self.dims()); })
       .def("set_float_element", TensorSetElement<float>)
@@ -492,22 +495,78 @@ All parameter, weight, gradient are variables in Paddle.
 
   m.def("enable_profiler", platform::EnableProfiler);
   m.def("disable_profiler", platform::DisableProfiler);
+  m.def("is_profiler_enabled", platform::IsProfileEnabled);
   m.def("reset_profiler", platform::ResetProfiler);
 
-  py::class_<ParallelExecutor>(m, "ParallelExecutor")
-      .def("__init__",
-           [](ParallelExecutor &self, size_t num_threads, bool use_event,
-              const std::vector<platform::Place> &places,
-              const std::unordered_set<std::string> &params,
-              const std::unordered_set<std::string> &bcast_vars,
-              const ProgramDesc &main_program, const std::string &loss_var_name,
-              Scope *scope, std::vector<Scope *> &local_scopes,
-              bool allow_op_delay, bool use_default_grad_scale) {
-             new (&self) ParallelExecutor(
-                 num_threads, use_event, places, params, bcast_vars,
-                 main_program, loss_var_name, scope, local_scopes,
-                 allow_op_delay, use_default_grad_scale);
-           })
+  // -- python binds for parallel executor.
+  py::class_<ParallelExecutor> pe(m, "ParallelExecutor");
+  py::class_<ExecutionStrategy>(pe, "ExecutionStrategy")
+      .def(py::init())
+      .def_property(
+          "num_threads",
+          [](const ExecutionStrategy &self) { return self.num_threads_; },
+          [](ExecutionStrategy &self, size_t num_threads) {
+            self.num_threads_ = num_threads;
+          })
+      .def_property(
+          "use_event",
+          [](const ExecutionStrategy &self) { return self.use_event_; },
+          [](ExecutionStrategy &self, bool use_event) {
+            self.use_event_ = use_event;
+          })
+      .def_property(
+          "allow_op_delay",
+          [](const ExecutionStrategy &self) { return self.allow_op_delay_; },
+          [](ExecutionStrategy &self, bool allow_op_delay) {
+            self.allow_op_delay_ = allow_op_delay;
+          })
+      .def_property(
+          "num_iteration_per_drop_scope",
+          [](const ExecutionStrategy &self) {
+            return self.num_iteration_per_drop_scope_;
+          },
+          [](ExecutionStrategy &self, size_t num_iteration_per_drop_scope) {
+            self.num_iteration_per_drop_scope_ = num_iteration_per_drop_scope;
+          });
+  py::class_<BuildStrategy> build_strategy(pe, "BuildStrategy");
+
+  py::enum_<BuildStrategy::ReduceStrategy>(build_strategy, "ReduceStrategy")
+      .value("Reduce", BuildStrategy::ReduceStrategy::kReduce)
+      .value("AllReduce", BuildStrategy::ReduceStrategy::kAllReduce);
+  py::enum_<BuildStrategy::GradientScaleStrategy>(build_strategy,
+                                                  "GradientScaleStrategy")
+      .value("CoeffNumDevice",
+             BuildStrategy::GradientScaleStrategy::kCoeffNumDevice)
+      .value("One", BuildStrategy::GradientScaleStrategy::kOne)
+      .value("Customized", BuildStrategy::GradientScaleStrategy::kCustomized);
+
+  build_strategy.def(py::init())
+      .def_property(
+          "reduce_strategy",
+          [](const BuildStrategy &self) { return self.reduce_; },
+          [](BuildStrategy &self, BuildStrategy::ReduceStrategy strategy) {
+            self.reduce_ = strategy;
+          })
+      .def_property(
+          "gradient_scale_strategy",
+          [](const BuildStrategy &self) { return self.gradient_scale_; },
+          [](BuildStrategy &self,
+             BuildStrategy::GradientScaleStrategy strategy) {
+            self.gradient_scale_ = strategy;
+          })
+      .def_property(
+          "debug_graphviz_path",
+          [](const BuildStrategy &self) { return self.debug_graphviz_path_; },
+          [](BuildStrategy &self, const std::string &path) {
+            self.debug_graphviz_path_ = path;
+          });
+
+  pe.def(py::init<const std::vector<platform::Place> &,
+                  const std::unordered_set<std::string> &,
+                  const std::unordered_set<std::string> &, const ProgramDesc &,
+                  const std::string &, Scope *, std::vector<Scope *> &,
+                  const ExecutionStrategy &, const BuildStrategy &, size_t,
+                  size_t>())
       .def("bcast_params", &ParallelExecutor::BCastParamsToGPUs)
       // NOTE: even we return a vec<Scope*>* to Python use reference policy.
       // We still cannot get local_scope from this vector, since the element
diff --git a/paddle/fluid/recordio/chunk.cc b/paddle/fluid/recordio/chunk.cc
index 82d9aa601cf450b8f90573d6c582bb12ced7a48a..6c65d9160c059ac143ee258b2bdaed5915a1dca1 100644
--- a/paddle/fluid/recordio/chunk.cc
+++ b/paddle/fluid/recordio/chunk.cc
@@ -119,40 +119,56 @@ bool Chunk::Write(std::ostream& os, Compressor ct) const {
 }
 
 bool Chunk::Parse(std::istream& sin) {
-  Header hdr;
-  bool ok = hdr.Parse(sin);
+  ChunkParser parser(sin);
+  if (!parser.Init()) {
+    return false;
+  }
+  Clear();
+  while (parser.HasNext()) {
+    Add(parser.Next());
+  }
+  return true;
+}
+
+ChunkParser::ChunkParser(std::istream& sin) : in_(sin) {}
+bool ChunkParser::Init() {
+  pos_ = 0;
+  bool ok = header_.Parse(in_);
   if (!ok) {
     return ok;
   }
-  auto beg_pos = sin.tellg();
-  uint32_t crc = Crc32Stream(sin, hdr.CompressSize());
-  PADDLE_ENFORCE_EQ(hdr.Checksum(), crc);
-  Clear();
-  sin.seekg(beg_pos, sin.beg);
-  std::unique_ptr<std::istream> compressed_stream;
-  switch (hdr.CompressType()) {
+  auto beg_pos = in_.tellg();
+  uint32_t crc = Crc32Stream(in_, header_.CompressSize());
+  PADDLE_ENFORCE_EQ(header_.Checksum(), crc);
+  in_.seekg(beg_pos, in_.beg);
+
+  switch (header_.CompressType()) {
     case Compressor::kNoCompress:
       break;
     case Compressor::kSnappy:
-      compressed_stream.reset(new snappy::iSnappyStream(sin));
+      compressed_stream_.reset(new snappy::iSnappyStream(in_));
       break;
     default:
       PADDLE_THROW("Not implemented");
   }
+  return true;
+}
 
-  std::istream& stream = compressed_stream ? *compressed_stream : sin;
+bool ChunkParser::HasNext() const { return pos_ < header_.NumRecords(); }
 
-  for (uint32_t i = 0; i < hdr.NumRecords(); ++i) {
-    uint32_t rec_len;
-    stream.read(reinterpret_cast<char*>(&rec_len), sizeof(uint32_t));
-    std::string buf;
-    buf.resize(rec_len);
-    stream.read(&buf[0], rec_len);
-    PADDLE_ENFORCE_EQ(rec_len, stream.gcount());
-    Add(buf);
+std::string ChunkParser::Next() {
+  if (!HasNext()) {
+    return "";
   }
-  return true;
+  ++pos_;
+  std::istream& stream = compressed_stream_ ? *compressed_stream_ : in_;
+  uint32_t rec_len;
+  stream.read(reinterpret_cast<char*>(&rec_len), sizeof(uint32_t));
+  std::string buf;
+  buf.resize(rec_len);
+  stream.read(&buf[0], rec_len);
+  PADDLE_ENFORCE_EQ(rec_len, stream.gcount());
+  return buf;
 }
-
 }  // namespace recordio
 }  // namespace paddle
diff --git a/paddle/fluid/recordio/chunk.h b/paddle/fluid/recordio/chunk.h
index 71a1556a33bfa5c937d6a799d2818cd5a5ef2094..cfb954a591679c2d2c4f42ecd99ca0c8bd1084cf 100644
--- a/paddle/fluid/recordio/chunk.h
+++ b/paddle/fluid/recordio/chunk.h
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #pragma once
+#include <memory>
 #include <string>
 #include <vector>
 
@@ -53,9 +54,20 @@ class Chunk {
   DISABLE_COPY_AND_ASSIGN(Chunk);
 };
 
-size_t CompressData(const char* in, size_t in_length, Compressor ct, char* out);
+class ChunkParser {
+ public:
+  explicit ChunkParser(std::istream& sin);
+
+  bool Init();
+  std::string Next();
+  bool HasNext() const;
 
-void DeflateData(const char* in, size_t in_length, Compressor ct, char* out);
+ private:
+  Header header_;
+  uint32_t pos_{0};
+  std::istream& in_;
+  std::unique_ptr<std::istream> compressed_stream_;
+};
 
 }  // namespace recordio
 }  // namespace paddle
diff --git a/paddle/fluid/recordio/scanner.cc b/paddle/fluid/recordio/scanner.cc
index 88b4d4001bc1b6dc935a9aabc2db5edfb55a60e4..06a13e6c5b6ea76456e231e3f7b1eb33492b16ea 100644
--- a/paddle/fluid/recordio/scanner.cc
+++ b/paddle/fluid/recordio/scanner.cc
@@ -22,35 +22,33 @@ namespace paddle {
 namespace recordio {
 
 Scanner::Scanner(std::unique_ptr<std::istream> &&stream)
-    : stream_(std::move(stream)) {
+    : stream_(std::move(stream)), parser_(*stream_) {
   Reset();
 }
 
-Scanner::Scanner(const std::string &filename) {
-  stream_.reset(new std::ifstream(filename));
+Scanner::Scanner(const std::string &filename)
+    : stream_(new std::ifstream(filename)), parser_(*stream_) {
   Reset();
 }
 
 void Scanner::Reset() {
   stream_->clear();
   stream_->seekg(0, std::ios::beg);
-  ParseNextChunk();
+  parser_.Init();
 }
 
 std::string Scanner::Next() {
-  PADDLE_ENFORCE(!eof_, "StopIteration");
-  auto rec = cur_chunk_.Record(offset_++);
-  if (offset_ == cur_chunk_.NumRecords()) {
-    ParseNextChunk();
+  if (stream_->eof()) {
+    return "";
   }
-  return rec;
-}
 
-void Scanner::ParseNextChunk() {
-  eof_ = !cur_chunk_.Parse(*stream_);
-  offset_ = 0;
+  auto res = parser_.Next();
+  if (!parser_.HasNext() && HasNext()) {
+    parser_.Init();
+  }
+  return res;
 }
 
-bool Scanner::HasNext() const { return !eof_; }
+bool Scanner::HasNext() const { return !stream_->eof(); }
 }  // namespace recordio
 }  // namespace paddle
diff --git a/paddle/fluid/recordio/scanner.h b/paddle/fluid/recordio/scanner.h
index 34f1b0c78d6b5af6072a993579e1866d38c6d009..0d885dd87a2f819ba1d9f76259196f6cfff0b2a0 100644
--- a/paddle/fluid/recordio/scanner.h
+++ b/paddle/fluid/recordio/scanner.h
@@ -37,11 +37,7 @@ class Scanner {
 
  private:
   std::unique_ptr<std::istream> stream_;
-  Chunk cur_chunk_;
-  size_t offset_;
-  bool eof_;
-
-  void ParseNextChunk();
+  ChunkParser parser_;
 };
 }  // namespace recordio
 }  // namespace paddle
diff --git a/paddle/fluid/train/demo/CMakeLists.txt b/paddle/fluid/train/demo/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..78d6e5ff554b9cd9facae85be166a697e0b75337
--- /dev/null
+++ b/paddle/fluid/train/demo/CMakeLists.txt
@@ -0,0 +1,66 @@
+cmake_minimum_required(VERSION 3.0)
+
+project(cpp_train_demo CXX C)
+
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
+
+if(NOT DEFINED PADDLE_LIB)
+  message(FATAL_ERROR "please set PADDLE_LIB with -DPADDLE_LIB=/paddle/lib/dir")
+endif()
+
+option(WITH_MKLDNN     "Compile PaddlePaddle with MKLDNN"                                   OFF)
+option(WITH_MKL        "Compile PaddlePaddle with MKL support, default use openblas."       OFF)
+
+include_directories("${PADDLE_LIB}")
+include_directories("${PADDLE_LIB}/third_party/install/protobuf/include")
+include_directories("${PADDLE_LIB}/third_party/install/glog/include")
+include_directories("${PADDLE_LIB}/third_party/install/gflags/include")
+include_directories("${PADDLE_LIB}/third_party/install/snappy/include")
+include_directories("${PADDLE_LIB}/third_party/install/snappystream/include")
+include_directories("${PADDLE_LIB}/third_party/install/zlib/include")
+
+include_directories("${PADDLE_LIB}/third_party/boost")
+include_directories("${PADDLE_LIB}/third_party/eigen3")
+
+link_directories("${PADDLE_LIB}/third_party/install/snappy/lib")
+link_directories("${PADDLE_LIB}/third_party/install/snappystream/lib")
+link_directories("${PADDLE_LIB}/third_party/install/protobuf/lib")
+link_directories("${PADDLE_LIB}/third_party/install/glog/lib")
+link_directories("${PADDLE_LIB}/third_party/install/gflags/lib")
+link_directories("${PADDLE_LIB}/third_party/install/zlib/lib")
+
+add_executable(demo_trainer demo_trainer.cc)
+
+if(WITH_MKLDNN)
+  include_directories("${PADDLE_LIB}/third_party/install/mkldnn/include")
+  set(MKLDNN_LIB ${PADDLE_LIB}/third_party/install/mkldnn/lib/libmkldnn.so.0)
+endif()
+
+if(WITH_MKL)
+  include_directories("${PADDLE_LIB}/third_party/install/mklml/include")
+  set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/libmklml_intel.so)
+else()
+  if(APPLE)
+    set(MATH_LIB cblas)
+  else(APPLE)
+    set(MATH_LIB ${PADDLE_LIB}/third_party/install/openblas/lib/libopenblas.a)
+  endif(APPLE)
+endif()
+
+if(APPLE)
+  set(MACOS_LD_FLAGS "-undefined dynamic_lookup -Wl,-all_load -framework CoreFoundation -framework Security")
+else(APPLE)
+  set(ARCHIVE_START "-Wl,--whole-archive")
+  set(ARCHIVE_END "-Wl,--no-whole-archive")
+  set(EXTERNAL_LIB "-lrt -ldl -lpthread")
+endif(APPLE)
+
+target_link_libraries(demo_trainer
+        ${MACOS_LD_FLAGS}
+        ${ARCHIVE_START}
+        ${PADDLE_LIB}/paddle/fluid/inference/libpaddle_fluid.a
+        ${ARCHIVE_END}
+        ${MATH_LIB}
+        ${MKLDNN_LIB}
+        glog gflags protobuf snappystream snappy z
+        ${EXTERNAL_LIB})
diff --git a/paddle/fluid/train/demo/README.md b/paddle/fluid/train/demo/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..41b01d33828f750f67bba5f82cb7ed6fe4d4ea0a
--- /dev/null
+++ b/paddle/fluid/train/demo/README.md
@@ -0,0 +1,66 @@
+
+### step 1. build paddle lib
+
+```
+
+# WITH_MKL=ON|OFF
+# WITH_MKLDNN=ON|OFF
+
+PADDLE_LIB=/paddle/lib/dir
+cmake .. -DFLUID_INSTALL_DIR=$PADDLE_LIB \
+         -DCMAKE_BUILD_TYPE=Release \
+         -DWITH_FLUID_ONLY=ON \
+         -DWITH_GPU=OFF \
+         -DWITH_STYLE_CHECK=OFF \
+         -DWITH_MKL=OFF \
+         -DWITH_MKLDNN=OFF
+make -j8
+make -j8 inference_lib_dist
+```
+
+### step 2. generate program desc
+```
+# please install paddle before run this scripe
+pip install --upgrade paddlepaddle-*.whl
+python demo_network.py
+```
+
+This will generate two program desc files:
+  - startup_program: used to init all parameters
+  - main_program: main logic of the network
+
+### step 3. build demo_trainer and run it.
+
+
+```
+# Make a build dir at the same dir of this README.md document.
+# The demo dir can be put anywhere.
+mkdir build
+cd build
+
+# WITH_MKL=ON|OFF
+# WITH_MKLDNN=ON|OFF
+PADDLE_LIB=/paddle/lib/dir
+
+# PADDLE_LIB is the same with FLUID_INSTALL_DIR when building the lib
+cmake .. -DPADDLE_LIB=$PADDLE_LIB \
+         -DWITH_MKLDNN=OFF \
+         -DWITH_MKL=OFF
+make
+
+# copy startup_program and main_program to this dir
+cp ../startup_program .
+cp ../main_program .
+
+# run demo cpp trainer
+./demo_trainer
+
+```
+
+The output will be:
+```
+step: 0 loss: 1069.02
+step: 1 loss: 1069.02
+step: 2 loss: 1069.02
+....
+```
diff --git a/paddle/fluid/train/demo/demo_network.py b/paddle/fluid/train/demo/demo_network.py
new file mode 100644
index 0000000000000000000000000000000000000000..41e98c6a24a750a9300b5c2a6d370303cc0e59c5
--- /dev/null
+++ b/paddle/fluid/train/demo/demo_network.py
@@ -0,0 +1,47 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.fluid as fluid
+import paddle.fluid.framework as framework
+
+
+def train_network(with_optimize):
+    x = fluid.layers.data(name='x', shape=[13], dtype='float32')
+    y_predict = fluid.layers.fc(input=x, size=1, act=None)
+
+    y = fluid.layers.data(name='y', shape=[1], dtype='float32')
+    cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+    avg_cost = fluid.layers.mean(cost)
+
+    if with_optimize:
+        sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.00001)
+        sgd_optimizer.minimize(avg_cost)
+    else:
+        fluid.backward.append_backward(avg_cost)
+
+
+def save_program_desc(network_func):
+    startup_program = framework.Program()
+    train_program = framework.Program()
+
+    with framework.program_guard(train_program, startup_program):
+        network_func(with_optimize=False)
+
+    with open("startup_program", "w") as f:
+        f.write(startup_program.desc.serialize_to_string())
+    with open("main_program", "w") as f:
+        f.write(train_program.desc.serialize_to_string())
+
+
+save_program_desc(train_network)
diff --git a/paddle/fluid/train/demo/demo_trainer.cc b/paddle/fluid/train/demo/demo_trainer.cc
new file mode 100644
index 0000000000000000000000000000000000000000..813d8386868558bd62a9d5670d540ddeddb2b77d
--- /dev/null
+++ b/paddle/fluid/train/demo/demo_trainer.cc
@@ -0,0 +1,103 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <fstream>
+
+#include "paddle/fluid/framework/executor.h"
+#include "paddle/fluid/framework/init.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/framework/tensor_util.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/place.h"
+
+namespace paddle {
+namespace train {
+
+void ReadBinaryFile(const std::string& filename, std::string* contents) {
+  std::ifstream fin(filename, std::ios::in | std::ios::binary);
+  PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s", filename);
+  fin.seekg(0, std::ios::end);
+  contents->clear();
+  contents->resize(fin.tellg());
+  fin.seekg(0, std::ios::beg);
+  fin.read(&(contents->at(0)), contents->size());
+  fin.close();
+}
+
+std::unique_ptr<paddle::framework::ProgramDesc> Load(
+    paddle::framework::Executor* executor, const std::string& model_filename) {
+  VLOG(3) << "loading model from " << model_filename;
+  std::string program_desc_str;
+  ReadBinaryFile(model_filename, &program_desc_str);
+
+  std::unique_ptr<paddle::framework::ProgramDesc> main_program(
+      new paddle::framework::ProgramDesc(program_desc_str));
+  return main_program;
+}
+
+}  // namespace train
+}  // namespace paddle
+
+int main() {
+  paddle::framework::InitDevices(false);
+
+  const auto cpu_place = paddle::platform::CPUPlace();
+
+  paddle::framework::Executor executor(cpu_place);
+  paddle::framework::Scope scope;
+  auto startup_program = paddle::train::Load(&executor, "startup_program");
+  auto train_program = paddle::train::Load(&executor, "main_program");
+
+  std::string loss_name = "";
+  for (auto op_desc : train_program->Block(0).AllOps()) {
+    if (op_desc->Type() == "mean") {
+      loss_name = op_desc->Output("Out")[0];
+      break;
+    }
+  }
+
+  PADDLE_ENFORCE_NE(loss_name, "", "loss not found");
+
+  // init all parameters
+  executor.Run(*startup_program.get(), &scope, 0);
+
+  // prepare data
+  auto x_var = scope.Var("x");
+  auto x_tensor = x_var->GetMutable<paddle::framework::LoDTensor>();
+  x_tensor->Resize({2, 13});
+
+  auto x_data = x_tensor->mutable_data<float>(cpu_place);
+  for (int i = 0; i < 2 * 13; ++i) {
+    x_data[i] = static_cast<float>(i);
+  }
+
+  auto y_var = scope.Var("y");
+  auto y_tensor = y_var->GetMutable<paddle::framework::LoDTensor>();
+  y_tensor->Resize({2, 1});
+  auto y_data = y_tensor->mutable_data<float>(cpu_place);
+  for (int i = 0; i < 2 * 1; ++i) {
+    y_data[i] = static_cast<float>(i);
+  }
+
+  auto loss_var = scope.Var(loss_name);
+
+  for (int i = 0; i < 10; ++i) {
+    executor.Run(*train_program.get(), &scope, 0, false, true);
+    std::cout << "step: " << i << " loss: "
+              << loss_var->Get<paddle::framework::LoDTensor>().data<float>()[0]
+              << std::endl;
+  }
+  return 0;
+}
diff --git a/paddle/function/BlockExpandOp.cpp b/paddle/function/BlockExpandOp.cpp
index aa53853e08716ff0dd8dce7c73766d9543bed2b9..f01f89a7277acc5fe494b92a3e7ca3ca18498c97 100644
--- a/paddle/function/BlockExpandOp.cpp
+++ b/paddle/function/BlockExpandOp.cpp
@@ -33,7 +33,7 @@ namespace paddle {
  * \param outputs[0] Image data of NCHW format.
  */
 class BlockExpandFunction : public FunctionBase {
-public:
+ public:
   void init(const FuncConfig& config) override {
     // function arguments
     strides_ = config.get<std::vector<size_t>>("strides");
@@ -81,7 +81,7 @@ public:
                         (size_t)blockW()});
   }
 
-protected:
+ protected:
   std::vector<size_t> strides_;
   std::vector<size_t> paddings_;
   std::vector<size_t> blocks_;
@@ -101,7 +101,7 @@ protected:
 
 template <DeviceType Device>
 class BlockExpandForward : public BlockExpandFunction {
-public:
+ public:
   void init(const FuncConfig& config) override {
     BlockExpandFunction::init(config);
   }
@@ -149,7 +149,7 @@ public:
 
 template <DeviceType Device>
 class BlockExpandBackward : public BlockExpandFunction {
-public:
+ public:
   void init(const FuncConfig& config) override {
     BlockExpandFunction::init(config);
   }
diff --git a/paddle/function/BufferArg.h b/paddle/function/BufferArg.h
index 89ee09837db69d79bbd678312f02f6dc87e8067c..6de8c94e778c8d1439b2a2aa3c581a5a3cf70261 100644
--- a/paddle/function/BufferArg.h
+++ b/paddle/function/BufferArg.h
@@ -63,12 +63,12 @@ enum ArgType {
   ADD_TO = 2,
 };
 class BufferArg {
-public:
+ public:
   void setArgType(ArgType argType) { argType_ = argType; }
 
   ArgType getArgType() const { return argType_; }
 
-public:
+ public:
   BufferArg(ValueType valueType,
             const TensorShape& shape,
             ArgType argType = UNSPECIFIED)
@@ -169,7 +169,7 @@ public:
   const SequenceArg& sequence() const;
   const SparseMatrixArg& sparse() const;
 
-protected:
+ protected:
   void* buf_;
   ValueType valueType_;
   TensorShape shape_;
@@ -185,7 +185,7 @@ protected:
 // valueType_ = int32
 // if a < b then value_.buf_[a] < value_.buf_[b]
 class SequenceIdArg : public BufferArg {
-public:
+ public:
   SequenceIdArg(const TensorShape& shape, ArgType argType = UNSPECIFIED)
       : BufferArg(VALUE_TYPE_INT32, shape, argType) {
     bufferType_ = TENSOR_SEQUENCE_ID;
@@ -212,7 +212,7 @@ public:
 
   size_t numSeqs() const { return numSeqs_; }
 
-private:
+ private:
   size_t numSeqs_;
 };
 
@@ -222,7 +222,7 @@ private:
 // SequenceArg can be used to represent sequences that contain multiple
 // unequal lengths.
 class SequenceArg : public BufferArg {
-public:
+ public:
   SequenceArg(ValueType valueType,
               const TensorShape& shape,
               ArgType argType = UNSPECIFIED)
@@ -255,7 +255,7 @@ public:
   SequenceIdArg& getSequenceId() { return startPositions_; }
   const SequenceIdArg& getSequenceId() const { return startPositions_; }
 
-private:
+ private:
   SequenceIdArg startPositions_;
 };
 
@@ -263,7 +263,7 @@ private:
 // valueType_ == float or double
 // shape_.ndims() == 2
 class SparseMatrixArg : public BufferArg {
-public:
+ public:
   SparseMatrixArg(void* buf,
                   ValueType valueType,
                   const TensorShape& shape,
@@ -353,7 +353,7 @@ public:
 
   SparseDataType dataType() const { return type_; }
 
-private:
+ private:
   BufferArg row_;
   BufferArg col_;
   size_t nnz_;
diff --git a/paddle/function/CMakeLists.txt b/paddle/function/CMakeLists.txt
index 9b2779b42cad324253dadf27dbff20fd8e8c8e16..29b4ac098e21ee315d5c9b2f2499521d1aa1c322 100644
--- a/paddle/function/CMakeLists.txt
+++ b/paddle/function/CMakeLists.txt
@@ -52,9 +52,3 @@ add_simple_unittest(Im2ColTest)
 add_simple_unittest(GemmConvOpTest)
 add_simple_unittest(DepthwiseConvOpTest)
 endif()
-
-add_style_check_target(paddle_function ${h_files})
-add_style_check_target(paddle_function ${cpp_files})
-if(WITH_GPU)
-    add_style_check_target(paddle_function ${cu_files})
-endif()
diff --git a/paddle/function/ContextProjectionOp.cpp b/paddle/function/ContextProjectionOp.cpp
index 904b0958e6f2c1b8fb8cf56f3cd7d07ad8e24f19..1187842452460ac3fd71f48150fab6467f93dc6c 100644
--- a/paddle/function/ContextProjectionOp.cpp
+++ b/paddle/function/ContextProjectionOp.cpp
@@ -100,7 +100,7 @@ void ContextProjectionForward<DEVICE_TYPE_CPU>(CpuMatrix& out_mat,
  */
 template <DeviceType Device>
 class ContextProjectionForwardFunc : public FunctionBase {
-public:
+ public:
   void init(const FuncConfig& config) override {
     context_length_ = config.get<size_t>("context_length");
     context_start_ = config.get<int>("context_start");
@@ -146,7 +146,7 @@ public:
                                      begin_pad_);
   }
 
-private:
+ private:
   size_t context_length_;
   int context_start_;
   size_t begin_pad_;
@@ -223,7 +223,7 @@ void ContextProjectionBackward<DEVICE_TYPE_CPU>(const CpuMatrix& out_grad_mat,
  */
 template <DeviceType Device>
 class ContextProjectionBackwardFunc : public FunctionBase {
-public:
+ public:
   void init(const FuncConfig& config) override {
     context_length_ = config.get<size_t>("context_length");
     context_start_ = config.get<int>("context_start");
@@ -278,7 +278,7 @@ public:
                                       total_pad_);
   }
 
-private:
+ private:
   size_t context_length_;
   int context_start_;
   size_t begin_pad_;
@@ -299,7 +299,7 @@ private:
  */
 template <DeviceType Device>
 class ContextProjectionBackwardDataFunc : public FunctionBase {
-public:
+ public:
   void init(const FuncConfig& config) override {
     context_length_ = config.get<size_t>("context_length");
     context_start_ = config.get<int>("context_start");
@@ -331,7 +331,7 @@ public:
         out_grad_mat, in_grad_mat, seq_vec, context_length_, context_start_);
   }
 
-private:
+ private:
   size_t context_length_;
   int context_start_;
 };
@@ -348,7 +348,7 @@ private:
  */
 template <DeviceType Device>
 class ContextProjectionBackwardWeightFunc : public FunctionBase {
-public:
+ public:
   void init(const FuncConfig& config) override {
     context_length_ = config.get<size_t>("context_length");
     context_start_ = config.get<int>("context_start");
@@ -382,7 +382,7 @@ public:
                                             begin_pad_);
   }
 
-private:
+ private:
   size_t context_length_;
   int context_start_;
   size_t begin_pad_;
diff --git a/paddle/function/ConvOp.h b/paddle/function/ConvOp.h
index 7d23d0079c8f62b2c8912dfcb9f191c622a60bc9..2d8437bcfe60d1d81897f1c4be1cbfecb5b27fe0 100644
--- a/paddle/function/ConvOp.h
+++ b/paddle/function/ConvOp.h
@@ -56,7 +56,7 @@ namespace paddle {
  *      H and W is height and width of filter.
  */
 class ConvFunctionBase : public FunctionBase {
-public:
+ public:
   void init(const FuncConfig& config) override {
     // function arguments
     strides_ = config.get<std::vector<size_t>>("strides");
@@ -101,7 +101,7 @@ public:
     }
   }
 
-protected:
+ protected:
   size_t getFilterHeight(const TensorShape& filter) const {
     return filter[filter.ndims() - 2];
   }
diff --git a/paddle/function/CosSimOp.cpp b/paddle/function/CosSimOp.cpp
index 81bccc1a9c7d614763a10e3838271b57eef2c603..2c25e1af44965d30591faeccc9a181e36c7e0a0f 100644
--- a/paddle/function/CosSimOp.cpp
+++ b/paddle/function/CosSimOp.cpp
@@ -97,7 +97,7 @@ class CosSimForwardFunc : public FunctionBase {
     CosSimForward<Device>(out_mat, in1_mat, in2_mat, scale_);
   }
 
-private:
+ private:
   real scale_;
 };
 
@@ -227,7 +227,7 @@ class CosSimBackwardFunc : public FunctionBase {
         out_grad, out_val, in1_val, in2_val, in1_grad, in2_grad, scale_);
   }
 
-private:
+ private:
   real scale_;
 };
 
diff --git a/paddle/function/CropOp.cpp b/paddle/function/CropOp.cpp
index 7aa527d21615e19257bd003d0563b5e26b2fcb2f..5bd98910fe838751935f8ef2387ce96e755c6df1 100644
--- a/paddle/function/CropOp.cpp
+++ b/paddle/function/CropOp.cpp
@@ -112,7 +112,7 @@ void CropGrad<DEVICE_TYPE_CPU>(const real* inGrad,
  */
 template <DeviceType Device>
 class CropFunc : public FunctionBase {
-public:
+ public:
   void init(const FuncConfig& config) override { conf_ = config; }
 
   void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
@@ -130,7 +130,7 @@ public:
                  conf_);
   }
 
-private:
+ private:
   FuncConfig conf_;
 };
 
@@ -145,7 +145,7 @@ private:
 
 template <DeviceType Device>
 class CropGradFunc : public FunctionBase {
-public:
+ public:
   void init(const FuncConfig& config) override { conf_ = config; }
 
   void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
@@ -163,7 +163,7 @@ public:
                      conf_);
   }
 
-private:
+ private:
   FuncConfig conf_;
 };
 
diff --git a/paddle/function/CrossMapNormalOp.cpp b/paddle/function/CrossMapNormalOp.cpp
index 75c0fc2a3d047a9162d49809a717629f2270872d..7ff9227e5c2702d9d5334db501730b57ec10bfe3 100644
--- a/paddle/function/CrossMapNormalOp.cpp
+++ b/paddle/function/CrossMapNormalOp.cpp
@@ -160,7 +160,7 @@ void CrossMapNormalGrad<DEVICE_TYPE_CPU>(real* inputsGrad,
  */
 template <DeviceType Device>
 class CrossMapNormalFunc : public FunctionBase {
-public:
+ public:
   void init(const FuncConfig& config) override {
     // function arguments
     size_ = config.get<size_t>("size");
@@ -220,7 +220,7 @@ public:
     return ops;
   }
 
-private:
+ private:
   size_t size_;
   real scale_;
   real pow_;
@@ -260,7 +260,7 @@ private:
  */
 template <DeviceType Device>
 class CrossMapNormalGradFunc : public FunctionBase {
-public:
+ public:
   void init(const FuncConfig& config) override {
     // function arguments
     size_ = config.get<size_t>("size");
@@ -328,7 +328,7 @@ public:
     return ops;
   }
 
-private:
+ private:
   size_t size_;
   real scale_;
   real pow_;
diff --git a/paddle/function/DepthwiseConvOp.cpp b/paddle/function/DepthwiseConvOp.cpp
index 46651345b45e4ced9a3ef3373af437d939a66716..958034e08e60c9a63d1c480bde7c84b760205ae4 100644
--- a/paddle/function/DepthwiseConvOp.cpp
+++ b/paddle/function/DepthwiseConvOp.cpp
@@ -19,7 +19,7 @@ namespace paddle {
 
 template <class T>
 class DepthwiseConvFunctor<DEVICE_TYPE_CPU, T> {
-public:
+ public:
   void operator()(const T* inputData,
                   const T* filterData,
                   int batchSize,
@@ -43,7 +43,7 @@ public:
 
 template <class T>
 class DepthwiseConvGradInputFunctor<DEVICE_TYPE_CPU, T> {
-public:
+ public:
   void operator()(const T* outputGrad,
                   const T* filterData,
                   int batchSize,
@@ -66,7 +66,7 @@ public:
 
 template <class T>
 class DepthwiseConvGradFilterFunctor<DEVICE_TYPE_CPU, T> {
-public:
+ public:
   void operator()(const T* outputGrad,
                   const T* inputData,
                   int batchSize,
@@ -93,7 +93,7 @@ public:
  */
 template <DeviceType Device>
 class DepthwiseConvFunction : public ConvFunctionBase {
-public:
+ public:
   void init(const FuncConfig& config) override {
     ConvFunctionBase::init(config);
   }
@@ -156,7 +156,7 @@ public:
  */
 template <DeviceType Device>
 class DepthwiseConvGradInputFunction : public ConvFunctionBase {
-public:
+ public:
   void init(const FuncConfig& config) override {
     ConvFunctionBase::init(config);
   }
@@ -220,7 +220,7 @@ public:
  */
 template <DeviceType Device>
 class DepthwiseConvGradFilterFunction : public ConvFunctionBase {
-public:
+ public:
   void init(const FuncConfig& config) override {
     ConvFunctionBase::init(config);
   }
diff --git a/paddle/function/DepthwiseConvOp.h b/paddle/function/DepthwiseConvOp.h
index 6700747314fa8377828dab0c436eb4b2053f46f6..7837edd1c071980592b1cf36ecb69a3b7c12cc5e 100644
--- a/paddle/function/DepthwiseConvOp.h
+++ b/paddle/function/DepthwiseConvOp.h
@@ -44,7 +44,7 @@ namespace paddle {
  */
 template <DeviceType Device, class T>
 class DepthwiseConvFunctor {
-public:
+ public:
   void operator()(const T* inputData,
                   const T* filterData,
                   int batchSize,
@@ -89,7 +89,7 @@ public:
  */
 template <DeviceType Device, class T>
 class DepthwiseConvGradInputFunctor {
-public:
+ public:
   void operator()(const T* outputGrad,
                   const T* filterData,
                   int batchSize,
@@ -135,7 +135,7 @@ public:
  */
 template <DeviceType Device, class T>
 class DepthwiseConvGradFilterFunctor {
-public:
+ public:
   void operator()(const T* outputGrad,
                   const T* inputData,
                   int batchSize,
diff --git a/paddle/function/DepthwiseConvOpGpu.cu b/paddle/function/DepthwiseConvOpGpu.cu
index cd1d55a416c84c6327226ffaae4d5d9d5be81038..2c0e71b19b22abac25d273d8bbeddc330e67f8b0 100644
--- a/paddle/function/DepthwiseConvOpGpu.cu
+++ b/paddle/function/DepthwiseConvOpGpu.cu
@@ -199,7 +199,7 @@ __global__ void ConvolutionDepthwiseFilterBackward(const int num_i,
 
 template <class T>
 class DepthwiseConvFunctor<DEVICE_TYPE_GPU, T> {
-public:
+ public:
   void operator()(const T* inputData,
                   const T* filterData,
                   int batchSize,
@@ -249,7 +249,7 @@ public:
 
 template <class T>
 class DepthwiseConvGradInputFunctor<DEVICE_TYPE_GPU, T> {
-public:
+ public:
   void operator()(const T* outputGrad,
                   const T* filterData,
                   int batchSize,
@@ -300,7 +300,7 @@ public:
 
 template <class T>
 class DepthwiseConvGradFilterFunctor<DEVICE_TYPE_GPU, T> {
-public:
+ public:
   void operator()(const T* outputGrad,
                   const T* inputData,
                   int batchSize,
diff --git a/paddle/function/EigenGemm.cpp b/paddle/function/EigenGemm.cpp
index bac4659e62b107dd80ef95dd0907b3da4becffbc..8e9dbbd7a154095a7298bb2f59a82d13a60f9bd3 100644
--- a/paddle/function/EigenGemm.cpp
+++ b/paddle/function/EigenGemm.cpp
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <glog/logging.h>
-#include "unsupported/Eigen/CXX11/Tensor"
+#include "paddle/function/EigenThreadDevice.h"
 
 namespace paddle {
 
@@ -70,25 +70,26 @@ struct EigenBlasGemm {
     dims[0].first = transA ? 0 : 1;
     dims[0].second = transB ? 1 : 0;
 
-    Eigen::DefaultDevice device;
+    auto* device = EigenDeviceWarpper::device();
     if (N == ldc) {
       if (alpha == T(1) && beta == T(0)) {
-        c.device(device) = a.contract(b, dims);
+        c.device(*device) = a.contract(b, dims);
       } else if (alpha == T(1) && beta == T(1)) {
-        c.device(device) += a.contract(b, dims);
+        c.device(*device) += a.contract(b, dims);
       } else {
-        c.device(device) = alpha * a.contract(b, dims) + beta * c;
+        c.device(*device) = alpha * a.contract(b, dims) + beta * c;
       }
     } else {
       if (alpha == T(1) && beta == T(0)) {
-        c.slice(offsetC, extentC).device(device) = a.contract(b, dims);
+        c.slice(offsetC, extentC).device(*device) = a.contract(b, dims);
       } else if (alpha == T(1) && beta == T(1)) {
-        c.slice(offsetC, extentC).device(device) += a.contract(b, dims);
+        c.slice(offsetC, extentC).device(*device) += a.contract(b, dims);
       } else {
-        c.slice(offsetC, extentC).device(device) =
+        c.slice(offsetC, extentC).device(*device) =
             alpha * a.contract(b, dims) + beta * c.slice(offsetC, extentC);
       }
     }
+    EigenDeviceWarpper::free_device(device);
   }
 };
 
diff --git a/paddle/function/EigenThreadDevice.h b/paddle/function/EigenThreadDevice.h
new file mode 100644
index 0000000000000000000000000000000000000000..eb92251c827a26d55ca021c4418182bae28dd6a5
--- /dev/null
+++ b/paddle/function/EigenThreadDevice.h
@@ -0,0 +1,73 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *     Unless required by applicable law or agreed to in writing, software
+ *     distributed under the License is distributed on an "AS IS" BASIS,
+ *     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *     See the License for the specific language governing permissions and
+ *     limitations under the License. */
+
+#pragma once
+
+#if defined(__OSX__) || defined(__APPLE__)
+#include <sys/sysctl.h>
+#include <sys/types.h>
+#endif
+#include "unsupported/Eigen/CXX11/Tensor"
+
+namespace paddle {
+
+#if defined(__ANDROID__)
+int GetCpuCount() {
+  FILE* fp = fopen("/sys/devices/system/cpu/possible", "r");
+  if (!fp) {
+    return 1;
+  }
+  int rank0, rank1;
+  int num = fscanf(fp, "%d-%d", &rank0, &rank1);
+  fclose(fp);
+  if (num < 2) return 1;
+  return rank1 + 1;
+}
+#elif defined(__OSX__) || defined(__APPLE__)
+int GetCpuCount() {
+  int count = 0;
+  size_t len = sizeof(int);
+  sysctlbyname("hw.ncpu", &count, &len, NULL, 0);
+  return count > 0 ? count : 1;
+}
+#else
+int GetCpuCount() { return 1; }
+#endif
+
+class EigenDeviceWarpper {
+ public:  // NOLINT
+#if EIGEN_USE_THREADS
+  static Eigen::ThreadPoolDevice* device() {
+    const int num_cpus = GetCpuCount();
+    const int num_threads = (num_cpus > 2) ? 2 : num_cpus;
+    static Eigen::ThreadPool tp(num_threads);
+    static Eigen::ThreadPoolDevice* device =
+        new Eigen::ThreadPoolDevice(&tp, num_threads);
+    return device;
+  }
+
+  static void free_device(Eigen::ThreadPoolDevice* device) {
+    // do nothing
+  }
+#else
+  static Eigen::DefaultDevice* device() {
+    Eigen::DefaultDevice* device = new Eigen::DefaultDevice;
+    return device;
+  }
+
+  static void free_device(Eigen::DefaultDevice* device) { delete device; }
+#endif
+};
+
+}  // namespace paddle
diff --git a/paddle/function/Function.h b/paddle/function/Function.h
index 01288ef92e7b59d7958e6e23daf641b30a60eed1..a6c14ef29b760faa393c37bd2357824a061c7b38 100644
--- a/paddle/function/Function.h
+++ b/paddle/function/Function.h
@@ -29,7 +29,7 @@ namespace paddle {
  * The argument type of Function::init.
  */
 class FuncConfig {
-public:
+ public:
   template <typename T>
   T get(const std::string& key, Error* err = nullptr) const {
     try {
@@ -59,7 +59,7 @@ public:
     return *this;
   }
 
-protected:
+ protected:
   mutable std::unordered_map<std::string, any> valueMap_;
 };
 
@@ -77,7 +77,7 @@ protected:
  * in the BufferArgs life time.
  */
 class BufferArgs {
-public:
+ public:
   BufferArgs() {}
 
   ~BufferArgs() {
@@ -137,7 +137,7 @@ public:
 
   void addArg(SparseMatrixArg& arg) { args_.push_back(&arg); }
 
-private:
+ private:
   std::vector<BufferArg*> args_;
   // The BufferArg object is constructed and freed by BufferArgs.
   std::vector<BufferArg*> _args_;
@@ -163,7 +163,7 @@ private:
  * If Function has more than one output, each output can have different modes.
  */
 class FunctionBase {
-public:
+ public:
   virtual ~FunctionBase() {}
 
   virtual void init(const FuncConfig& config) {}
@@ -192,7 +192,7 @@ public:
 
   static ClassRegistrar<FunctionBase> funcRegistrar_;
 
-protected:
+ protected:
   // numInputs_ and numOutputs_ represents the maximum
   // input and output supported by Function.
   // Some functions are optimized for input and output,
diff --git a/paddle/function/FunctionTest.h b/paddle/function/FunctionTest.h
index 56c3537b6a96c8042d172f8aca2163fa18c813c1..14003d2c885c8f846f9445ad8844869c9112816e 100644
--- a/paddle/function/FunctionTest.h
+++ b/paddle/function/FunctionTest.h
@@ -39,7 +39,7 @@ struct Allocator<DEVICE_TYPE_GPU> {
 // Copy argument1 to argument2
 template <DeviceType DType1, DeviceType DType2>
 class CopyArgument {
-public:
+ public:
   void operator()(const BufferArg& arg1, BufferArg& arg2) {
     CHECK_EQ(arg1.valueType(), arg2.valueType());
     CHECK_LE(arg1.shape().getElements(), arg2.shape().getElements());
@@ -95,7 +95,7 @@ public:
  */
 template <DeviceType DType1, DeviceType DType2>
 class Compare2Function {
-public:
+ public:
   typedef typename test::Allocator<DType1>::type Allocator1;
   typedef typename test::Allocator<DType2>::type Allocator2;
   typedef typename Tensor<real, DType1>::Vector Vector1;
@@ -305,7 +305,7 @@ public:
 
   std::shared_ptr<FunctionBase> getFunction2() const { return function2_; }
 
-protected:
+ protected:
   // only init cpu argument, gpu argument copy from cpu argument.
   void initArg(BufferArg& arg) {
     Vector1 vector(arg.shape().getElements(), (real*)arg.data());
@@ -381,7 +381,7 @@ protected:
     }
   }
 
-protected:
+ protected:
   std::shared_ptr<FunctionBase> function1_;
   std::shared_ptr<FunctionBase> function2_;
   std::vector<std::shared_ptr<Allocator1>> func1Memory_;
@@ -400,7 +400,7 @@ protected:
 
 class CpuGpuFuncCompare
     : public Compare2Function<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> {
-public:
+ public:
   CpuGpuFuncCompare(const std::string& name, const FuncConfig& config)
       : Compare2Function(name + "-CPU", name + "-GPU", config) {}
 
diff --git a/paddle/function/GemmConvOp.cpp b/paddle/function/GemmConvOp.cpp
index 2b7c6f9eab223c8d6a2107ff4605ac6e60295f7d..5b023e2c10e5040a28660d555efceb0e26b40d49 100644
--- a/paddle/function/GemmConvOp.cpp
+++ b/paddle/function/GemmConvOp.cpp
@@ -24,7 +24,7 @@ namespace paddle {
  */
 template <DeviceType Device>
 class GemmConvFunction : public ConvFunctionBase {
-public:
+ public:
   void init(const FuncConfig& config) override {
     ConvFunctionBase::init(config);
   }
@@ -136,7 +136,7 @@ public:
  */
 template <DeviceType Device>
 class GemmConvMobileFunction : public ConvFunctionBase {
-public:
+ public:
   void init(const FuncConfig& config) override {
     ConvFunctionBase::init(config);
   }
@@ -297,7 +297,7 @@ public:
  */
 template <DeviceType Device>
 class GemmConvGradInputFunction : public ConvFunctionBase {
-public:
+ public:
   void init(const FuncConfig& config) override {
     ConvFunctionBase::init(config);
   }
@@ -404,7 +404,7 @@ public:
  */
 template <DeviceType Device>
 class GemmConvGradFilterFunction : public ConvFunctionBase {
-public:
+ public:
   void init(const FuncConfig& config) override {
     ConvFunctionBase::init(config);
   }
diff --git a/paddle/function/Im2Col.h b/paddle/function/Im2Col.h
index 6a0778700037c142d62fdb99667403ade806f7c1..e0ce6918a2a5324a396ade734945cf426b81ab56 100644
--- a/paddle/function/Im2Col.h
+++ b/paddle/function/Im2Col.h
@@ -70,7 +70,7 @@ enum ColFormat { kCFO = 0, kOCF = 1 };
  */
 template <ColFormat Format, DeviceType Device, class T>
 class Im2ColFunctor {
-public:
+ public:
   void operator()(const T* imData,
                   const TensorShape& imShape,
                   T* colData,
@@ -85,7 +85,7 @@ public:
 
 template <ColFormat Format, DeviceType Device, class T>
 class Col2ImFunctor {
-public:
+ public:
   void operator()(T* imData,
                   const TensorShape& imShape,
                   const T* colData,
@@ -100,7 +100,7 @@ public:
 
 template <class T>
 class Im2ColMobileFunctor {
-public:
+ public:
   void operator()(const T* imData,
                   const TensorShape& imShape,
                   T* colData,
diff --git a/paddle/function/Im2ColOp.cpp b/paddle/function/Im2ColOp.cpp
index ad2aed8f3c237cf9c0f7f0dcc4900cac807e25ea..55a3ff98db63ede96094a3d3fdeedf03b573294f 100644
--- a/paddle/function/Im2ColOp.cpp
+++ b/paddle/function/Im2ColOp.cpp
@@ -23,7 +23,7 @@ namespace paddle {
  */
 template <class T>
 class Im2ColFunctor<kCFO, DEVICE_TYPE_CPU, T> {
-public:
+ public:
   void operator()(const T* imData,
                   const TensorShape& imShape,
                   T* colData,
@@ -75,7 +75,7 @@ public:
  */
 template <class T>
 class Col2ImFunctor<kCFO, DEVICE_TYPE_CPU, T> {
-public:
+ public:
   void operator()(T* imData,
                   const TensorShape& imShape,
                   const T* colData,
@@ -130,7 +130,7 @@ template class Col2ImFunctor<kCFO, DEVICE_TYPE_CPU, double>;
  */
 template <class T>
 class Im2ColFunctor<kOCF, DEVICE_TYPE_CPU, T> {
-public:
+ public:
   void operator()(const T* imData,
                   const TensorShape& imShape,
                   T* colData,
@@ -188,7 +188,7 @@ public:
  */
 template <class T>
 class Col2ImFunctor<kOCF, DEVICE_TYPE_CPU, T> {
-public:
+ public:
   void operator()(T* imData,
                   const TensorShape& imShape,
                   const T* colData,
diff --git a/paddle/function/Im2ColOpGpu.cu b/paddle/function/Im2ColOpGpu.cu
index a944a0ee687fefc5e002096b9c5b869495554167..96dd8f528eaa38f9d174ab7c2a5ea5eb96e2a060 100644
--- a/paddle/function/Im2ColOpGpu.cu
+++ b/paddle/function/Im2ColOpGpu.cu
@@ -71,7 +71,7 @@ __global__ void im2col(const T* data_im,
  */
 template <class T>
 class Im2ColFunctor<kCFO, DEVICE_TYPE_GPU, T> {
-public:
+ public:
   void operator()(const T* imData,
                   const TensorShape& imShape,
                   T* colData,
@@ -184,7 +184,7 @@ __global__ void col2im(size_t n,
  */
 template <class T>
 class Col2ImFunctor<kCFO, DEVICE_TYPE_GPU, T> {
-public:
+ public:
   void operator()(T* imData,
                   const TensorShape& imShape,
                   const T* colData,
@@ -292,7 +292,7 @@ __global__ void im2colOCF(const T* imData,
  */
 template <class T>
 class Im2ColFunctor<kOCF, DEVICE_TYPE_GPU, T> {
-public:
+ public:
   void operator()(const T* imData,
                   const TensorShape& imShape,
                   T* colData,
@@ -399,7 +399,7 @@ __global__ void col2imOCF(T* imData,
  */
 template <class T>
 class Col2ImFunctor<kOCF, DEVICE_TYPE_GPU, T> {
-public:
+ public:
   void operator()(T* imData,
                   const TensorShape& imShape,
                   const T* colData,
diff --git a/paddle/function/MulOp.cpp b/paddle/function/MulOp.cpp
index 90cd4a2b6d1bfb2529e1c966cf7a1fb904a844d7..7bf36c8050a8c33d836ce98dc7f3cf6d3de38d55 100644
--- a/paddle/function/MulOp.cpp
+++ b/paddle/function/MulOp.cpp
@@ -240,7 +240,7 @@ void MulOp<DEVICE_TYPE_CPU>(CpuMatrix& out,
  */
 template <DeviceType Device>
 class MulFunc : public FunctionBase {
-public:
+ public:
   void init(const FuncConfig& config) override {
     aTrans_ = config.get<bool>("aTrans");
     bTrans_ = config.get<bool>("bTrans");
@@ -335,7 +335,7 @@ public:
     }
   }
 
-private:
+ private:
   bool aTrans_;
   bool bTrans_;
 };
diff --git a/paddle/function/NaiveConvOp.cpp b/paddle/function/NaiveConvOp.cpp
index 22d3b33d0f4a730691234c6c742978abd72294a6..99c8b81acbbb16a91bc0faa1c7f2873fa94ab108 100644
--- a/paddle/function/NaiveConvOp.cpp
+++ b/paddle/function/NaiveConvOp.cpp
@@ -24,7 +24,7 @@ namespace paddle {
  */
 template <class T>
 class NaiveConvFunctor {
-public:
+ public:
   void operator()(const T* inputData,
                   size_t batchSize,
                   size_t inputChannels,
@@ -85,7 +85,7 @@ public:
 
 template <DeviceType Device>
 class NaiveConvFunction : public ConvFunctionBase {
-public:
+ public:
   void init(const FuncConfig& config) override {
     ConvFunctionBase::init(config);
   }
diff --git a/paddle/function/PadOp.cpp b/paddle/function/PadOp.cpp
index db6dd518ca5df9d852e545b37f61f1141c81f57c..5d7515e8c053439b95fb18de3c8ffe70705600a3 100644
--- a/paddle/function/PadOp.cpp
+++ b/paddle/function/PadOp.cpp
@@ -132,7 +132,7 @@ static inline PadConf castToPadConf(const FuncConfig& conf) {
 
 template <DeviceType Device>
 class PadFunc : public FunctionBase {
-public:
+ public:
   void init(const FuncConfig& config) override { pad_ = castToPadConf(config); }
 
   void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
@@ -157,7 +157,7 @@ public:
                 pad_);
   }
 
-private:
+ private:
   PadConf pad_;
 };
 
@@ -173,7 +173,7 @@ private:
 
 template <DeviceType Device>
 class PadGradFunc : public FunctionBase {
-public:
+ public:
   void init(const FuncConfig& config) override { pad_ = castToPadConf(config); }
 
   void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
@@ -201,7 +201,7 @@ public:
                     pad_);
   }
 
-private:
+ private:
   PadConf pad_;
 };
 
diff --git a/paddle/function/RowConvOp.cpp b/paddle/function/RowConvOp.cpp
index 925860346e1a53065b0fe4ccbd26853afc8898a1..129e9334582fad011c259e8ab8268b00a7fab7b6 100644
--- a/paddle/function/RowConvOp.cpp
+++ b/paddle/function/RowConvOp.cpp
@@ -129,7 +129,7 @@ void RowConvGrad<DEVICE_TYPE_CPU>(const CpuMatrix& outG,
 
 template <DeviceType Device>
 class RowConvFunc : public FunctionBase {
-public:
+ public:
   void init(const FuncConfig& config) override {}
 
   void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
@@ -176,7 +176,7 @@ public:
 template <DeviceType Device>
 class RowConvGradFunc : public FunctionBase {
   // TODO(qingqing): split into RowConvDataFunc and RowConvWeightFunc
-public:
+ public:
   void init(const FuncConfig& config) override {}
 
   void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
diff --git a/paddle/function/ScaleSubRegionOp.cpp b/paddle/function/ScaleSubRegionOp.cpp
index 6ed6eb2dba477722664ca4a29f4689114f368846..9a06ef2a96f25b5b7326049df2a708637f319561 100644
--- a/paddle/function/ScaleSubRegionOp.cpp
+++ b/paddle/function/ScaleSubRegionOp.cpp
@@ -92,7 +92,7 @@ void ScaleSubRegionGrad<DEVICE_TYPE_CPU>(const real* inGrad,
  */
 template <DeviceType Device>
 class ScaleSubRegionFunc : public FunctionBase {
-public:
+ public:
   void init(const FuncConfig& config) override { conf_ = config; }
 
   void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
@@ -109,7 +109,7 @@ public:
                            conf_);
   }
 
-private:
+ private:
   FuncConfig conf_;
 };
 
@@ -124,7 +124,7 @@ private:
 
 template <DeviceType Device>
 class ScaleSubRegionGradFunc : public FunctionBase {
-public:
+ public:
   void init(const FuncConfig& config) override { conf_ = config; }
 
   void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
@@ -141,7 +141,7 @@ public:
                                conf_);
   }
 
-private:
+ private:
   FuncConfig conf_;
 };
 
diff --git a/paddle/function/SwitchOp.cpp b/paddle/function/SwitchOp.cpp
index 50e1d6c04c54fed5b847aa10dbb253f00cfa42d4..750fb6bf28baf050b1f9f965a1a9b315363e5645 100644
--- a/paddle/function/SwitchOp.cpp
+++ b/paddle/function/SwitchOp.cpp
@@ -75,7 +75,7 @@ void NHWC2NCHW<DEVICE_TYPE_CPU>(real* outputs,
  */
 template <DeviceType Device>
 class NCHW2NHWCFunc : public FunctionBase {
-public:
+ public:
   void init(const FuncConfig& config) override {}
 
   void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
@@ -108,7 +108,7 @@ public:
  */
 template <DeviceType Device>
 class NHWC2NCHWFunc : public FunctionBase {
-public:
+ public:
   void init(const FuncConfig& config) override {}
 
   void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
diff --git a/paddle/function/TensorShape.h b/paddle/function/TensorShape.h
index 02d38c32c007325a928910d136d48214ba5f6bc3..d4d1eae3960c333a2a7dc6099ae7a68677fdcd5f 100644
--- a/paddle/function/TensorShape.h
+++ b/paddle/function/TensorShape.h
@@ -22,7 +22,7 @@ namespace paddle {
  * TensorShape used to represent shape of normal tensor.
  */
 class TensorShape {
-public:
+ public:
   TensorShape() : ndims_(0), nelements_(0) { initDims(0); }
 
   TensorShape(size_t ndims) : ndims_(ndims), nelements_(1) { initDims(ndims); };
@@ -80,7 +80,7 @@ public:
 
   bool operator!=(const TensorShape& t) const { return !(*this == t); }
 
-private:
+ private:
   // compute number of elements
   void numElements() {
     nelements_ = 1;
diff --git a/paddle/function/neon/NeonDepthwiseConv.cpp b/paddle/function/neon/NeonDepthwiseConv.cpp
index d3298c753853ca6d212a619cf8d0bd9356a8dbd7..d7ac83da41aaba5cd38b042d0381dea527f9c42d 100644
--- a/paddle/function/neon/NeonDepthwiseConv.cpp
+++ b/paddle/function/neon/NeonDepthwiseConv.cpp
@@ -21,7 +21,7 @@ namespace paddle {
 
 template <DeviceType Device>
 class NeonDepthwiseConvFunction : public ConvFunctionBase {
-public:
+ public:
   void init(const FuncConfig& config) override {
     ConvFunctionBase::init(config);
   }
@@ -66,18 +66,18 @@ public:
     float* inputPadding = inputData;
     int padInputHeight = inputHeight + 2 * paddingH();
     int padInputWidth = inputWidth + 2 * paddingW();
-    if (paddingH() > 0 || paddingW() > 0) {
-      int newSize = batchSize * inputChannels * padInputHeight * padInputWidth;
-      resizeBuffer<Device>(newSize);
-      inputPadding = reinterpret_cast<float*>(memory_->getBuf());
-      neon::Padding<float>::run(inputData,
-                                inputPadding,
-                                batchSize * inputChannels,
-                                inputHeight,
-                                inputWidth,
-                                padInputHeight,
-                                padInputWidth);
-    }
+    int newSize =
+        batchSize * (inputChannels + 1) * padInputHeight * padInputWidth;
+
+    resizeBuffer<Device>(newSize);
+    inputPadding = reinterpret_cast<float*>(memory_->getBuf());
+    neon::Padding<float>::run(inputData,
+                              inputPadding,
+                              batchSize * inputChannels,
+                              inputHeight,
+                              inputWidth,
+                              padInputHeight,
+                              padInputWidth);
 
     std::function<void(
         const float*, const float*, int, int, int, int, int, int, float*)>
diff --git a/paddle/function/neon/NeonDepthwiseConvTranspose.cpp b/paddle/function/neon/NeonDepthwiseConvTranspose.cpp
index d443d3fa4902f998230651c5c64355d93c4c4f6a..1fc5daf6078bbd5b4506ff2e0832e2cc3ec48fe3 100644
--- a/paddle/function/neon/NeonDepthwiseConvTranspose.cpp
+++ b/paddle/function/neon/NeonDepthwiseConvTranspose.cpp
@@ -21,7 +21,7 @@ namespace paddle {
 
 template <DeviceType Device>
 class NeonDepthwiseConvTransposeFunction : public ConvFunctionBase {
-public:
+ public:
   void init(const FuncConfig& config) override {
     ConvFunctionBase::init(config);
   }
diff --git a/paddle/function/nnpack/NNPACKConvOp.cpp b/paddle/function/nnpack/NNPACKConvOp.cpp
index 3cdba4f2ed0dad42035fe2d0de87ad5aeeef20ca..48c997b50d8c73b25c58801c30e597c9d1f3232a 100644
--- a/paddle/function/nnpack/NNPACKConvOp.cpp
+++ b/paddle/function/nnpack/NNPACKConvOp.cpp
@@ -46,7 +46,7 @@ nnp_convolution_algorithm get_nnp_convolution_algorithm(
 
 template <DeviceType Device>
 class NNPACKConvFunction : public ConvFunctionBase {
-public:
+ public:
   void init(const FuncConfig& config) override {
     ConvFunctionBase::init(config);
     algorithm_ = get_nnp_convolution_algorithm(config.get<std::string>("algo"));
@@ -231,7 +231,7 @@ public:
     }
   }
 
-private:
+ private:
   nnp_convolution_algorithm algorithm_;
   nnp_convolution_transform_strategy transform_strategy_;
   void* workspaceBuffer_;
diff --git a/paddle/gserver/CMakeLists.txt b/paddle/gserver/CMakeLists.txt
index 3d6ced713f00bd72622d8aeed3967642b6774ffe..6dc877dd90ee2ae3d99406299a9244eb3e3d7b53 100644
--- a/paddle/gserver/CMakeLists.txt
+++ b/paddle/gserver/CMakeLists.txt
@@ -146,8 +146,6 @@ else()
         ${GSERVER_SOURCES})
 endif()
 
-add_style_check_target(paddle_gserver ${GSERVER_SOURCES})
-add_style_check_target(paddle_gserver ${GSERVER_HEADER})
 add_dependencies(paddle_gserver paddle_proto ${external_project_dependencies})
 if(WITH_TESTING)
     add_subdirectory(tests)
diff --git a/paddle/gserver/activations/ActivationFunction.cpp b/paddle/gserver/activations/ActivationFunction.cpp
index 8d8f01234fe3859989e44fe6147105fb72b832ff..71c238fbfe9f32f3764601ebb441336931f8ef5f 100644
--- a/paddle/gserver/activations/ActivationFunction.cpp
+++ b/paddle/gserver/activations/ActivationFunction.cpp
@@ -44,10 +44,10 @@ static ClassRegistrar<ActivationFunction> gActivationRegistrar;
  */
 #define BEGIN_DEFINE_ACTIVATION(ACTIVATION_NAME)                             \
   class ACTIVATION_CLASS_NAME(ACTIVATION_NAME) : public ActivationFunction { \
-  private:                                                                   \
+   private:                                                                  \
     static const std::string name;                                           \
                                                                              \
-  public:                                                                    \
+   public:                                                                   \
     const std::string& getName() const { return name; }
 /**
  * @def END_DEFINE_ACTIVATION
@@ -70,7 +70,7 @@ static ClassRegistrar<ActivationFunction> gActivationRegistrar;
  * Do nothing when forward/backward.
  */
 class IdentityActivation : public ActivationFunction {
-public:
+ public:
   static const std::string name;
   Error __must_check forward(Argument& act) {
     (void)act;
diff --git a/paddle/gserver/activations/ActivationFunction.h b/paddle/gserver/activations/ActivationFunction.h
index 0f4b0fe0abb85403d42fc8a2ac28560e10058c20..8e2e144769f2e668a9a8f02890d29c4a7fe128a3 100644
--- a/paddle/gserver/activations/ActivationFunction.h
+++ b/paddle/gserver/activations/ActivationFunction.h
@@ -31,7 +31,7 @@ struct Argument;
  *
  */
 class ActivationFunction {
-public:
+ public:
   static ActivationFunction* create(const std::string& type);
   static std::vector<std::string> getAllRegisteredTypes();
 
diff --git a/paddle/gserver/activations/MKLDNNActivation.cpp b/paddle/gserver/activations/MKLDNNActivation.cpp
index 56ffb839344aabe43eaae0bd46e6dbf95e4d8f20..672444c6561adbeb78c3c453f12ab6aaedeed646 100644
--- a/paddle/gserver/activations/MKLDNNActivation.cpp
+++ b/paddle/gserver/activations/MKLDNNActivation.cpp
@@ -35,10 +35,10 @@ static ClassRegistrar<ActivationFunction> gMKLDNNActivationRegistrar;
  * @def END_MKLDNN_ACTIVATION
  */
 #define END_MKLDNN_ACTIVATION(ACT_TYPE)                            \
-private:                                                           \
+ private:                                                          \
   static const std::string name;                                   \
                                                                    \
-public:                                                            \
+ public:                                                           \
   const std::string& getName() const { return name; }              \
   }                                                                \
   ;                                                                \
@@ -63,11 +63,11 @@ public:                                                            \
 #define DEFINE_MKLDNN_ELTWISE_ACTIVATION(                            \
     ACT_TYPE, BASE_CLASS, ALPHA, BWD_ALPHA)                          \
   BEGIN_MKLDNN_ACTIVATION(ACT_TYPE, BASE_CLASS)                      \
-private:                                                             \
+ private:                                                            \
   static const float alpha;                                          \
   static const float bwdAlpha;                                       \
                                                                      \
-public:                                                              \
+ public:                                                             \
   float getAlpha() const { return alpha; }                           \
   float getBwdAlpha() const { return bwdAlpha; }                     \
   END_MKLDNN_ACTIVATION(ACT_TYPE)                                    \
diff --git a/paddle/gserver/activations/MKLDNNActivation.h b/paddle/gserver/activations/MKLDNNActivation.h
index 392b32c70dae3728e13ee64f09f135c015c122cf..eece1b9c37e72624dffd119804c65f7bd36e20fb 100644
--- a/paddle/gserver/activations/MKLDNNActivation.h
+++ b/paddle/gserver/activations/MKLDNNActivation.h
@@ -27,7 +27,7 @@ namespace paddle {
  * including mkldnn_relu, mkldnn_elu, mkldnn_tanh, mkldnn_softmax
  */
 class MKLDNNActivation : public ActivationFunction {
-protected:
+ protected:
   // input value element count
   size_t cnt_;
   // should not merge the resetBwd into resetFwd,
@@ -43,7 +43,7 @@ protected:
   std::vector<mkldnn::primitive> pipelineFwd_;
   std::vector<mkldnn::primitive> pipelineBwd_;
 
-public:
+ public:
   MKLDNNActivation() : cnt_(0), needResetBwd_(true) {}
   ~MKLDNNActivation() {}
   static ActivationFunction* create(const std::string& type);
@@ -72,7 +72,7 @@ class MKLDNNEltwiseActivation : public MKLDNNActivation {
   typedef mkldnn::eltwise_backward eltwise_bwd;
   typedef mkldnn::algorithm algorithm;
 
-protected:
+ protected:
   // save the forward primitive desc, which can be used backward
   std::shared_ptr<eltwise_fwd::primitive_desc> fwdPD_;
   // eltwise_bwd need src input value
@@ -80,7 +80,7 @@ protected:
   // use for copy data
   std::shared_ptr<mkldnn::reorder> copyInVal_;
 
-public:
+ public:
   MKLDNNEltwiseActivation() {}
   ~MKLDNNEltwiseActivation() {}
   virtual const std::string& getName() const = 0;
@@ -102,12 +102,12 @@ public:
 class MKLDNNSoftmaxActivation : public MKLDNNActivation {
   typedef mkldnn::softmax_forward softmax_fwd;
 
-private:
+ private:
   // for backward
   MatrixPtr sftMaxSum_;
   MatrixPtr sftMaxDot_;
 
-public:
+ public:
   MKLDNNSoftmaxActivation() {}
   ~MKLDNNSoftmaxActivation() {}
   virtual const std::string& getName() const = 0;
diff --git a/paddle/gserver/dataproviders/DataProvider.h b/paddle/gserver/dataproviders/DataProvider.h
index 4851168abab7179d552648c88923a529d55e6a7e..21822b10c2ebf1d353195794cf8f49e02b64c177 100644
--- a/paddle/gserver/dataproviders/DataProvider.h
+++ b/paddle/gserver/dataproviders/DataProvider.h
@@ -71,7 +71,7 @@ typedef std::shared_ptr<BufferBatch> BufferBatchPtr;
  * @brief Data for batch training a neural network
  */
 class DataBatch {
-public:
+ public:
   DataBatch() : size_(0) { data_.clear(); }
   /**
    * @brief Get batch size
@@ -181,7 +181,7 @@ public:
     }
   }
 
-protected:
+ protected:
   /**
    * @brief batch size
    */
@@ -194,7 +194,7 @@ protected:
 };
 
 class BufferBatch {
-public:
+ public:
   BufferBatch() {
     hlStream_ = HPPL_STREAM_DEFAULT;
     hlEvent_ = NULL;
@@ -235,7 +235,7 @@ public:
   void swap(BufferBatch* bufBatch);
   void clone(DataBatch* srcBatch, bool useGpu);
 
-protected:
+ protected:
   DataBatch* batchData_;
   hl_stream_t hlStream_;
   hl_event_t hlEvent_;
@@ -247,7 +247,7 @@ typedef std::shared_ptr<DataProvider> DataProviderPtr;
 typedef Queue<BufferBatch*> BufferBatchQueue;
 
 class DoubleBuffer {
-public:
+ public:
   DoubleBuffer(DataProvider* dataPool, bool useGpu, int64_t batchSize = 0);
   virtual ~DoubleBuffer();
   void removeOneBatch(DataBatch* dataBatch);
@@ -267,7 +267,7 @@ public:
 
   void setPending(bool pending) { pending_ = pending; }
 
-protected:
+ protected:
   virtual void asyncLoadBatch();
   void insertOneBatch(DataBatch* batch);
 
@@ -290,7 +290,7 @@ protected:
  * one is for input, one is for label.
  */
 class DataProvider {
-public:
+ public:
   static ClassRegistrar<DataProvider, DataConfig, ModelConfig, bool> registrar_;
   static DataProvider* create(const DataConfig& config,
                               const ModelConfig& modelConfig,
@@ -359,7 +359,7 @@ public:
    */
   virtual int64_t getNextBatchInternal(int64_t size, DataBatch* batch) = 0;
 
-protected:
+ protected:
   DataConfig config_;
   bool skipShuffle_;
   float usageRatio_;
@@ -382,7 +382,7 @@ protected:
  * necessary configurations such as stream_names
  */
 class DummyDataProvider : public DataProvider {
-public:
+ public:
   DummyDataProvider(const DataConfig& config, bool useGpu)
       : DataProvider(config, useGpu) {}
   virtual void shuffle() {}
@@ -399,7 +399,7 @@ public:
  * Data provider for one input and one integer label.
  */
 class SimpleDataProviderBase : public DataProvider {
-protected:
+ protected:
   /// sample feature dimension
   int64_t sampleDim_;
   /// the number of samples
@@ -425,7 +425,7 @@ protected:
 
   RWLock lock_;
 
-public:
+ public:
   SimpleDataProviderBase(const DataConfig& config, bool useGpu, bool withInfo);
   ~SimpleDataProviderBase() {}
 
@@ -440,7 +440,7 @@ public:
   /// return the number of samples in the buffer
   int64_t fillBuffer();
 
-protected:
+ protected:
   /**
    * @brief Fill at most size samples into data and label.
    *
@@ -458,12 +458,12 @@ protected:
 };
 
 class SimpleDataProvider : public SimpleDataProviderBase {
-public:
+ public:
   SimpleDataProvider(const DataConfig& config, bool useGpu);
   ~SimpleDataProvider();
   virtual void reset();
 
-protected:
+ protected:
   void loadData(const std::string& fileName);
   void loadDataFile(const std::string& fileName);
   virtual int64_t fillBufferImp(real* data,
@@ -471,7 +471,7 @@ protected:
                                 int* info,
                                 int64_t size);
 
-protected:
+ protected:
   size_t currentSampleIndex_;
   std::vector<int> labels_;
   std::vector<real> data_;
diff --git a/paddle/gserver/dataproviders/DataProviderGroup.h b/paddle/gserver/dataproviders/DataProviderGroup.h
index 768e54fe82bedd6faca5ad9eb2b6f2ee0017dc3d..91c94dc986c7aeb70df25511ce14a5f9c312a159 100644
--- a/paddle/gserver/dataproviders/DataProviderGroup.h
+++ b/paddle/gserver/dataproviders/DataProviderGroup.h
@@ -20,7 +20,7 @@ namespace paddle {
 
 template <class T>
 class DataProviderGroup : public DataProvider {
-protected:
+ protected:
   typedef T ProviderType;
   typedef std::shared_ptr<ProviderType> ProviderPtrType;
   ProviderPtrType provider_;
@@ -29,7 +29,7 @@ protected:
   std::mutex lock_;
   std::unique_ptr<MultiThreadWorker<ProviderType>> loader_;
 
-public:
+ public:
   DataProviderGroup(const DataConfig& config, bool useGpu);
   ~DataProviderGroup() {}
 
@@ -38,7 +38,7 @@ public:
   virtual int64_t getSize() { return -1; }
   virtual int64_t getNextBatchInternal(int64_t size, DataBatch* batch);
 
-private:
+ private:
   void startLoader();
   void stopLoader();
   void forceStopLoader();
diff --git a/paddle/gserver/dataproviders/MultiDataProvider.h b/paddle/gserver/dataproviders/MultiDataProvider.h
index 9a863c896773d71a99e21660fc13e3dd477a0c12..baa1fc019002f86414c9c45734ad65cda916d457 100644
--- a/paddle/gserver/dataproviders/MultiDataProvider.h
+++ b/paddle/gserver/dataproviders/MultiDataProvider.h
@@ -19,10 +19,10 @@ limitations under the License. */
 namespace paddle {
 
 class MultiDataProvider : public DataProvider {
-protected:
+ protected:
   std::vector<std::unique_ptr<DataProvider>> subDataProviders_;
 
-public:
+ public:
   MultiDataProvider(const DataConfig& config,
                     const ModelConfig& modelConfig,
                     bool useGpu);
@@ -33,7 +33,7 @@ public:
   virtual int64_t getNextBatchInternal(int64_t size, DataBatch* batch);
   bool isTestMode() const { return isTestMode_; }
 
-private:
+ private:
   int totalDataRatio_;
   bool isTestMode_;
 };
diff --git a/paddle/gserver/dataproviders/ProtoReader.h b/paddle/gserver/dataproviders/ProtoReader.h
index 786703f4dee4802bb967f9d15fb69ebcbc15d997..08d045226e1ebb014bdd91ebf0e8f0353179b0c8 100644
--- a/paddle/gserver/dataproviders/ProtoReader.h
+++ b/paddle/gserver/dataproviders/ProtoReader.h
@@ -28,7 +28,7 @@ namespace paddle {
  * messages from/to i/ostream.
  */
 class ProtoReader {
-public:
+ public:
   explicit ProtoReader(std::istream* s, bool dataCompression = false) {
     CHECK(s) << "istream pointer is nullptr";
     istreamInput_.reset(new google::protobuf::io::IstreamInputStream(s));
@@ -109,7 +109,7 @@ public:
     return true;
   }
 
-protected:
+ protected:
   std::unique_ptr<google::protobuf::io::ZeroCopyInputStream> istreamInput_;
   std::unique_ptr<google::protobuf::io::GzipInputStream> gzipInput_;
   std::unique_ptr<google::protobuf::io::CodedInputStream> codedInput_;
@@ -144,7 +144,7 @@ protected:
 };
 
 class ProtoWriter {
-public:
+ public:
   explicit ProtoWriter(std::ostream* s, bool dataCompression = false) {
     CHECK(s) << "ostream pointer is nullptr";
     ostreamOutput_.reset(new google::protobuf::io::OstreamOutputStream(s));
@@ -168,7 +168,7 @@ public:
     return ret;
   }
 
-protected:
+ protected:
   std::unique_ptr<google::protobuf::io::ZeroCopyOutputStream> ostreamOutput_;
   std::unique_ptr<google::protobuf::io::GzipOutputStream> gzipOutput_;
   std::unique_ptr<google::protobuf::io::CodedOutputStream> codedOutput_;
diff --git a/paddle/gserver/dataproviders/PyDataProvider.h b/paddle/gserver/dataproviders/PyDataProvider.h
index e53354c9e43ea9dc58fd4bd38a533025b6f17482..da50dd4e2ebb743ef45af319bc713ed7ac3d3e10 100644
--- a/paddle/gserver/dataproviders/PyDataProvider.h
+++ b/paddle/gserver/dataproviders/PyDataProvider.h
@@ -23,7 +23,7 @@ limitations under the License. */
 namespace paddle {
 
 class PyDataProvider : public DataProvider {
-public:
+ public:
   PyDataProvider(const DataConfig& config,
                  bool useGpu,
                  bool loadDataAll = true);
@@ -40,7 +40,7 @@ public:
 
   virtual int64_t getNextBatchInternal(int64_t size, DataBatch* batch);
 
-protected:
+ protected:
   struct ProtoSlot;
   // return false if each each sample is one sequence, i.e., independent
   // of other samples.
@@ -73,7 +73,7 @@ protected:
   void resetSlots();
   void loadData(const std::vector<std::string>& fileList);
 
-protected:
+ protected:
   struct ProtoSlot {
     SlotDef::SlotType type;
     int dim;
diff --git a/paddle/gserver/dataproviders/PyDataProvider2.cpp b/paddle/gserver/dataproviders/PyDataProvider2.cpp
index b4215bb307cc31ce64bb724986b88fdc20bbbf45..54ee091e8f257f76b113d4ca6f8a7c3989c0c1df 100644
--- a/paddle/gserver/dataproviders/PyDataProvider2.cpp
+++ b/paddle/gserver/dataproviders/PyDataProvider2.cpp
@@ -93,7 +93,7 @@ inline std::ostream& operator<<(std::ostream& os, const SlotHeader& header) {
  * prepare step, fill data into argument during fill step.
  */
 class IFieldScanner {
-public:
+ public:
   DISABLE_COPY(IFieldScanner);
   /**
    * Ctor.
@@ -146,7 +146,7 @@ public:
    */
   static IFieldScanner* create(SlotHeader* header);
 
-protected:
+ protected:
   SlotHeader* headerPtr_;
 };
 
@@ -154,7 +154,7 @@ protected:
  * Py Data Provider Cache Interface.
  */
 class IPyDataProviderCache {
-public:
+ public:
   virtual ~IPyDataProviderCache() {}
 
   /**
@@ -193,7 +193,7 @@ public:
  * data. And it support cache strategies.
  */
 class PyDataProvider2 : public DataProvider {
-public:
+ public:
   /**
    * Ctor
    */
@@ -234,7 +234,7 @@ public:
    */
   virtual ~PyDataProvider2() { resetImpl(false); }
 
-private:
+ private:
   void createPyDataObj(const std::string& model,
                        const std::string& className,
                        const std::string& fileListName,
@@ -435,7 +435,7 @@ private:
     exit_ = false;
   }
 
-private:
+ private:
   std::unique_ptr<std::thread> loadThread_;
   std::atomic<bool> exit_;
   std::deque<PyObjectPtr> callingContexts_;
@@ -461,7 +461,7 @@ private:
   static PyObjectPtr zeroTuple_;
 
   class PositionRandom {
-  public:
+   public:
     inline explicit PositionRandom(bool skipRand)
         : eng_(ThreadLocalRandomEngine::get()), skipRand_(skipRand) {}
 
@@ -476,14 +476,14 @@ private:
       }
     }
 
-  private:
+   private:
     std::default_random_engine& eng_;
     std::unique_ptr<std::uniform_int_distribution<size_t>> dist_;
     bool skipRand_;
   };
 
   // DataProvider interface
-public:
+ public:
   /**
    * Resetting the PyDataProvider. May start reading thread here.
    */
@@ -666,7 +666,7 @@ REGISTER_DATA_PROVIDER_EX(py2, PyDataProvider2);
  * Scanner for dense slot.
  */
 class DenseScanner : public IFieldScanner {
-public:
+ public:
   explicit DenseScanner(SlotHeader* ptr) : IFieldScanner(ptr), height_(0) {}
 
   /**
@@ -708,7 +708,7 @@ public:
     ++height_;
   }
 
-private:
+ private:
   size_t height_;
 };
 
@@ -716,7 +716,7 @@ private:
  * Scanner for index slot
  */
 class IndexScanner : public IFieldScanner {
-public:
+ public:
   explicit IndexScanner(SlotHeader* ptr) : IFieldScanner(ptr), cnt_(0) {}
 
   /**
@@ -740,12 +740,12 @@ public:
     CHECK(ok) << "Cannot cast int " << py::repr(obj);
   }
 
-private:
+ private:
   size_t cnt_;
 };
 
 class SparseNonValueScanner : public IFieldScanner {
-public:
+ public:
   explicit SparseNonValueScanner(SlotHeader* ptr)
       : IFieldScanner(ptr), nnz_(0), height_(0) {}
 
@@ -790,7 +790,7 @@ public:
     ++height_;
   }
 
-protected:
+ protected:
   /**
    * Set a single sparse index and value.
    * @param [out] col sparse index
@@ -809,7 +809,7 @@ protected:
 };
 
 class SparseValueScanner : public SparseNonValueScanner {
-public:
+ public:
   explicit SparseValueScanner(SlotHeader* ptr) : SparseNonValueScanner(ptr) {}
 
   virtual void finishPrepare(Argument& argument) {
@@ -817,7 +817,7 @@ public:
         argument.value, height_, headerPtr_->dim, nnz_, FLOAT_VALUE);
   }
 
-protected:
+ protected:
   virtual void setData(int* col, real* dat, PyObject* obj) {
     py::SequenceHelper s(obj);
     SparseNonValueScanner::setData(col, dat, s[0]);
@@ -829,7 +829,7 @@ protected:
  * Sequence Scanner. Scanner for sequence or sub-sequence.
  */
 class SequenceScanner : public IFieldScanner {
-public:
+ public:
   /**
    * Ctor
    * @param innerScanner inner scanner for each timestep or sub-sequence.
@@ -902,7 +902,7 @@ public:
    */
   virtual void finishFill(Argument& argument) { inner_->finishFill(argument); }
 
-protected:
+ protected:
   size_t getSize(PyObject* obj) {
     py::SequenceHelper s(obj);
     auto sc = dynamic_cast<SequenceScanner*>(inner_.get());
@@ -917,7 +917,7 @@ protected:
     }
   }
 
-private:
+ private:
   std::unique_ptr<IFieldScanner> inner_;
   size_t cnt_;
   std::function<ICpuGpuVectorPtr&(Argument&)> getSeqStartPos_;
@@ -969,7 +969,7 @@ IFieldScanner* IFieldScanner::create(SlotHeader* header) {
  * python every pass.
  */
 class NoCacheStrategy : public IPyDataProviderCache {
-public:
+ public:
   virtual bool reset() { return true; }
 
   virtual void drop(std::deque<PyObjectPtr>* data) { data->clear(); }
@@ -984,7 +984,7 @@ public:
  * The rest passes, will load data from memory.
  */
 class CacheOnePassInMemory : public IPyDataProviderCache {
-public:
+ public:
   CacheOnePassInMemory()
       : objPool_(new std::deque<PyObjectPtr>()),
         droppedPool_(new std::deque<PyObjectPtr>()) {}
@@ -1011,7 +1011,7 @@ public:
 
   virtual std::deque<PyObjectPtr>* load() { return objPool_.get(); }
 
-private:
+ private:
   std::unique_ptr<std::deque<PyObjectPtr>> objPool_;
   std::unique_ptr<std::deque<PyObjectPtr>> droppedPool_;
 };
diff --git a/paddle/gserver/evaluators/CTCErrorEvaluator.cpp b/paddle/gserver/evaluators/CTCErrorEvaluator.cpp
index 0f680de776f4755ca5fe83c86ea759d88f93ed01..c6cd41de9a1a22470d8659eb90d1ac2b075b2df9 100644
--- a/paddle/gserver/evaluators/CTCErrorEvaluator.cpp
+++ b/paddle/gserver/evaluators/CTCErrorEvaluator.cpp
@@ -22,7 +22,7 @@ namespace paddle {
  * calculate sequence-to-sequence edit distance
  */
 class CTCErrorEvaluator : public Evaluator {
-private:
+ private:
   MatrixPtr outActivations_;
   int numTimes_, numClasses_, numSequences_, blank_;
   real deletions_, insertions_, substitutions_;
@@ -197,7 +197,7 @@ private:
         (real)seqClassficationError_ / numSequences_;
   }
 
-public:
+ public:
   CTCErrorEvaluator()
       : numTimes_(0),
         numClasses_(0),
diff --git a/paddle/gserver/evaluators/ChunkEvaluator.cpp b/paddle/gserver/evaluators/ChunkEvaluator.cpp
index 755b91d05caf33745e66415e7b111ba348c575d9..a2216293b1ab3a32e9cc903b805ca0aca10d58c1 100644
--- a/paddle/gserver/evaluators/ChunkEvaluator.cpp
+++ b/paddle/gserver/evaluators/ChunkEvaluator.cpp
@@ -77,7 +77,7 @@ class ChunkEvaluator : public Evaluator {
   std::set<int> excludedChunkTypes_;
   mutable std::unordered_map<std::string, real> values_;
 
-public:
+ public:
   virtual void init(const EvaluatorConfig& config) {
     Evaluator::init(config);
     if (config.chunk_scheme() == "IOB") {
@@ -276,7 +276,7 @@ public:
     return "chunk";
   }
 
-private:
+ private:
   void storeLocalValues() const {
     CHECK_GE(numOutputSegments_, 0);
     CHECK_GE(numLabelSegments_, 0);
diff --git a/paddle/gserver/evaluators/DetectionMAPEvaluator.cpp b/paddle/gserver/evaluators/DetectionMAPEvaluator.cpp
index f43ef5dd51407236a3a36b300b33f92a9fad885a..ddb8ebca784db4a83c328ff75f5c50c7aecd7352 100644
--- a/paddle/gserver/evaluators/DetectionMAPEvaluator.cpp
+++ b/paddle/gserver/evaluators/DetectionMAPEvaluator.cpp
@@ -28,7 +28,7 @@ namespace paddle {
  * The config file api is detection_map_evaluator.
  */
 class DetectionMAPEvaluator : public Evaluator {
-public:
+ public:
   DetectionMAPEvaluator()
       : evaluateDifficult_(false), cpuOutput_(nullptr), cpuLabel_(nullptr) {}
 
@@ -132,7 +132,7 @@ public:
     LOG(FATAL) << "Distribute detection evaluation not implemented.";
   }
 
-protected:
+ protected:
   void calcTFPos(const size_t batchSize,
                  const vector<map<size_t, vector<NormalizedBBox>>>& allGTBBoxes,
                  const vector<map<size_t, vector<pair<real, NormalizedBBox>>>>&
@@ -287,7 +287,7 @@ protected:
 
   real getValueImpl() const { return calcMAP(); }
 
-private:
+ private:
   real overlapThreshold_;  // overlap threshold when determining whether matched
   bool evaluateDifficult_;  // whether evaluate difficult ground truth
   size_t backgroundId_;     // class index of background
diff --git a/paddle/gserver/evaluators/Evaluator.cpp b/paddle/gserver/evaluators/Evaluator.cpp
index 79478e7fac63a49c494105d53a6944b4b89e6c63..941fb8fb539d58cca22ecf563d2effa816243c3b 100644
--- a/paddle/gserver/evaluators/Evaluator.cpp
+++ b/paddle/gserver/evaluators/Evaluator.cpp
@@ -38,7 +38,7 @@ void Evaluator::eval(const NeuralNetwork& nn) {
  * The config file api is classification_error_evaluator.
  */
 class ClassificationErrorEvaluator : public Evaluator {
-public:
+ public:
   /*
   ClassificationErrorEvaluator() : totalScore2_(0) {}
 
@@ -124,7 +124,7 @@ public:
   }
 
   // Evaluator interface
-protected:
+ protected:
   std::string getTypeImpl() const { return "classification_error"; }
 };
 
@@ -135,7 +135,7 @@ protected:
  */
 class SequenceClassificationErrorEvaluator
     : public ClassificationErrorEvaluator {
-public:
+ public:
   virtual void updateSamplesNum(const std::vector<Argument>& arguments) {
     numSamples_ += arguments[0].getNumSequences();
   }
@@ -166,7 +166,7 @@ public:
   }
 
   // Evaluator interface
-protected:
+ protected:
   std::string getTypeImpl() const { return "seq_classification_error"; }
 };
 REGISTER_EVALUATOR(seq_classification_error,
@@ -178,7 +178,7 @@ REGISTER_EVALUATOR(seq_classification_error,
  * The config file api is sum_evaluator.
  */
 class SumEvaluator : public Evaluator {
-public:
+ public:
   SumEvaluator() : cpuLabel_(nullptr), cpuWeight_(nullptr) {}
 
   virtual void updateSamplesNum(const std::vector<Argument>& arguments) {
@@ -255,12 +255,12 @@ public:
     mergeResultsOfAllClients(client);
   }
 
-private:
+ private:
   IVectorPtr cpuLabel_;
   MatrixPtr cpuWeight_;
 
   // Evaluator interface
-protected:
+ protected:
   std::string getTypeImpl() const { return "sum"; }
 };
 /**
@@ -274,7 +274,7 @@ protected:
  *
  */
 class ColumnSumEvaluator : public Evaluator {
-public:
+ public:
   explicit ColumnSumEvaluator(int32_t colIdx)
       : colIdx_(colIdx), colNum_(0), sum_(nullptr) {}
 
@@ -368,13 +368,13 @@ public:
     client->reduce(&numSamples_, &numSamples_, 1, FLAGS_trainer_id, 0);
   }
 
-private:
+ private:
   int32_t colIdx_;
   size_t colNum_;
   MatrixPtr sum_; /* cpu matrix */
 
   // Evaluator interface
-protected:
+ protected:
   std::string getTypeImpl() const {
     if (colIdx_ == -1)
       return "last-column-sum";
@@ -1018,7 +1018,7 @@ static InitFunction __reg_type_auc_sum__([]() {
  * The config file api is value_printer_evaluator.
  */
 class ValuePrinter : public NotGetableEvaluator {
-public:
+ public:
   virtual void eval(const NeuralNetwork& nn) {
     for (const std::string& name : config_.input_layers()) {
       nn.getLayer(name)->getOutput().printValueString(LOG(INFO),
@@ -1038,7 +1038,7 @@ REGISTER_EVALUATOR(value_printer, ValuePrinter);
  * The config file api is gradient_printer_evaluator.
  */
 class GradientPrinter : public NotGetableEvaluator {
-public:
+ public:
   virtual void eval(const NeuralNetwork& nn) {
     for (const std::string& name : config_.input_layers()) {
       const Argument& argu = nn.getLayer(name)->getOutput();
@@ -1061,11 +1061,11 @@ REGISTER_EVALUATOR(gradient_printer, GradientPrinter);
  * The config file api is maxid_printer_evaluator.
  */
 class MaxIdPrinter : public NotGetableEvaluator {
-private:
+ private:
   IVectorPtr maxIds_;
   MatrixPtr maxValues_;
 
-public:
+ public:
   MaxIdPrinter() {}
 
   virtual void eval(const NeuralNetwork& nn) {
@@ -1103,12 +1103,12 @@ REGISTER_EVALUATOR(max_id_printer, MaxIdPrinter);
  * The config file api is maxframe_printer_evaluator.
  */
 class MaxFramePrinter : public NotGetableEvaluator {
-private:
+ private:
   IVectorPtr maxIds_;
   MatrixPtr maxValues_;
   MatrixPtr value_;
 
-public:
+ public:
   MaxFramePrinter() {
     value_ =
         Matrix::create(nullptr, /* height= */ 1, 1, /* trans= */ false, false);
@@ -1190,7 +1190,7 @@ REGISTER_EVALUATOR(max_frame_printer, MaxFramePrinter);
  *
  */
 class SequenceTextPrinter : public NotGetableEvaluator {
-private:
+ private:
   /// dict_file, which contains a list of tokens
   std::vector<std::string> dict_;
   /// result_file, which is the output file
@@ -1203,7 +1203,7 @@ private:
   /// store the probability associated with each sequence
   std::vector<MatrixPtr> cpuIn_;
 
-public:
+ public:
   SequenceTextPrinter() {}
 
   virtual void init(const EvaluatorConfig& config) {
@@ -1334,7 +1334,7 @@ REGISTER_EVALUATOR(seq_text_printer, SequenceTextPrinter);
  * The config file api is classification_error_printer_evaluator.
  */
 class ClassificationErrorPrinter : public ClassificationErrorEvaluator {
-public:
+ public:
   virtual void updateSamplesNum(const std::vector<Argument>& arguments) {}
 
   virtual real evalImp(std::vector<Argument>& arguments) {
diff --git a/paddle/gserver/evaluators/Evaluator.h b/paddle/gserver/evaluators/Evaluator.h
index be2032992c455fe2b442dbe05d84128ef8ebf82f..42948f1097d9a12600f4b11646a47e45b9bf4e96 100644
--- a/paddle/gserver/evaluators/Evaluator.h
+++ b/paddle/gserver/evaluators/Evaluator.h
@@ -40,7 +40,7 @@ class NeuralNetwork;
  * has been by a trained model.
  */
 class Evaluator {
-public:
+ public:
   static Evaluator* create(const EvaluatorConfig& config);
 
   Evaluator() : numSamples_(0), totalScore_(0) {}
@@ -172,7 +172,7 @@ public:
     return this->getTypeImpl();
   }
 
-protected:
+ protected:
   /**
    * @brief getValueImpl The simplest way to define getValue result. If this
    * evaluator doesn't contain multiple fields, and do not throw any error, just
@@ -191,7 +191,7 @@ protected:
    */
   virtual std::string getTypeImpl() const { return "base"; }
 
-protected:
+ protected:
   EvaluatorConfig config_;
   double numSamples_;
   double totalScore_;
@@ -204,7 +204,7 @@ protected:
  */
 class NotGetableEvaluator : public Evaluator {
   // Evaluator interface
-public:
+ public:
   void getNames(std::vector<std::string>* names) {}
 
   real getValue(const std::string& name, Error* err) const {
@@ -219,7 +219,7 @@ public:
 };
 
 class DummyEvaluator : public Evaluator {
-public:
+ public:
   DummyEvaluator() {}
   virtual void init(const EvaluatorConfig&) {}
   virtual void start() {}
@@ -232,7 +232,7 @@ public:
   virtual void printStats(std::ostream&) const {}
 
   // Evaluator interface
-protected:
+ protected:
   std::string getTypeImpl() const;
 };
 /**
@@ -251,7 +251,7 @@ protected:
  *
  */
 class AucEvaluator : public Evaluator {
-public:
+ public:
   AucEvaluator(int32_t colIdx)
       : colIdx_(colIdx),
         realColumnIdx_(0),
@@ -269,7 +269,7 @@ public:
 
   virtual void distributeEval(ParameterClient2* client);
 
-private:
+ private:
   static const uint32_t kBinNum_ = (1 << 24) - 1;
   static const int kNegativeLabel_ = 0;
   double statPos_[kBinNum_ + 1];
@@ -292,7 +292,7 @@ private:
   double calcAuc() const;
 
   // Evaluator interface
-protected:
+ protected:
   real getValueImpl() const;
   std::string getTypeImpl() const;
 };
@@ -305,7 +305,7 @@ protected:
  * dense value.
  */
 class RankAucEvaluator : public Evaluator {
-public:
+ public:
   // evaluate ranking AUC
   virtual void start();
 
@@ -317,7 +317,7 @@ public:
     mergeResultsOfAllClients(client);
   }
 
-private:
+ private:
   MatrixPtr output_;
   MatrixPtr click_;
   MatrixPtr pv_;
@@ -329,7 +329,7 @@ private:
                      size_t size);
 
   // Evaluator interface
-protected:
+ protected:
   std::string getTypeImpl() const;
 };
 
@@ -344,7 +344,7 @@ protected:
  * The config file api is precision_recall_evaluator.
  */
 class PrecisionRecallEvaluator : public Evaluator {
-public:
+ public:
   // Evaluate precision, recall and F1 score
   PrecisionRecallEvaluator()
       : isMultiBinaryLabel_(false),
@@ -379,7 +379,7 @@ public:
     StatsInfo() : TP(0.0), TN(0.0), FP(0.0), FN(0.0) {}
   };
 
-private:
+ private:
   bool isMultiBinaryLabel_;
   std::vector<StatsInfo> statsInfo_;
 
@@ -444,7 +444,7 @@ private:
  * The config file api is pnpair_evaluator.
  */
 class PnpairEvaluator : public Evaluator {
-public:
+ public:
   PnpairEvaluator()
       : cpuOutput_(nullptr),
         cpuLabel_(nullptr),
@@ -491,7 +491,7 @@ public:
               << " calc total neg pair: " << pairArray_[1];
   }
 
-private:
+ private:
   static const uint32_t kPairArrayNum_ = 2;
   double pairArray_[kPairArrayNum_];
   MatrixPtr cpuOutput_;
@@ -500,7 +500,7 @@ private:
   MatrixPtr cpuWeight_;
 
   // Evaluator interface
-protected:
+ protected:
   real getValueImpl() const {
     return pairArray_[0] / ((pairArray_[1] <= 0) ? 1.0 : pairArray_[1]);
   }
diff --git a/paddle/gserver/gradientmachines/GradientMachine.h b/paddle/gserver/gradientmachines/GradientMachine.h
index 60936c311d1b0119186c76d5c95b8819294446ce..22cf5d265f429ecbcea1808a54c85d7e89f8bc99 100644
--- a/paddle/gserver/gradientmachines/GradientMachine.h
+++ b/paddle/gserver/gradientmachines/GradientMachine.h
@@ -73,7 +73,7 @@ class GradientMachine;
 typedef std::shared_ptr<GradientMachine> GradientMachinePtr;
 
 class GradientMachine {
-public:
+ public:
   enum CreateMode {
     kNormal = 0,
     kSgdSparseCpuTraining = 3,
@@ -240,7 +240,7 @@ public:
    */
   virtual void releaseOutput() {}
 
-protected:
+ protected:
   virtual void onLoadParameter() {}
 
   std::vector<ParameterPtr> parameters_;
diff --git a/paddle/gserver/gradientmachines/GradientMachineMode.h b/paddle/gserver/gradientmachines/GradientMachineMode.h
index 898b68fbbc329145109ad0ae4b97c872d4f9a37c..dd944a35f8952e354f8e4f3eb5c67b136c5f080e 100644
--- a/paddle/gserver/gradientmachines/GradientMachineMode.h
+++ b/paddle/gserver/gradientmachines/GradientMachineMode.h
@@ -19,14 +19,14 @@ limitations under the License. */
 namespace paddle {
 
 class IGradientMachineMode {
-public:
+ public:
   virtual ~IGradientMachineMode() {}
 
-public:  // interfaces
-         /**
-          * @brief create current mode's gradient machine by model config.
-          * @param config model config
-          */
+ public:  // interfaces
+          /**
+           * @brief create current mode's gradient machine by model config.
+           * @param config model config
+           */
   virtual GradientMachine* create(const ModelConfig& config) = 0;
 
   /**
@@ -55,14 +55,14 @@ public:  // interfaces
    */
   virtual bool needTrainWholeDataInOneBatch() const = 0;
 
-public:  // static methods.
-         /**
-          * @brief register a custom gradient machine mode.
-          * @note For user to register a custom gradient machine mode, id should >=
-          * kCustom.
-          * @param mode mode id.
-          * @param ptr mode description object.
-          */
+ public:  // static methods.
+          /**
+           * @brief register a custom gradient machine mode.
+           * @note For user to register a custom gradient machine mode, id should >=
+           * kCustom.
+           * @param mode mode id.
+           * @param ptr mode description object.
+           */
   static void regGradientMachineMode(
       int32_t mode, std::unique_ptr<IGradientMachineMode>&& ptr) {
     modes_.insert(std::make_pair(mode, std::move(ptr)));
@@ -141,7 +141,7 @@ public:  // static methods.
     }
   }
 
-private:
+ private:
   static std::unordered_map<int32_t, std::unique_ptr<IGradientMachineMode>>
       modes_;
 };
diff --git a/paddle/gserver/gradientmachines/MultiGradientMachine.h b/paddle/gserver/gradientmachines/MultiGradientMachine.h
index 83d2651f34b3698848427f29b1a90e606e57950e..eff7d5284c6dd4898344203b50acc94ae61b4d59 100644
--- a/paddle/gserver/gradientmachines/MultiGradientMachine.h
+++ b/paddle/gserver/gradientmachines/MultiGradientMachine.h
@@ -166,7 +166,7 @@ struct GradBuffer {
  *     the merged gradient to parameter server.
  */
 class MultiGradientMachine : public GradientMachine {
-public:
+ public:
   enum TaskType {
     TASK_FORWARD_BACKWARD = 0,
     TASK_FORWARD = 1,
@@ -213,7 +213,7 @@ public:
   /// The gradietns will be copied to each thread in the computing threads.
   virtual void setOutputGrad(const std::vector<Argument>& args);
 
-protected:
+ protected:
   friend class TrainerThread;
 
   std::vector<TrainerThreadPtr>& getAllThreads() { return threads_; }
@@ -281,7 +281,7 @@ protected:
 
   int paraMainThread(int pid) const { return paraMainThread_[pid]; }
 
-protected:
+ protected:
   virtual void forwardImp(const std::vector<Argument>& inArgs,
                           std::vector<Argument>* outArgs,
                           PassType passType,
@@ -298,7 +298,7 @@ protected:
 
   void allocGradBufs();
 
-protected:
+ protected:
   bool useGpu_;
 
   bool hasNonstaticCpuParamters_;
@@ -342,7 +342,7 @@ protected:
 };
 
 class TrainerThread {
-public:
+ public:
   TrainerThread(const ModelConfig& config,
                 int threadId,
                 MultiGradientMachine* multiMachine);
@@ -392,7 +392,7 @@ public:
   /// Whether the thread has input data.
   bool hasInputData() { return batchSize_ != 0; }
 
-protected:
+ protected:
   void mergeCpuGradients();
 
   void mergeGradSparse(
@@ -421,7 +421,7 @@ protected:
   /// GradientMachine::backward
   void doCallback(int pid);
 
-protected:
+ protected:
   MultiGradientMachine* multiMachine_;
   ModelConfig config_;
   /// whether the thread should stop
diff --git a/paddle/gserver/gradientmachines/MultiNetwork.cpp b/paddle/gserver/gradientmachines/MultiNetwork.cpp
index a1140402b8baaae20e20802ebf87462e301b60f9..5f3d09dda26772850828e6d44e8cc65635b314dc 100644
--- a/paddle/gserver/gradientmachines/MultiNetwork.cpp
+++ b/paddle/gserver/gradientmachines/MultiNetwork.cpp
@@ -122,7 +122,7 @@ void MultiNetwork::finish() {
 }
 
 class MultiCombinedEvaluator : public Evaluator {
-public:
+ public:
   MultiCombinedEvaluator() {}
   void addEvaluator(std::unique_ptr<Evaluator>&& evaluator) {
     evaluators_.emplace_back(std::move(evaluator));
@@ -167,7 +167,7 @@ public:
     }
   }
 
-protected:
+ protected:
   std::vector<std::unique_ptr<Evaluator>> evaluators_;
 };
 
diff --git a/paddle/gserver/gradientmachines/MultiNetwork.h b/paddle/gserver/gradientmachines/MultiNetwork.h
index 186a9ad0a39cd7815aea6738e6c6bc4a0c944aa9..495d5592017b5fb937fb8243bf12a5f2f30d67e7 100644
--- a/paddle/gserver/gradientmachines/MultiNetwork.h
+++ b/paddle/gserver/gradientmachines/MultiNetwork.h
@@ -22,7 +22,7 @@ limitations under the License. */
 namespace paddle {
 
 class MultiNetwork : public NeuralNetwork {
-public:
+ public:
   explicit MultiNetwork(std::string subModelName = "")
       : NeuralNetwork(subModelName) {}
 
@@ -58,7 +58,7 @@ public:
 
   virtual void finish();
 
-protected:
+ protected:
   std::vector<std::unique_ptr<NeuralNetwork>> subNetworks_;
 };
 }  // namespace paddle
diff --git a/paddle/gserver/gradientmachines/NeuralNetwork.cpp b/paddle/gserver/gradientmachines/NeuralNetwork.cpp
index a3c13df3dbad973505d8919bce8b95348527e273..ac60a3a3408d37b66cb712d893c6b93a1750f448 100644
--- a/paddle/gserver/gradientmachines/NeuralNetwork.cpp
+++ b/paddle/gserver/gradientmachines/NeuralNetwork.cpp
@@ -362,7 +362,7 @@ void NeuralNetwork::releaseOutput() {
 #ifndef PADDLE_MOBILE_INFERENCE
 
 class CombinedEvaluator : public Evaluator {
-public:
+ public:
   void addEvaluator(std::unique_ptr<Evaluator>&& evaluator) {
     evaluators_.emplace_back(std::move(evaluator));
   }
@@ -400,11 +400,11 @@ public:
     }
   }
 
-protected:
+ protected:
   std::vector<std::unique_ptr<Evaluator>> evaluators_;
 
   // Evaluator interface
-public:
+ public:
   /**
    * @brief getNames will return all inside evaluators' names.
    * @param names [out]: return names.
@@ -435,7 +435,7 @@ public:
         });
   }
 
-private:
+ private:
   template <typename T>
   T getMethodHelper(const std::string& name,
                     Error* err,
@@ -454,7 +454,7 @@ private:
 };
 
 class SubnetEvaluator : public CombinedEvaluator {
-public:
+ public:
   SubnetEvaluator(const std::string& layerName,
                   std::unique_ptr<Evaluator>&& evaluator)
       : layerName_(layerName) {
@@ -473,7 +473,7 @@ public:
                     << " in submodel " << nn.getName();
   }
 
-protected:
+ protected:
   std::string layerName_;
 };
 
diff --git a/paddle/gserver/gradientmachines/NeuralNetwork.h b/paddle/gserver/gradientmachines/NeuralNetwork.h
index 5b32f844f742c07c8bee6638cb46dc00285f49b0..3e5615c8f0b30ab1283d41e025496051869289dc 100644
--- a/paddle/gserver/gradientmachines/NeuralNetwork.h
+++ b/paddle/gserver/gradientmachines/NeuralNetwork.h
@@ -56,7 +56,7 @@ void parameterInitNN(int paramId,
                      std::vector<ParameterPtr>* sharedParams);
 
 class NeuralNetwork : public GradientMachine {
-public:
+ public:
   virtual void init(const ModelConfig& config,
                     ParamInitCallback callback = nullptr,
                     const std::vector<ParameterType>& parameterTypes =
@@ -144,7 +144,7 @@ public:
    */
   void releaseOutput();
 
-protected:
+ protected:
   /**
    * The constructor of NeuralNetwork.
    * The sub networks can get parameters_ and parameterMap_
diff --git a/paddle/gserver/gradientmachines/ParallelNeuralNetwork.h b/paddle/gserver/gradientmachines/ParallelNeuralNetwork.h
index e3b6812123141e8e0afb9368fb06f2b34f526800..c091459506ad477bed3f429a22071eccedd664bb 100644
--- a/paddle/gserver/gradientmachines/ParallelNeuralNetwork.h
+++ b/paddle/gserver/gradientmachines/ParallelNeuralNetwork.h
@@ -32,7 +32,7 @@ enum TaskType {
  * multiple threads in parallel.
  */
 class ParallelNeuralNetwork : public NeuralNetwork {
-public:
+ public:
   ParallelNeuralNetwork(std::string subModelName = "",
                         NeuralNetwork *rootNetwork = nullptr)
       : NeuralNetwork(subModelName, rootNetwork) {}
@@ -66,7 +66,7 @@ public:
 
   // virtual void eval(Evaluator* evaluator);
 
-protected:
+ protected:
   bool useGpu_;
   /// number of gpu devices
   int numDevices_;
@@ -74,7 +74,7 @@ protected:
 };
 
 class ParallelThread {
-public:
+ public:
   ParallelThread(int threadId, int deviceId, bool useGpu);
   ~ParallelThread();
   void jobEnqueue(LayerPtr layer, TaskType task);
@@ -87,10 +87,10 @@ public:
   }
   void setForwardPassType(PassType passType) { passType_ = passType; }
 
-protected:
+ protected:
   void computeThread();
 
-public:
+ public:
   struct Job {
     LayerPtr layer_;
     TaskType task_;
@@ -98,7 +98,7 @@ public:
   typedef Queue<Job> JobQueue;
   JobQueue queue_;
 
-protected:
+ protected:
   /// from 0 to threads-1
   int threadId_;
   /// the GPU device Id which the computeThread_ used
diff --git a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
index 2429b5d1a0a5ccf66db365b82c494c53d8e1fd4b..73ac8cda721f200c1a02cd9c1d9456df70d7b7d2 100644
--- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
+++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
@@ -96,7 +96,7 @@ static InitFunction __init__diy_prob_method(
     std::numeric_limits<int>::max());
 
 class BeamSearchControlCallbacks {
-public:
+ public:
   RecurrentGradientMachine::BeamSearchCandidatesAdjustCallback
       beamSearchCandidateAdjust;
   RecurrentGradientMachine::NormOrDropNodeCallback normOrDropNode;
@@ -115,7 +115,7 @@ public:
 };
 
 class BeamSearchStatisticsCallbacks {
-public:
+ public:
   RecurrentGradientMachine::EachStepCallback onEachStepStarted;
   RecurrentGradientMachine::EachStepCallback onEachStepStoped;
 
@@ -148,11 +148,11 @@ RecurrentGradientMachine::RecurrentGradientMachine(
  *    so it's should not be placed in root network.
  */
 class BootBiasLayer : public Layer {
-protected:
+ protected:
   std::unique_ptr<Weight> biases_;
   IVectorPtr cpuIds_;
 
-public:
+ public:
   explicit BootBiasLayer(const LayerConfig& config) : Layer(config) {}
 
   bool init(const LayerMap& layerMap,
diff --git a/paddle/gserver/gradientmachines/RecurrentGradientMachine.h b/paddle/gserver/gradientmachines/RecurrentGradientMachine.h
index 0032b72cdae44588af976f1ac542149545f551f1..7e943cebd35234ba7af357c9f64fde6b0a9546ce 100644
--- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.h
+++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.h
@@ -30,7 +30,7 @@ class BeamSearchControlCallbacks;
 class BeamSearchStatisticsCallbacks;
 
 class RecurrentGradientMachine : public NeuralNetwork {
-public:
+ public:
   RecurrentGradientMachine(const std::string& subModelName,
                            NeuralNetwork* rootNetwork);
 
@@ -290,7 +290,7 @@ public:
     return this->finalPaths_;
   }
 
-protected:
+ protected:
   std::vector<Argument::SeqInfo> commonSeqInfo_;
   ICpuGpuVectorPtr sequenceStartPositions_;
   void calcSequenceStartPositions();
@@ -447,7 +447,7 @@ protected:
   MatrixPtr cpuProb_;
   IVectorPtr cpuEos_;
 
-private:
+ private:
   /*
    * @return beam size in beam search
    */
diff --git a/paddle/gserver/layers/AddtoLayer.h b/paddle/gserver/layers/AddtoLayer.h
index 1d000630567cb1116ab0ff69e42380fc0eae6173..6ea54f4a53d466594055db2fb5167fa1a9d6c9da 100644
--- a/paddle/gserver/layers/AddtoLayer.h
+++ b/paddle/gserver/layers/AddtoLayer.h
@@ -33,10 +33,10 @@ namespace paddle {
  * The config file api is addto_layer.
  */
 class AddtoLayer : public Layer {
-protected:
+ protected:
   std::unique_ptr<Weight> biases_;
 
-public:
+ public:
   explicit AddtoLayer(const LayerConfig& config) : Layer(config) {}
 
   ~AddtoLayer() {}
diff --git a/paddle/gserver/layers/AgentLayer.h b/paddle/gserver/layers/AgentLayer.h
index da0ac4530836205757399ac8eb64dd003740a53f..51f346d5c9fdf9599cddf4b668c128035fd94187 100644
--- a/paddle/gserver/layers/AgentLayer.h
+++ b/paddle/gserver/layers/AgentLayer.h
@@ -26,11 +26,11 @@ namespace paddle {
  * called to set one and only one real layer
  */
 class AgentLayer : public Layer {
-protected:
+ protected:
   LayerPtr realLayer_;
   int numSamples_;
 
-public:
+ public:
   explicit AgentLayer(const LayerConfig& config) : Layer(config) {}
 
   ~AgentLayer() {}
@@ -55,14 +55,14 @@ public:
  * GatherAgentLayer collect a complete sequence.
  */
 class GatherAgentLayer : public Layer {
-protected:
+ protected:
   std::vector<LayerPtr> realLayers_;
   std::vector<IVectorPtr> idsVec_;
   // we don't clear idsVec_ vector to aviod IVector alloc/free
   IVectorPtr allIds_;
   std::vector<int> idIndex_;
 
-public:
+ public:
   explicit GatherAgentLayer(const LayerConfig& config) : Layer(config) {}
 
   virtual ~GatherAgentLayer() {}
@@ -95,7 +95,7 @@ public:
  * if it is, the agent will select a few ids in real layer.
  */
 class ScatterAgentLayer : public Layer {
-protected:
+ protected:
   LayerPtr realLayer_;
   IVectorPtr ids_;
   IVectorPtr cpuIds_;
@@ -113,7 +113,7 @@ protected:
   // true for setRealLayer, false for setRealLayerAndOutput
   bool selectionMode_;
 
-public:
+ public:
   explicit ScatterAgentLayer(const LayerConfig& config) : Layer(config) {}
 
   virtual ~ScatterAgentLayer() {}
diff --git a/paddle/gserver/layers/AverageLayer.h b/paddle/gserver/layers/AverageLayer.h
index 24602d2a9c3e08cf76f6f98b5f9e3f593118e6e1..03e2673b55ceca7a698f1b858327ad6fad739087 100644
--- a/paddle/gserver/layers/AverageLayer.h
+++ b/paddle/gserver/layers/AverageLayer.h
@@ -37,7 +37,7 @@ namespace paddle {
  * The config file api is pooling_layer.
  */
 class AverageLayer : public SequencePoolLayer {
-public:
+ public:
   enum AverageStrategy { kAverage = 0, kSum = 1, kAverageSquareRootN = 2 };
   explicit AverageLayer(const LayerConfig& config)
       : SequencePoolLayer(config) {}
@@ -48,7 +48,7 @@ public:
   void forward(PassType passType) override;
   void backward(const UpdateCallback& callback = nullptr) override;
 
-protected:
+ protected:
   int mode_;
 };
 }  // namespace paddle
diff --git a/paddle/gserver/layers/BatchNormBaseLayer.h b/paddle/gserver/layers/BatchNormBaseLayer.h
index 69d642af4f12593e8db8a726310e6b1934c8e3be..5a446c0843a22adecbaf2ae09fcd526b68865ae2 100644
--- a/paddle/gserver/layers/BatchNormBaseLayer.h
+++ b/paddle/gserver/layers/BatchNormBaseLayer.h
@@ -40,7 +40,7 @@ namespace paddle {
  */
 
 class BatchNormBaseLayer : public Layer {
-public:
+ public:
   explicit BatchNormBaseLayer(const LayerConfig& config) : Layer(config) {}
 
   ~BatchNormBaseLayer() {}
@@ -61,7 +61,7 @@ public:
    */
   void calFeatureMapSize();
 
-protected:
+ protected:
   /// Batch normalization scale parameter, which is referred to as gamma in
   /// in original paper.
   std::unique_ptr<Weight> weight_;
diff --git a/paddle/gserver/layers/BatchNormalizationLayer.h b/paddle/gserver/layers/BatchNormalizationLayer.h
index 95add69215e3ea0b0225d0a245fe37905c33127b..e5e4e690b6017f32de0f4d7557065c02c03d689f 100644
--- a/paddle/gserver/layers/BatchNormalizationLayer.h
+++ b/paddle/gserver/layers/BatchNormalizationLayer.h
@@ -27,7 +27,7 @@ namespace paddle {
  */
 
 class BatchNormalizationLayer : public BatchNormBaseLayer {
-public:
+ public:
   explicit BatchNormalizationLayer(const LayerConfig& config)
       : BatchNormBaseLayer(config), firstTest_(true) {}
 
@@ -38,7 +38,7 @@ public:
   void forward(PassType passType) override;
   void backward(const UpdateCallback& callback = nullptr) override;
 
-protected:
+ protected:
   /// Load pre-calculated mean and std.
   void setMeanAndStd();
 
diff --git a/paddle/gserver/layers/BilinearInterpLayer.h b/paddle/gserver/layers/BilinearInterpLayer.h
index acd320420f4bbfe313f3ae77577ffc6b5cbfbfdf..8e08c2e1ce80172f55c93d8242821f683fa1a731 100644
--- a/paddle/gserver/layers/BilinearInterpLayer.h
+++ b/paddle/gserver/layers/BilinearInterpLayer.h
@@ -26,13 +26,13 @@ namespace paddle {
  * @note  The config file api is bilinear_interp_layer.
  */
 class BilinearInterpLayer : public Layer {
-protected:
+ protected:
   size_t outImgH_, outImgW_;
   size_t inImgH_, inImgW_;
   real ratioH_, ratioW_;
   size_t numChannels_;
 
-public:
+ public:
   explicit BilinearInterpLayer(const LayerConfig& config) : Layer(config) {}
 
   virtual ~BilinearInterpLayer() {}
diff --git a/paddle/gserver/layers/BlockExpandLayer.h b/paddle/gserver/layers/BlockExpandLayer.h
index 1797b64036b5cb9f97477d5a44b2f58e2d6c0cd4..9d76584f3a4eda19a9e8f806256a7b8da617cc37 100644
--- a/paddle/gserver/layers/BlockExpandLayer.h
+++ b/paddle/gserver/layers/BlockExpandLayer.h
@@ -40,7 +40,7 @@ namespace paddle {
  * The config file api is block_expand_layer.
  */
 class BlockExpandLayer : public Layer {
-protected:
+ protected:
   /**
    * @brief Calculate outputH_ and outputW_ and return block number which
    * actually is time steps.
@@ -53,7 +53,7 @@ protected:
   TensorShape inputShape_;
   TensorShape outputShape_;
 
-public:
+ public:
   explicit BlockExpandLayer(const LayerConfig& config) : Layer(config) {}
 
   ~BlockExpandLayer() {}
diff --git a/paddle/gserver/layers/CRFDecodingLayer.h b/paddle/gserver/layers/CRFDecodingLayer.h
index fba3cebac1a375008c58d21c458d9e0b98305ffa..018162e146fa93725fe84bdf2da9a6124f3cea6f 100644
--- a/paddle/gserver/layers/CRFDecodingLayer.h
+++ b/paddle/gserver/layers/CRFDecodingLayer.h
@@ -30,14 +30,14 @@ namespace paddle {
  * See LinearChainCRF.h for the detail of the CRF formulation.
  */
 class CRFDecodingLayer : public CRFLayer {
-public:
+ public:
   explicit CRFDecodingLayer(const LayerConfig& config) : CRFLayer(config) {}
   bool init(const LayerMap& layerMap,
             const ParameterMap& parameterMap) override;
   void forward(PassType passType) override;
   void backward(const UpdateCallback& callback) override;
 
-protected:
+ protected:
   std::unique_ptr<LinearChainCRF> crf_;
 };
 
diff --git a/paddle/gserver/layers/CRFLayer.h b/paddle/gserver/layers/CRFLayer.h
index cb5bd05568cc79c0093d6af0791cf0b3ce2dae47..88c2ed343ad5743068c871fe351437270d85f223 100644
--- a/paddle/gserver/layers/CRFLayer.h
+++ b/paddle/gserver/layers/CRFLayer.h
@@ -27,14 +27,14 @@ namespace paddle {
  * See class LinearChainCRF for the detail of the CRF formulation.
  */
 class CRFLayer : public Layer {
-public:
+ public:
   explicit CRFLayer(const LayerConfig& config) : Layer(config) {}
   bool init(const LayerMap& layerMap,
             const ParameterMap& parameterMap) override;
   void forward(PassType passType) override;
   void backward(const UpdateCallback& callback) override;
 
-protected:
+ protected:
   size_t numClasses_;
   ParameterPtr parameter_;
   std::vector<LinearChainCRF> crfs_;
diff --git a/paddle/gserver/layers/CTCLayer.h b/paddle/gserver/layers/CTCLayer.h
index fcbc42565e9340903d05aca2d0ba2091ffe20be0..5d70b1f4ceb03028865378d1d01b5706b35b10de 100644
--- a/paddle/gserver/layers/CTCLayer.h
+++ b/paddle/gserver/layers/CTCLayer.h
@@ -20,7 +20,7 @@ limitations under the License. */
 namespace paddle {
 
 class CTCLayer : public Layer {
-public:
+ public:
   explicit CTCLayer(const LayerConfig& config) : Layer(config) {}
   bool init(const LayerMap& layerMap,
             const ParameterMap& parameterMap) override;
@@ -31,7 +31,7 @@ public:
                    const Argument& softmaxSeqs,
                    const Argument& labelSeqs);
 
-protected:
+ protected:
   size_t numClasses_;
   bool normByTimes_;
   std::vector<LinearChainCTC> ctcs_;
diff --git a/paddle/gserver/layers/ClipLayer.cpp b/paddle/gserver/layers/ClipLayer.cpp
index dbc3337499788af5a9b6f68a6016e94c2072d61b..6aa3c8fe64f5a59e82f3271baed99fd17fd6653f 100644
--- a/paddle/gserver/layers/ClipLayer.cpp
+++ b/paddle/gserver/layers/ClipLayer.cpp
@@ -24,11 +24,11 @@ namespace paddle {
  */
 
 class ClipLayer : public Layer {
-protected:
+ protected:
   double min_;
   double max_;
 
-public:
+ public:
   explicit ClipLayer(const LayerConfig& config) : Layer(config) {}
 
   bool init(const LayerMap& layerMap,
diff --git a/paddle/gserver/layers/ConcatenateLayer.cpp b/paddle/gserver/layers/ConcatenateLayer.cpp
index f5ab29a509e45e72c71ba122c73aeba1b3b6a827..e6de329ff3f9ccfdd1cbe697c1de1a9cd8c7926a 100644
--- a/paddle/gserver/layers/ConcatenateLayer.cpp
+++ b/paddle/gserver/layers/ConcatenateLayer.cpp
@@ -23,7 +23,7 @@ namespace paddle {
  * each input as one row for the output of this layer and apply activation.
  */
 class ConcatenateLayer : public Layer {
-public:
+ public:
   explicit ConcatenateLayer(const LayerConfig& config) : Layer(config) {}
 
   ~ConcatenateLayer() {}
@@ -97,7 +97,7 @@ void ConcatenateLayer::backward(const UpdateCallback& callback) {
  * processed by a Projection.
  */
 class ConcatenateLayer2 : public Layer {
-public:
+ public:
   explicit ConcatenateLayer2(const LayerConfig& config) : Layer(config) {}
 
   ~ConcatenateLayer2() {}
@@ -108,7 +108,7 @@ public:
   void forward(PassType passType) override;
   void backward(const UpdateCallback& callback = nullptr) override;
 
-protected:
+ protected:
   std::vector<std::unique_ptr<Projection>> projections_;
   std::vector<Argument> projOutput_;
   std::vector<std::pair<size_t, size_t>> projCol_;
diff --git a/paddle/gserver/layers/ContextProjection.h b/paddle/gserver/layers/ContextProjection.h
index e30f98f58d2be9ac538f6385efe68990b705ac5f..9c217145419048282a9a09ad899dc970e7c9704f 100644
--- a/paddle/gserver/layers/ContextProjection.h
+++ b/paddle/gserver/layers/ContextProjection.h
@@ -42,7 +42,7 @@ namespace paddle {
  * The config file api is context_projection.
  */
 class ContextProjection : public Projection {
-public:
+ public:
   /**
    * Constructor. If context_start is zero and context_lenth is one, it will
    * set trainable_padding false. trainable_padding is an optional arguments
@@ -63,7 +63,7 @@ public:
 
   virtual bool init();
 
-protected:
+ protected:
   std::unique_ptr<Weight> weight_;
   /// number of extra timesteps added at the beginning
   size_t beginPad_;
diff --git a/paddle/gserver/layers/Conv3DLayer.h b/paddle/gserver/layers/Conv3DLayer.h
index 5ab5ff3d4af07449484c441958c31c8fb06de894..07b804bad02beb6ec9c3e9fd43c3cd3aa6d50b22 100644
--- a/paddle/gserver/layers/Conv3DLayer.h
+++ b/paddle/gserver/layers/Conv3DLayer.h
@@ -26,7 +26,7 @@ namespace paddle {
  * calculate convolution operation.
  */
 class Conv3DLayer : public ConvBaseLayer {
-public:
+ public:
   explicit Conv3DLayer(const LayerConfig& config) : ConvBaseLayer(config) {}
   ~Conv3DLayer() {}
 
@@ -40,7 +40,7 @@ public:
   void bpropWeights(int i);
   size_t getSize();
 
-protected:
+ protected:
   // Figure out the dimensions for individual gemms.
   IntV M_;  /// numFilters_ / filter_group_;
   IntV N_;  /// channels_ * filterSizeZ_ * filterSize_ * filterSizeY_
diff --git a/paddle/gserver/layers/ConvBaseLayer.h b/paddle/gserver/layers/ConvBaseLayer.h
index 93869fe68d15b1cf38296fa8e2f6197dc74f879f..801bc4f888c5a60e803c882dcf807678c64af20c 100644
--- a/paddle/gserver/layers/ConvBaseLayer.h
+++ b/paddle/gserver/layers/ConvBaseLayer.h
@@ -24,7 +24,7 @@ namespace paddle {
  */
 
 class ConvBaseLayer : public Layer {
-protected:
+ protected:
   typedef std::vector<int> IntV;
 
   /// True if it's deconv layer, false if it's convolution layer
@@ -88,7 +88,7 @@ protected:
   /// of output size.
   bool caffeMode_;
 
-public:
+ public:
   explicit ConvBaseLayer(const LayerConfig& config) : Layer(config) {}
 
   bool init(const LayerMap& layerMap,
diff --git a/paddle/gserver/layers/ConvBaseOperator.h b/paddle/gserver/layers/ConvBaseOperator.h
index 27fb0362d3c9518a263eac54206e00974d08eb20..c3c647cb69da5a70eb5346737cc0092e2201c89e 100644
--- a/paddle/gserver/layers/ConvBaseOperator.h
+++ b/paddle/gserver/layers/ConvBaseOperator.h
@@ -29,7 +29,7 @@ namespace paddle {
  */
 
 class ConvBaseOperator : public Operator {
-public:
+ public:
   ConvBaseOperator(const OperatorConfig &config, bool useGpu);
   /**
    * Free workspace in device and destroy cudnn tensor descriptor.
@@ -46,7 +46,7 @@ public:
     hl_destroy_convolution_descriptor(convDesc_);
   }
 
-protected:
+ protected:
   /**
    * Get convolution parameters from layer config and
    * initialize member variables.
diff --git a/paddle/gserver/layers/ConvBaseProjection.h b/paddle/gserver/layers/ConvBaseProjection.h
index ba76d236d901187093a2e372a61c5d29d661e8bb..f3266ae1ab945042cde9f24b7c2673c18d37bc11 100644
--- a/paddle/gserver/layers/ConvBaseProjection.h
+++ b/paddle/gserver/layers/ConvBaseProjection.h
@@ -23,7 +23,7 @@ namespace paddle {
  * @brief Base class for ConvProjection and ConvTransProjection.
  */
 class ConvBaseProjection : public Projection {
-public:
+ public:
   /**
    * Constructor.
    */
@@ -33,7 +33,7 @@ public:
 
   ~ConvBaseProjection();
 
-protected:
+ protected:
   void getConvParams();
   void initCudnn();
 
diff --git a/paddle/gserver/layers/ConvOperator.h b/paddle/gserver/layers/ConvOperator.h
index fbdb7bb1cd2b81bd72912dffdc9d059c520068a8..527dbf8c270f35e19ca23acd8a3ba8197d03b988 100644
--- a/paddle/gserver/layers/ConvOperator.h
+++ b/paddle/gserver/layers/ConvOperator.h
@@ -29,7 +29,7 @@ namespace paddle {
  */
 
 class ConvOperator : public ConvBaseOperator {
-public:
+ public:
   ConvOperator(const OperatorConfig &config, bool useGpu)
       : ConvBaseOperator(config, useGpu) {}
   /**
diff --git a/paddle/gserver/layers/ConvProjection.h b/paddle/gserver/layers/ConvProjection.h
index e8ecb99431a421d4b52228600909568b0808649a..22a2202bb6cc256a4a5897724d8eb8a93fefb79f 100644
--- a/paddle/gserver/layers/ConvProjection.h
+++ b/paddle/gserver/layers/ConvProjection.h
@@ -23,7 +23,7 @@ namespace paddle {
  * @brief Convolution projection do the same calculation with CudnnConvLayer.
  */
 class ConvProjection : public ConvBaseProjection {
-public:
+ public:
   /**
    * Constructor.
    */
diff --git a/paddle/gserver/layers/ConvShiftLayer.cpp b/paddle/gserver/layers/ConvShiftLayer.cpp
index fb877710196835e025466f37b5da27bcf80a3db4..615c3478061b591ea30cbf0b3d27ef2551c0dd28 100644
--- a/paddle/gserver/layers/ConvShiftLayer.cpp
+++ b/paddle/gserver/layers/ConvShiftLayer.cpp
@@ -42,7 +42,7 @@ namespace paddle {
  */
 
 class ConvShiftLayer : public Layer {
-public:
+ public:
   explicit ConvShiftLayer(const LayerConfig& config) : Layer(config) {}
 
   ~ConvShiftLayer() {}
diff --git a/paddle/gserver/layers/ConvTransOperator.h b/paddle/gserver/layers/ConvTransOperator.h
index 1bf58f2bfb78ae7dee433455ece37d908b113045..53cb7a21b49189898d09aa20cd46d04cc5c20198 100644
--- a/paddle/gserver/layers/ConvTransOperator.h
+++ b/paddle/gserver/layers/ConvTransOperator.h
@@ -29,7 +29,7 @@ namespace paddle {
  */
 
 class ConvTransOperator : public ConvBaseOperator {
-public:
+ public:
   ConvTransOperator(const OperatorConfig &config, bool useGpu)
       : ConvBaseOperator(config, useGpu) {}
   /**
diff --git a/paddle/gserver/layers/ConvTransProjection.h b/paddle/gserver/layers/ConvTransProjection.h
index 269b2694c82ea076102633537d7c961139a19a43..0f9ed720d3b8855a3a24ac25a1c3917c4b98e81d 100644
--- a/paddle/gserver/layers/ConvTransProjection.h
+++ b/paddle/gserver/layers/ConvTransProjection.h
@@ -23,7 +23,7 @@ namespace paddle {
  * @brief Convolution projection do the same calculation with CudnnConvLayer.
  */
 class ConvTransProjection : public ConvBaseProjection {
-public:
+ public:
   /**
    * Constructor.
    */
diff --git a/paddle/gserver/layers/ConvexCombinationLayer.cpp b/paddle/gserver/layers/ConvexCombinationLayer.cpp
index dce751940c1bf1695a034a3c551412dcb9b7b8b5..31363d97c4fd318ec2c6d48f9200f6ba1f49ba11 100644
--- a/paddle/gserver/layers/ConvexCombinationLayer.cpp
+++ b/paddle/gserver/layers/ConvexCombinationLayer.cpp
@@ -36,7 +36,7 @@ namespace paddle {
  * The config file api is linear_comb_layer.
  */
 class ConvexCombinationLayer : public Layer {
-protected:
+ protected:
   /// A matrix pointer pointing to second input.
   MatrixPtr tmpMtx0;
   /// A matrix pointer pointing to first input.
@@ -44,7 +44,7 @@ protected:
   /// A matrix pointer pointing to output.
   MatrixPtr tmpRow1;
 
-public:
+ public:
   explicit ConvexCombinationLayer(const LayerConfig& config) : Layer(config) {}
 
   ~ConvexCombinationLayer() {}
diff --git a/paddle/gserver/layers/CosSimLayer.h b/paddle/gserver/layers/CosSimLayer.h
index 675cdb16b563faa7acf9e701096bd334ed661160..d9fe1ff270f1f76e3b246dca374ddf45445419f9 100644
--- a/paddle/gserver/layers/CosSimLayer.h
+++ b/paddle/gserver/layers/CosSimLayer.h
@@ -33,7 +33,7 @@ namespace paddle {
  * The config file api is cos_sim.
  */
 class CosSimLayer : public Layer {
-public:
+ public:
   explicit CosSimLayer(const LayerConfig& config) : Layer(config) {}
 
   ~CosSimLayer() {}
diff --git a/paddle/gserver/layers/CosSimVecMatLayer.cpp b/paddle/gserver/layers/CosSimVecMatLayer.cpp
index 685b4e8ef376b76b3058eeba82d803d460e7105c..230ecc768b4d7314b21ac1d76899c3c3bab12309 100644
--- a/paddle/gserver/layers/CosSimVecMatLayer.cpp
+++ b/paddle/gserver/layers/CosSimVecMatLayer.cpp
@@ -32,7 +32,7 @@ namespace paddle {
  */
 
 class CosSimVecMatLayer : public Layer {
-protected:
+ protected:
   MatrixPtr tmpMtx0;
   MatrixPtr tmpMtx1;
   MatrixPtr tmpRow0;
@@ -40,7 +40,7 @@ protected:
   MatrixPtr tmpRow2;
   MatrixPtr tmpRow3;
 
-public:
+ public:
   explicit CosSimVecMatLayer(const LayerConfig& config) : Layer(config) {}
 
   ~CosSimVecMatLayer() {}
diff --git a/paddle/gserver/layers/CostLayer.cpp b/paddle/gserver/layers/CostLayer.cpp
index 484f803a8387a16152c5911d7d5c72b0111283ae..1327616950a8887efa2cba410fa7ae8b5bd97da4 100644
--- a/paddle/gserver/layers/CostLayer.cpp
+++ b/paddle/gserver/layers/CostLayer.cpp
@@ -716,7 +716,7 @@ void HuberTwoClassification::backwardImp(Matrix& output,
  * \f]
  */
 class SumCostLayer : public Layer {
-public:
+ public:
   explicit SumCostLayer(const LayerConfig& config) : Layer(config) {}
 
   bool init(const LayerMap& layerMap,
diff --git a/paddle/gserver/layers/CostLayer.h b/paddle/gserver/layers/CostLayer.h
index 306c067ed1c040555d2b03996cc0749faf0ea68c..9bfec0e2b169fac4f235fd13347be687c4f1a222 100644
--- a/paddle/gserver/layers/CostLayer.h
+++ b/paddle/gserver/layers/CostLayer.h
@@ -29,7 +29,7 @@ namespace paddle {
  * handled by the base class.
  */
 class CostLayer : public Layer {
-public:
+ public:
   explicit CostLayer(const LayerConfig& config) : Layer(config) {}
 
   bool init(const LayerMap& layerMap,
@@ -51,7 +51,7 @@ public:
                            Argument& label,
                            Matrix& outputGrad) = 0;
 
-protected:
+ protected:
   LayerPtr weightLayer_;
   real coeff_;
 };
@@ -65,7 +65,7 @@ protected:
  * \f]
  */
 class MultiClassCrossEntropy : public CostLayer {
-public:
+ public:
   explicit MultiClassCrossEntropy(const LayerConfig& config)
       : CostLayer(config) {}
 
@@ -95,7 +95,7 @@ public:
  *     In Proceedings of the ACL 2014 Conference.
  */
 class MultiClassCrossEntropyWithSelfNorm : public CostLayer {
-public:
+ public:
   explicit MultiClassCrossEntropyWithSelfNorm(const LayerConfig& config)
       : CostLayer(config) {}
 
@@ -108,7 +108,7 @@ public:
                    Argument& label,
                    Matrix& outputGrad) override;
 
-protected:
+ protected:
   MatrixPtr sftMaxSum_;
   MatrixPtr sumInv_;
 };
@@ -120,7 +120,7 @@ protected:
  * \f]
  */
 class SoftBinaryClassCrossEntropy : public CostLayer {
-public:
+ public:
   explicit SoftBinaryClassCrossEntropy(const LayerConfig& config)
       : CostLayer(config) {}
 
@@ -133,7 +133,7 @@ public:
                    Argument& label,
                    Matrix& outputGrad) override;
 
-protected:
+ protected:
   MatrixPtr targetPerDim_;
 };
 
@@ -145,7 +145,7 @@ protected:
  * \f]
  */
 class SumOfSquaresCostLayer : public CostLayer {
-public:
+ public:
   explicit SumOfSquaresCostLayer(const LayerConfig& config)
       : CostLayer(config) {}
 
@@ -171,7 +171,7 @@ public:
  * x = output - label
  */
 class SmoothL1CostLayer : public CostLayer {
-public:
+ public:
   explicit SmoothL1CostLayer(const LayerConfig& config) : CostLayer(config) {}
 
   bool init(const LayerMap& layerMap,
@@ -197,7 +197,7 @@ public:
  *      Rank useing Gradient Descent.
  */
 class RankingCost : public Layer {
-public:
+ public:
   explicit RankingCost(const LayerConfig& config) : Layer(config) {}
 
   bool init(const LayerMap& layerMap,
@@ -225,7 +225,7 @@ public:
     (void)outputGrad;
   }
 
-private:
+ private:
   double posPairCount_;
   double negPairCount_;
   MatrixPtr margin_;
@@ -250,7 +250,7 @@ private:
  *     with Nonsmooth Cost Functions.
  */
 class LambdaCost : public Layer {
-public:
+ public:
   explicit LambdaCost(const LayerConfig& config) : Layer(config) {}
 
   bool init(const LayerMap& layerMap,
@@ -270,7 +270,7 @@ public:
                 real* gradData,
                 int size);
 
-private:
+ private:
   MatrixPtr marginGrad_;
   int truncationSize_;
   int maxSortSize_;
@@ -287,10 +287,10 @@ private:
  * \f]
  */
 class MultiBinaryLabelCrossEntropy : public CostLayer {
-protected:
+ protected:
   MatrixPtr targetPerDim_;
 
-public:
+ public:
   explicit MultiBinaryLabelCrossEntropy(const LayerConfig& config)
       : CostLayer(config) {}
 
@@ -308,7 +308,7 @@ public:
  * A base layer for HuberRegressionLoss and HuberTwoClassification.
  */
 class HuberCost : public CostLayer {
-public:
+ public:
   std::vector<Argument> tmpCpuInput_;
 
   explicit HuberCost(const LayerConfig& config) : CostLayer(config) {}
@@ -331,7 +331,7 @@ public:
  * Loss = delta * abs(y - f) - 0.5 * delta^2, otherwise
  */
 class HuberRegressionLoss : public HuberCost {
-public:
+ public:
   explicit HuberRegressionLoss(const LayerConfig& config) : HuberCost(config) {}
 
   bool init(const LayerMap& layerMap,
@@ -343,7 +343,7 @@ public:
                    Argument& label,
                    Matrix& outputGrad) override;
 
-protected:
+ protected:
   real delta_;
 };
 
@@ -356,7 +356,7 @@ protected:
  * Loss = 0, otherwise
  */
 class HuberTwoClassification : public HuberCost {
-public:
+ public:
   explicit HuberTwoClassification(const LayerConfig& config)
       : HuberCost(config) {}
 
diff --git a/paddle/gserver/layers/CropLayer.h b/paddle/gserver/layers/CropLayer.h
index 1a85911ef75e992df587a60cfc9a727eafa4cc76..ef88bc483d157406a0f5a7924c14c345ea0df8c4 100644
--- a/paddle/gserver/layers/CropLayer.h
+++ b/paddle/gserver/layers/CropLayer.h
@@ -28,7 +28,7 @@ namespace paddle {
  *                  crop input as this shape conf
  */
 class CropLayer : public Layer {
-public:
+ public:
   explicit CropLayer(const LayerConfig& config) : Layer(config) {}
 
   ~CropLayer() {}
@@ -38,7 +38,7 @@ public:
   void forward(PassType passType) override;
   void backward(const UpdateCallback& callback = nullptr) override;
 
-protected:
+ protected:
   void setOutDims();
   void setInDims();
 
diff --git a/paddle/gserver/layers/CrossEntropyOverBeam.h b/paddle/gserver/layers/CrossEntropyOverBeam.h
index b47a2933c255c264ba780b2d87c9fbe53cb5665d..c8702b16165eee8d552c563082ffc708ce443deb 100644
--- a/paddle/gserver/layers/CrossEntropyOverBeam.h
+++ b/paddle/gserver/layers/CrossEntropyOverBeam.h
@@ -44,7 +44,7 @@ struct BeamExpansion {
 typedef std::shared_ptr<BeamExpansion> BeamExpansionPtr;
 
 class CostForOneSequence {
-public:
+ public:
   CostForOneSequence()
       : beamSize_(0), validExpansionCount_(0), goldAsExtraPath_(false) {}
   void setData(const BeamExpansionPtr bPtr, size_t beamSize) {
@@ -64,7 +64,7 @@ public:
   real forward();
   void backward();
 
-private:
+ private:
   void calValidExpandStep();
   void constructTotalExpansion();
   size_t initLastExpansion();
@@ -93,14 +93,14 @@ private:
 };
 
 class CrossEntropyOverBeam : public Layer {
-public:
+ public:
   explicit CrossEntropyOverBeam(const LayerConfig& config) : Layer(config) {}
   bool init(const LayerMap& layerMap,
             const ParameterMap& parameterMap) override;
   void forward(PassType passType) override;
   void backward(const UpdateCallback& callback) override;
 
-private:
+ private:
   void checkInputs();
   void copyInputsToCpu();
   void resizeOutput();
diff --git a/paddle/gserver/layers/CudnnBatchNormLayer.h b/paddle/gserver/layers/CudnnBatchNormLayer.h
index aa279f73d66770384815cad4d9e2ee0b04a4a1ad..1bb4eff8d2372660caa4ec4a4a20a27f365bebd0 100644
--- a/paddle/gserver/layers/CudnnBatchNormLayer.h
+++ b/paddle/gserver/layers/CudnnBatchNormLayer.h
@@ -30,7 +30,7 @@ namespace paddle {
  */
 
 class CudnnBatchNormLayer : public BatchNormBaseLayer {
-public:
+ public:
   explicit CudnnBatchNormLayer(const LayerConfig& config)
       : BatchNormBaseLayer(config) {}
 
@@ -46,7 +46,7 @@ public:
   void forward(PassType passType) override;
   void backward(const UpdateCallback& callback = nullptr) override;
 
-protected:
+ protected:
   /// Epsilon value used in the batch normalization formula.
   /// Same epsilon value should be used in forward and backward functions.
   double eps_;
diff --git a/paddle/gserver/layers/CudnnConvBaseLayer.h b/paddle/gserver/layers/CudnnConvBaseLayer.h
index 698104e4fbd2556f426001687a581153f32773d8..1ee1aa100d8adaed04ce24ee12b5b9af52c14b13 100644
--- a/paddle/gserver/layers/CudnnConvBaseLayer.h
+++ b/paddle/gserver/layers/CudnnConvBaseLayer.h
@@ -31,14 +31,14 @@ namespace paddle {
  * The config file api is img_conv_layer.
  */
 class CudnnConvBaseLayer : public ConvBaseLayer {
-protected:
+ protected:
   std::vector<std::unique_ptr<ProjectionConfig>> projConf_;
   std::vector<std::unique_ptr<Projection>> projections_;
 
   hl_tensor_descriptor biasDesc_;
   hl_tensor_descriptor outputDesc_;
 
-public:
+ public:
   explicit CudnnConvBaseLayer(const LayerConfig& config)
       : ConvBaseLayer(config) {}
 
diff --git a/paddle/gserver/layers/CudnnPoolLayer.h b/paddle/gserver/layers/CudnnPoolLayer.h
index 9eb4fc6138b0bce59660406705d15291eb38af9b..fc249354d10333211691b6844bffa3c8da8a79ee 100644
--- a/paddle/gserver/layers/CudnnPoolLayer.h
+++ b/paddle/gserver/layers/CudnnPoolLayer.h
@@ -26,7 +26,7 @@ namespace paddle {
  */
 
 class CudnnPoolLayer : public PoolLayer {
-protected:
+ protected:
   int windowHeight, windowWidth;
   int heightPadding, widthPadding, strideHeight, strideWidth;
   int imageH_, imageW_, outputH_, outputW_;
@@ -40,7 +40,7 @@ protected:
   /// A description of a pooling operation.
   hl_pooling_descriptor poolingDesc_;
 
-public:
+ public:
   static bool typeCheck(const std::string& poolType,
                         hl_pooling_mode_t* mode = nullptr);
   explicit CudnnPoolLayer(const LayerConfig& config);
diff --git a/paddle/gserver/layers/DataLayer.h b/paddle/gserver/layers/DataLayer.h
index 4b12afe0efe81843b58e459ca1e58b4f7f4a1664..d02f5a4697b9067f7d34e4d0b2d34f8c63ffe020 100644
--- a/paddle/gserver/layers/DataLayer.h
+++ b/paddle/gserver/layers/DataLayer.h
@@ -25,7 +25,7 @@ namespace paddle {
  * The config file api is data_layer.
  */
 class DataLayer : public Layer {
-public:
+ public:
   explicit DataLayer(const LayerConfig& config) : Layer(config) {}
 
   virtual void setData(const Argument& data) { data_ = data; }
@@ -58,10 +58,10 @@ public:
     }
   }
 
-private:
+ private:
   void copyDataToOutput(Argument& output);
 
-protected:
+ protected:
   Argument data_;
 };
 
diff --git a/paddle/gserver/layers/DataNormLayer.h b/paddle/gserver/layers/DataNormLayer.h
index 2a2a2a4aa76e8e315d9d66da1b738d6d615d10f2..7ae67a877b488c8d197896b8b1e3e90057fbe1c9 100644
--- a/paddle/gserver/layers/DataNormLayer.h
+++ b/paddle/gserver/layers/DataNormLayer.h
@@ -37,7 +37,7 @@ namespace paddle {
  */
 
 class DataNormLayer : public Layer {
-public:
+ public:
   enum NormalizationStrategy { kZScore = 0, kMinMax = 1, kDecimalScaling = 2 };
 
   explicit DataNormLayer(const LayerConfig& config) : Layer(config) {}
@@ -50,7 +50,7 @@ public:
   void forward(PassType passType) override;
   void backward(const UpdateCallback& callback = nullptr) override;
 
-protected:
+ protected:
   int mode_;
   std::unique_ptr<Weight> weight_;
   MatrixPtr min_;
diff --git a/paddle/gserver/layers/DeConv3DLayer.h b/paddle/gserver/layers/DeConv3DLayer.h
index 57d51cdec66930b9b79c0c0395da66922cd53ae4..13d1d07cf5cc6e2a6ea89768e29b1fe8cda5e81c 100644
--- a/paddle/gserver/layers/DeConv3DLayer.h
+++ b/paddle/gserver/layers/DeConv3DLayer.h
@@ -27,7 +27,7 @@ namespace paddle {
  * calculate deconvolution3D operation.
  */
 class DeConv3DLayer : public ConvBaseLayer {
-public:
+ public:
   explicit DeConv3DLayer(const LayerConfig& config) : ConvBaseLayer(config) {}
   ~DeConv3DLayer() {}
   bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
@@ -40,7 +40,7 @@ public:
   void bpropWeights(int i);
   size_t getSize();
 
-protected:
+ protected:
   // Figure out the dimensions for individual gemms.
   IntV M_;  /// numFilters_ / filter_group_;
   IntV N_;  /// channels_ * filterSizeZ_ * filterSize_ * filterSizeY_
diff --git a/paddle/gserver/layers/DetectionOutputLayer.h b/paddle/gserver/layers/DetectionOutputLayer.h
index 174a6e5d9acb476276b66627b4aabce2ae6c1037..b0270ed33141993665aeabdc53829600a4403643 100644
--- a/paddle/gserver/layers/DetectionOutputLayer.h
+++ b/paddle/gserver/layers/DetectionOutputLayer.h
@@ -33,7 +33,7 @@ namespace paddle {
  */
 
 class DetectionOutputLayer : public Layer {
-public:
+ public:
   explicit DetectionOutputLayer(const LayerConfig& config) : Layer(config) {}
 
   bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
@@ -42,7 +42,7 @@ public:
 
   void backward(const UpdateCallback& callback = nullptr) {}
 
-protected:
+ protected:
   inline LayerPtr getPriorBoxLayer() { return inputLayers_[0]; }
 
   inline LayerPtr getLocInputLayer(size_t index) {
@@ -53,7 +53,7 @@ protected:
     return inputLayers_[1 + inputNum_ + index];
   }
 
-private:
+ private:
   size_t numClasses_;  // number of classes
   size_t inputNum_;    // number of input layers
   real nmsThreshold_;
diff --git a/paddle/gserver/layers/DotMulOperator.cpp b/paddle/gserver/layers/DotMulOperator.cpp
index 68db2929adee1336e52abfcb8e6495e589afa683..03d18d9b239e57dc41334462f2324ae2d0505a62 100644
--- a/paddle/gserver/layers/DotMulOperator.cpp
+++ b/paddle/gserver/layers/DotMulOperator.cpp
@@ -27,7 +27,7 @@ namespace paddle {
  * The config file api is dotmul_operator.
  */
 class DotMulOperator : public Operator {
-public:
+ public:
   DotMulOperator(const OperatorConfig& config, bool useGpu);
   virtual void forward();
   virtual void backward();
diff --git a/paddle/gserver/layers/DotMulProjection.cpp b/paddle/gserver/layers/DotMulProjection.cpp
index 86453aae84142f9f534182d085f4a96a2c7a3e15..d7780387670e83af24fa342be3d596b618b1f677 100644
--- a/paddle/gserver/layers/DotMulProjection.cpp
+++ b/paddle/gserver/layers/DotMulProjection.cpp
@@ -26,14 +26,14 @@ namespace paddle {
  * The config file api is dotmul_projection.
  */
 class DotMulProjection : public Projection {
-public:
+ public:
   DotMulProjection(const ProjectionConfig& config,
                    const ParameterPtr& parameter,
                    bool useGpu);
   virtual void forward();
   virtual void backward(const UpdateCallback& callback);
 
-protected:
+ protected:
   /// shared memory with parameter
   std::unique_ptr<Weight> weight_;
 };
diff --git a/paddle/gserver/layers/DotProdLayer.cpp b/paddle/gserver/layers/DotProdLayer.cpp
index 5148d93e27d199b0c373221cedd4f03d6d32c8ab..72b0c707b2131dc275ba604cd20ae0007c34a9a9 100644
--- a/paddle/gserver/layers/DotProdLayer.cpp
+++ b/paddle/gserver/layers/DotProdLayer.cpp
@@ -27,7 +27,7 @@ namespace paddle {
  */
 
 class DotProdLayer : public Layer {
-public:
+ public:
   explicit DotProdLayer(const LayerConfig& config) : Layer(config) {}
 
   ~DotProdLayer() {}
diff --git a/paddle/gserver/layers/EosIdCheckLayer.cpp b/paddle/gserver/layers/EosIdCheckLayer.cpp
index 470a5b8ea208ad0acb64e3067881e0d183e1dc39..04400f2836581179849a4dd1c256bbddcc82530f 100644
--- a/paddle/gserver/layers/EosIdCheckLayer.cpp
+++ b/paddle/gserver/layers/EosIdCheckLayer.cpp
@@ -24,7 +24,7 @@ namespace paddle {
  * It is used by recurrent layer group.
  */
 class EosIdCheckLayer : public Layer {
-public:
+ public:
   explicit EosIdCheckLayer(const LayerConfig& config) : Layer(config) {}
 
   bool init(const LayerMap& layerMap,
diff --git a/paddle/gserver/layers/ExpandConvLayer.h b/paddle/gserver/layers/ExpandConvLayer.h
index be968155efd0b8f19503c996ccd329379c6b1104..6919ef71355a4c660b9ddd60bff75fee399cfaa9 100644
--- a/paddle/gserver/layers/ExpandConvLayer.h
+++ b/paddle/gserver/layers/ExpandConvLayer.h
@@ -29,7 +29,7 @@ namespace paddle {
  */
 
 class ExpandConvLayer : public ConvBaseLayer {
-public:
+ public:
   explicit ExpandConvLayer(const LayerConfig& config) : ConvBaseLayer(config) {}
 
   ~ExpandConvLayer() {}
@@ -42,7 +42,7 @@ public:
 
   size_t getOutputSize();
 
-protected:
+ protected:
   std::vector<TensorShape> inputShape_;
   std::vector<TensorShape> filterShape_;
   std::vector<TensorShape> outputShape_;
diff --git a/paddle/gserver/layers/ExpandLayer.h b/paddle/gserver/layers/ExpandLayer.h
index 04bbfcbd04931fa11d11a9fcc74f0e4f19767f1b..06bd4ef05ee206628d981fee8e7eec3c91b18b7a 100644
--- a/paddle/gserver/layers/ExpandLayer.h
+++ b/paddle/gserver/layers/ExpandLayer.h
@@ -37,7 +37,7 @@ namespace paddle {
  */
 
 class ExpandLayer : public Layer {
-protected:
+ protected:
   std::unique_ptr<Weight> biases_;
   /// if input[0] is dense data, ExpandLevel=kNonSeq;
   /// if input[0] is sequence data, ExpandLevel=kSeq
@@ -48,7 +48,7 @@ protected:
   /// of input[1]
   ICpuGpuVectorPtr expandStartsPos_;
 
-public:
+ public:
   explicit ExpandLayer(const LayerConfig& config) : Layer(config) {}
 
   ~ExpandLayer() {}
diff --git a/paddle/gserver/layers/FactorizationMachineLayer.h b/paddle/gserver/layers/FactorizationMachineLayer.h
index 684da4e65a461d46204c348b3374b0e9e00eb389..148abe238173dd44cd0fcf3f5cda732f70078706 100644
--- a/paddle/gserver/layers/FactorizationMachineLayer.h
+++ b/paddle/gserver/layers/FactorizationMachineLayer.h
@@ -42,7 +42,7 @@ namespace paddle {
  */
 
 class FactorizationMachineLayer : public Layer {
-protected:
+ protected:
   // The latent vectors, shape: (size, factorSize_)
   // Each row of the latentVectors_ matrix is the latent vector
   // corresponding to one input feature dimension
@@ -50,7 +50,7 @@ protected:
   // The hyperparameter that defines the dimensionality of the factorization
   size_t factorSize_;
 
-private:
+ private:
   // Store the square values of the letent vectors matrix
   MatrixPtr latentVectorsSquare_;
   // Store the square values of input matrix
@@ -65,7 +65,7 @@ private:
   // Negative identity matrix
   MatrixPtr negOnes_;
 
-public:
+ public:
   explicit FactorizationMachineLayer(const LayerConfig& config)
       : Layer(config) {}
   ~FactorizationMachineLayer() {}
diff --git a/paddle/gserver/layers/FeatureMapExpandLayer.cpp b/paddle/gserver/layers/FeatureMapExpandLayer.cpp
index 81b98da45bc4b9b8ef0723dd6ea2db809860e219..d95f0b9b3d13e8bff635373cb4d5705c2351bd97 100644
--- a/paddle/gserver/layers/FeatureMapExpandLayer.cpp
+++ b/paddle/gserver/layers/FeatureMapExpandLayer.cpp
@@ -38,11 +38,11 @@ namespace paddle {
  */
 
 class FeatureMapExpandLayer : public Layer {
-private:
+ private:
   int numFilters_;
   bool asRowVector_;
 
-public:
+ public:
   explicit FeatureMapExpandLayer(const LayerConfig& config) : Layer(config) {}
 
   ~FeatureMapExpandLayer() {}
diff --git a/paddle/gserver/layers/FullMatrixProjection.h b/paddle/gserver/layers/FullMatrixProjection.h
index 7c4cd1a7066d427f54e1a280a956acb025e6dc16..a27aa4a12327ac39ec3418a849b1230e13f759ee 100644
--- a/paddle/gserver/layers/FullMatrixProjection.h
+++ b/paddle/gserver/layers/FullMatrixProjection.h
@@ -28,14 +28,14 @@ namespace paddle {
  * The config file api is full_matrix_projection.
  */
 class FullMatrixProjection : public Projection {
-public:
+ public:
   FullMatrixProjection(const ProjectionConfig& config,
                        const ParameterPtr& parameter,
                        bool useGpu);
   virtual void forward();
   virtual void backward(const UpdateCallback& callback);
 
-protected:
+ protected:
   std::unique_ptr<Weight> weight_;
 };
 
diff --git a/paddle/gserver/layers/FullyConnectedLayer.h b/paddle/gserver/layers/FullyConnectedLayer.h
index e66aeeb7334c9c871749196d77474a02ecf82b09..e0f9d6ce55fbdf73e5507032c108c735bf04597b 100644
--- a/paddle/gserver/layers/FullyConnectedLayer.h
+++ b/paddle/gserver/layers/FullyConnectedLayer.h
@@ -28,11 +28,11 @@ namespace paddle {
  */
 
 class FullyConnectedLayer : public Layer {
-protected:
+ protected:
   WeightList weights_;
   std::unique_ptr<Weight> biases_;
 
-public:
+ public:
   explicit FullyConnectedLayer(const LayerConfig& config) : Layer(config) {}
   ~FullyConnectedLayer() {}
 
diff --git a/paddle/gserver/layers/GatedRecurrentLayer.h b/paddle/gserver/layers/GatedRecurrentLayer.h
index f0a3a823018f3943b0295c172b19d0fe9d0674b4..46508dc977bf1a6fd33dc1fb024bd1aed36a0ff3 100644
--- a/paddle/gserver/layers/GatedRecurrentLayer.h
+++ b/paddle/gserver/layers/GatedRecurrentLayer.h
@@ -47,7 +47,7 @@ namespace paddle {
  */
 
 class GatedRecurrentLayer : public Layer, public GruCompute {
-public:
+ public:
   explicit GatedRecurrentLayer(const LayerConfig& config) : Layer(config) {}
 
   bool init(const LayerMap& layerMap,
@@ -63,7 +63,7 @@ public:
 
   LayerStatePtr getState() override;
 
-protected:
+ protected:
   void forwardSequence(int batchSize,
                        size_t numSequences,
                        const int* starts,
@@ -79,7 +79,7 @@ protected:
                     MatrixPtr inputValue);
   void backwardBatch(int batchSize, MatrixPtr inputGrad);
 
-protected:
+ protected:
   std::unique_ptr<Weight> weight_;
   std::unique_ptr<Weight> gateWeight_;
   std::unique_ptr<Weight> stateWeight_;
diff --git a/paddle/gserver/layers/GetOutputLayer.cpp b/paddle/gserver/layers/GetOutputLayer.cpp
index f255681f3e678e51f069522f965fd2776680b595..7c1e3c407cca374c7aa238d07e2263c4a142b6a5 100644
--- a/paddle/gserver/layers/GetOutputLayer.cpp
+++ b/paddle/gserver/layers/GetOutputLayer.cpp
@@ -17,7 +17,7 @@ limitations under the License. */
 namespace paddle {
 
 class GetOutputLayer : public Layer {
-public:
+ public:
   explicit GetOutputLayer(const LayerConfig& config) : Layer(config) {}
 
   ~GetOutputLayer() {}
diff --git a/paddle/gserver/layers/GruCompute.h b/paddle/gserver/layers/GruCompute.h
index fb6bc56422002b4d4080ccb8438767b27ceef064..50006325ce9969c4941aaf28604260f0aeb9b97a 100644
--- a/paddle/gserver/layers/GruCompute.h
+++ b/paddle/gserver/layers/GruCompute.h
@@ -21,7 +21,7 @@ limitations under the License. */
 namespace paddle {
 
 class GruCompute {
-public:
+ public:
   void init(LayerConfig &config);
 
   template <bool useGpu>
@@ -33,7 +33,7 @@ public:
                 int frameSize,
                 int batchSize = 1);
 
-public:
+ public:
   hl_activation_mode_t activeNode_;
   hl_activation_mode_t activeGate_;
 };
diff --git a/paddle/gserver/layers/GruStepLayer.cpp b/paddle/gserver/layers/GruStepLayer.cpp
index 917c50250c1c04d6c8f113c8d42ef029e1028606..114f287411c2fccbc08b7da4c05462967c81b268 100644
--- a/paddle/gserver/layers/GruStepLayer.cpp
+++ b/paddle/gserver/layers/GruStepLayer.cpp
@@ -44,13 +44,13 @@ namespace paddle {
  * The config file api if gru_step_layer.
  */
 class GruStepLayer : public Layer, public GruCompute {
-protected:
+ protected:
   Argument gate_;
   Argument resetOutput_;
   std::unique_ptr<Weight> weight_;
   std::unique_ptr<Weight> bias_;
 
-public:
+ public:
   explicit GruStepLayer(const LayerConfig& config) : Layer(config) {}
 
   ~GruStepLayer() {}
diff --git a/paddle/gserver/layers/HierarchicalSigmoidLayer.h b/paddle/gserver/layers/HierarchicalSigmoidLayer.h
index 10e501f1807ef6ba03d326a1bcf257ede0ee850a..73ef252fd5a5443fe065f3b7bd8c49951ae0b4bd 100644
--- a/paddle/gserver/layers/HierarchicalSigmoidLayer.h
+++ b/paddle/gserver/layers/HierarchicalSigmoidLayer.h
@@ -58,7 +58,7 @@ namespace paddle {
  * The config file api is hsigmod_layer.
  */
 class HierarchicalSigmoidLayer : public Layer {
-public:
+ public:
   explicit HierarchicalSigmoidLayer(const LayerConfig& config)
       : Layer(config) {}
   bool init(const LayerMap& layerMap,
@@ -66,7 +66,7 @@ public:
   void forward(PassType passType) override;
   void backward(const UpdateCallback& callback) override;
 
-protected:
+ protected:
   /**
    * The last of inputs is label layer.
    */
diff --git a/paddle/gserver/layers/IdentityProjection.cpp b/paddle/gserver/layers/IdentityProjection.cpp
index 6c70f77acc0c890e11a4929ea013d7745d8bbed0..34e9eb90161f7942c528b70f177e30f301a8f53f 100644
--- a/paddle/gserver/layers/IdentityProjection.cpp
+++ b/paddle/gserver/layers/IdentityProjection.cpp
@@ -26,7 +26,7 @@ namespace paddle {
  * The config file api is identity_projection.
  */
 class IdentityProjection : public Projection {
-public:
+ public:
   IdentityProjection(const ProjectionConfig& config,
                      const ParameterPtr& parameter,
                      bool useGpu);
@@ -68,7 +68,7 @@ void IdentityProjection::backward(const UpdateCallback& callback) {
  * The config file api is identity_projection.
  */
 class IdentityOffsetProjection : public Projection {
-public:
+ public:
   IdentityOffsetProjection(const ProjectionConfig& config,
                            const ParameterPtr& parameter,
                            bool useGpu);
diff --git a/paddle/gserver/layers/InterpolationLayer.cpp b/paddle/gserver/layers/InterpolationLayer.cpp
index 0ac92024bc7eddf05ce023708537d0aa7bab6426..509c07cf22c9bcbe9283241b38540162b3dbe26b 100644
--- a/paddle/gserver/layers/InterpolationLayer.cpp
+++ b/paddle/gserver/layers/InterpolationLayer.cpp
@@ -33,12 +33,12 @@ namespace paddle {
  */
 
 class InterpolationLayer : public Layer {
-protected:
+ protected:
   /// weightLast = 1 - weight
   MatrixPtr weightLast_;
   MatrixPtr tmpMatrix;
 
-public:
+ public:
   explicit InterpolationLayer(const LayerConfig& config) : Layer(config) {}
 
   ~InterpolationLayer() {}
diff --git a/paddle/gserver/layers/KmaxSeqScoreLayer.cpp b/paddle/gserver/layers/KmaxSeqScoreLayer.cpp
index 0ea960902efc10007896b3f4ce915dea79d0d12d..7fd25954efeb9d9e672040f9909198f2ae3c0449 100644
--- a/paddle/gserver/layers/KmaxSeqScoreLayer.cpp
+++ b/paddle/gserver/layers/KmaxSeqScoreLayer.cpp
@@ -17,14 +17,14 @@ limitations under the License. */
 namespace paddle {
 
 class KmaxSeqScoreLayer : public Layer {
-private:
+ private:
   MatrixPtr scores_;
   size_t beamSize_;
   void kmaxScorePerSeq(const real* score,
                        real* sortedRes,
                        const ICpuGpuVectorPtr seqStartPos);
 
-public:
+ public:
   explicit KmaxSeqScoreLayer(const LayerConfig& config) : Layer(config) {}
 
   bool init(const LayerMap& layerMap,
diff --git a/paddle/gserver/layers/L2DistanceLayer.h b/paddle/gserver/layers/L2DistanceLayer.h
index 97f35daf7860fb3b082ef03203327e09dca67371..44e688e1377145845033d9d5cc3f31f5594a11f6 100644
--- a/paddle/gserver/layers/L2DistanceLayer.h
+++ b/paddle/gserver/layers/L2DistanceLayer.h
@@ -33,7 +33,7 @@ namespace paddle {
  */
 
 class L2DistanceLayer : public Layer {
-public:
+ public:
   explicit L2DistanceLayer(const LayerConfig& config) : Layer(config) {}
   ~L2DistanceLayer() {}
 
@@ -43,7 +43,7 @@ public:
   void forward(PassType passType) override;
   void backward(const UpdateCallback& callback = nullptr) override;
 
-private:
+ private:
   // Store the result of subtracting Input2 from Input1 in forward computation,
   // which will be reused in backward computation.
   MatrixPtr inputSub_;
diff --git a/paddle/gserver/layers/Layer.h b/paddle/gserver/layers/Layer.h
index 8da342a00f72ee1196c4af24104ce92c6bbf9f5c..13e20e8316323f9082a9615041584685853aa395 100644
--- a/paddle/gserver/layers/Layer.h
+++ b/paddle/gserver/layers/Layer.h
@@ -60,7 +60,7 @@ enum PADDLE_DEVICE_ID {
  * Define necessary variables and functions for every layer.
  */
 class Layer {
-protected:
+ protected:
   /// Layer config
   LayerConfig config_;
   /// whether to use GPU
@@ -112,7 +112,7 @@ protected:
   /// Layer backward function
   std::vector<std::shared_ptr<FunctionBase>> backward_;
 
-public:
+ public:
   /**
    * Wait until all input value ready.
    * Called before Layer::forward() function.
@@ -137,7 +137,7 @@ public:
    */
   virtual void markAllInputGrad();
 
-protected:
+ protected:
   /**
    * Create layer function. Function is called in forward or backward.
    * \param function, Layer::forward_ or Layer::backward_
@@ -252,7 +252,7 @@ protected:
    */
   void addOutputArgument(int deviceId);
 
-public:
+ public:
   explicit Layer(const LayerConfig& config, bool useGpu = FLAGS_use_gpu);
   virtual ~Layer() {}
 
@@ -490,7 +490,7 @@ public:
    */
   virtual void onPassEnd() {}
 
-protected:
+ protected:
   /**
    * Forward of activation function.
    */
diff --git a/paddle/gserver/layers/LinearChainCRF.h b/paddle/gserver/layers/LinearChainCRF.h
index 1ea4c7e105703b76601499bf3944648cdc98ec99..e802b701d0237bed44adc83273fe53c3e18c92ec 100644
--- a/paddle/gserver/layers/LinearChainCRF.h
+++ b/paddle/gserver/layers/LinearChainCRF.h
@@ -19,7 +19,7 @@ limitations under the License. */
 namespace paddle {
 
 class LinearChainCRF {
-public:
+ public:
   /**
    * The size of para must be \f$(numClasses + 2) * numClasses\f$.
    * The first numClasses values of para are for starting weights (\f$a\f$).
@@ -71,7 +71,7 @@ public:
    */
   MatrixPtr getXGrad() { return matGrad_; }
 
-protected:
+ protected:
   int numClasses_;
   MatrixPtr a_;
   MatrixPtr b_;
diff --git a/paddle/gserver/layers/LinearChainCTC.h b/paddle/gserver/layers/LinearChainCTC.h
index 0b774277dc8cf27f48c6905168cdea047365c99d..5b325a0deb0e9d8df241175159321e52f527f6c4 100644
--- a/paddle/gserver/layers/LinearChainCTC.h
+++ b/paddle/gserver/layers/LinearChainCTC.h
@@ -20,7 +20,7 @@ limitations under the License. */
 namespace paddle {
 
 class LinearChainCTC {
-public:
+ public:
   LinearChainCTC(int numClasses, bool normByTimes);
 
   // Calculate the negative log probability as loss
@@ -35,7 +35,7 @@ public:
                 int* labelSeq,
                 int labelSeqLen);
 
-protected:
+ protected:
   int numClasses_, blank_, totalSegments_, totalTime_;
   bool normByTimes_;
   bool isInvalid_;
diff --git a/paddle/gserver/layers/LstmCompute.h b/paddle/gserver/layers/LstmCompute.h
index b7d55eb1f984d102802cab87ba12ca9c69a2f4be..80fb01cd1885151c8d62a4b5dfdb4ba08327926d 100644
--- a/paddle/gserver/layers/LstmCompute.h
+++ b/paddle/gserver/layers/LstmCompute.h
@@ -21,7 +21,7 @@ limitations under the License. */
 namespace paddle {
 
 class LstmCompute {
-public:
+ public:
   void init(LayerConfig &config);
 
   /**
@@ -57,7 +57,7 @@ public:
                            hl_lstm_grad grad,
                            int frameSize);
 
-public:
+ public:
   hl_activation_mode_t activeNode_;
   hl_activation_mode_t activeGate_;
   hl_activation_mode_t activeState_;
diff --git a/paddle/gserver/layers/LstmLayer.h b/paddle/gserver/layers/LstmLayer.h
index 4568b13ade5555e3cff703ceda1bbce3007c409d..76dfe8146bf67a0b7b4fd4835851fae6ac38d80f 100644
--- a/paddle/gserver/layers/LstmLayer.h
+++ b/paddle/gserver/layers/LstmLayer.h
@@ -71,7 +71,7 @@ namespace paddle {
  */
 
 class LstmLayer : public Layer, public LstmCompute {
-public:
+ public:
   explicit LstmLayer(const LayerConfig &config) : Layer(config) {}
 
   bool init(const LayerMap &layerMap,
@@ -87,7 +87,7 @@ public:
 
   LayerStatePtr getState() override;
 
-protected:
+ protected:
   /**
    * @brief Compute lstm forward one sequence by one sequence.
    * @param batchSize The batchSize is not equal to the batch_size in
@@ -165,7 +165,7 @@ protected:
    */
   void getPrevBatchState(size_t numSequences);
 
-protected:
+ protected:
   /// Learned parameters, shape: (size, 4*size).
   /// The weight ([size, 4*size]) contains \f$W_{hi}, W_{hf}, W_{hc}, W_{ho}\f$.
   std::unique_ptr<Weight> weight_;
diff --git a/paddle/gserver/layers/LstmStepLayer.cpp b/paddle/gserver/layers/LstmStepLayer.cpp
index 8faaa1c4e138fe1ec04b1911449d05528bb5b8b5..c44768ddb2b903763288465325899d86176df73a 100644
--- a/paddle/gserver/layers/LstmStepLayer.cpp
+++ b/paddle/gserver/layers/LstmStepLayer.cpp
@@ -22,7 +22,7 @@ namespace paddle {
  * LstmStepLayer used in recurrent layer group.
  */
 class LstmStepLayer : public Layer, public LstmCompute {
-protected:
+ protected:
   Argument state_;
   Argument gate_;
   Argument stateActive_;
@@ -30,7 +30,7 @@ protected:
   MatrixPtr checkIgGrad_, checkFgGrad_, checkOgGrad_;
   std::unique_ptr<Weight> weight_;
 
-public:
+ public:
   explicit LstmStepLayer(const LayerConfig& config) : Layer(config) {}
 
   ~LstmStepLayer() {}
diff --git a/paddle/gserver/layers/MDLstmLayer.cpp b/paddle/gserver/layers/MDLstmLayer.cpp
index 7cfdb3ff25096ad06c09434cdee48b5f85d650af..22c28157c5a5b19aa54b3151a6c9a4cdcfb01765 100644
--- a/paddle/gserver/layers/MDLstmLayer.cpp
+++ b/paddle/gserver/layers/MDLstmLayer.cpp
@@ -19,7 +19,7 @@ limitations under the License. */
 namespace paddle {
 
 class CoordIterator {
-public:
+ public:
   std::vector<int> dims_;
   std::vector<bool> directions_;
   std::vector<int> curPos_;
@@ -51,7 +51,7 @@ public:
     }
   }
 
-public:
+ public:
   CoordIterator(std::vector<int> dim, std::vector<bool> directions)
       : dims_(dim), directions_(directions), end_(false) {
     CHECK_EQ(dims_.size(), directions_.size());
@@ -178,7 +178,7 @@ public:
  * */
 
 class MDLstmLayer : public LstmLayer {
-public:
+ public:
   explicit MDLstmLayer(const LayerConfig& config) : LstmLayer(config) {}
 
   bool init(const LayerMap& layerMap,
@@ -188,13 +188,13 @@ public:
 
   void backward(const UpdateCallback& callback) override;
 
-protected:
+ protected:
   void forwardOneSequence(int start, CoordIterator& coordIter);
   void backwardOneSequence(int start, CoordIterator& coordIter);
   void forwardGate2OutputSequence(int start, CoordIterator& coordIter);
   void backwardGate2OutputSequence(int start, CoordIterator& coordIter);
 
-protected:
+ protected:
   std::vector<Argument> frameInputGate_;
   std::vector<Argument> frameForgetGate_;
   std::vector<Argument> frameOutputGate_;
diff --git a/paddle/gserver/layers/MKLDNNAddtoLayer.h b/paddle/gserver/layers/MKLDNNAddtoLayer.h
index e40e2f2251a1b739958773b8e6dc95a70ed58c76..0b385e804fdbc74c8612031cf415d06f15ce311a 100644
--- a/paddle/gserver/layers/MKLDNNAddtoLayer.h
+++ b/paddle/gserver/layers/MKLDNNAddtoLayer.h
@@ -25,7 +25,7 @@ namespace paddle {
  * The config file api is mkldnn_addto
  */
 class MKLDNNAddtoLayer : public MKLDNNLayer {
-protected:
+ protected:
   // layer size == ic * ih * iw == oc * oh *ow, and can not be changed
   size_t layerSize_;
 
@@ -38,7 +38,7 @@ protected:
   std::vector<std::shared_ptr<mkldnn::primitive>> fwdBias_;
   std::shared_ptr<mkldnn::primitive> bwdBias_;
 
-public:
+ public:
   explicit MKLDNNAddtoLayer(const LayerConfig& config) : MKLDNNLayer(config) {}
 
   ~MKLDNNAddtoLayer() {}
@@ -59,7 +59,7 @@ public:
 
   void updateWeights(const UpdateCallback& callback) override;
 
-protected:
+ protected:
   void resetFwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
                        MKLDNNMatrixPtr& bias,
                        MKLDNNMatrixPtr& out);
diff --git a/paddle/gserver/layers/MKLDNNBase.h b/paddle/gserver/layers/MKLDNNBase.h
index d84e2859407711c13c475a19e140e2f5f51e61c2..786ceaf86086d7c04331641693181809ac019597 100644
--- a/paddle/gserver/layers/MKLDNNBase.h
+++ b/paddle/gserver/layers/MKLDNNBase.h
@@ -31,7 +31,7 @@ typedef enum {
  *
  */
 class CPUEngine {
-public:
+ public:
   static CPUEngine& Instance() {
     // Thread-safe in C++11.
     static CPUEngine myInstance;
@@ -46,12 +46,12 @@ public:
 
   mkldnn::engine& getEngine() { return cpuEngine_; }
 
-protected:
+ protected:
   CPUEngine() : cpuEngine_(mkldnn::engine::cpu, 0) {}
   //    CPUEngine() : cpuEngine_(mkldnn::engine::cpu_lazy, 0) {}
   ~CPUEngine() {}
 
-private:
+ private:
   mkldnn::engine cpuEngine_;
 };
 
@@ -60,7 +60,7 @@ private:
  *
  */
 class MKLDNNStream {
-public:
+ public:
   MKLDNNStream() : ready_(false) { resetState(); }
 
   virtual ~MKLDNNStream() {}
@@ -89,7 +89,7 @@ public:
     ready_ = true;
   }
 
-private:
+ private:
   bool ready_;
   std::shared_ptr<mkldnn::stream> stream_;
 };
diff --git a/paddle/gserver/layers/MKLDNNBatchNormLayer.h b/paddle/gserver/layers/MKLDNNBatchNormLayer.h
index 93e182206a1ab1f06087cb808bb266ddea1468c9..9aa20df98f30837e1b80b4269d05d85b7d99ba76 100644
--- a/paddle/gserver/layers/MKLDNNBatchNormLayer.h
+++ b/paddle/gserver/layers/MKLDNNBatchNormLayer.h
@@ -27,7 +27,7 @@ typedef mkldnn::batch_normalization_backward bn_bwd;
  * The config file api is mkldnn_batch_norm
  */
 class MKLDNNBatchNormLayer : public MKLDNNLayer {
-protected:
+ protected:
   // save forward primitive_desc, which can be used backward
   std::shared_ptr<bn_fwd::primitive_desc> fwdPD_;
 
@@ -62,7 +62,7 @@ protected:
   MKLDNNMatrixPtr mean_;
   MKLDNNMatrixPtr var_;
 
-public:
+ public:
   explicit MKLDNNBatchNormLayer(const LayerConfig& config)
       : MKLDNNLayer(config), useGlobalStats_(true), hasInitedWgt_(false) {}
 
@@ -88,7 +88,7 @@ public:
 
   void convertWeightsFromPaddle() override;
 
-protected:
+ protected:
   void initWeight();
   /**
    * cal moving mean and variance.
diff --git a/paddle/gserver/layers/MKLDNNConcatLayer.h b/paddle/gserver/layers/MKLDNNConcatLayer.h
index f7abdabfb51df27f8db4e6d4d88c80546eeba248..d7738df6c106c68f55b313f2d119e31c6e444cbf 100644
--- a/paddle/gserver/layers/MKLDNNConcatLayer.h
+++ b/paddle/gserver/layers/MKLDNNConcatLayer.h
@@ -25,7 +25,7 @@ namespace paddle {
  * The config file api is mkldnn_concat
  */
 class MKLDNNConcatLayer : public MKLDNNLayer {
-protected:
+ protected:
   std::vector<std::shared_ptr<mkldnn::primitive>> bwds_;
   // input channel numbers
   std::vector<int> channels_;
@@ -35,7 +35,7 @@ protected:
   // if axis_ == 1, concat channel (default)
   int axis_;
 
-public:
+ public:
   explicit MKLDNNConcatLayer(const LayerConfig& config)
       : MKLDNNLayer(config), axis_(1) {}
 
@@ -75,7 +75,7 @@ public:
     return totalSize;
   }
 
-protected:
+ protected:
   void resetFwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
                        MKLDNNMatrixPtr& out);
   void resetFwdPD(std::shared_ptr<mkldnn::concat::primitive_desc>& pd,
diff --git a/paddle/gserver/layers/MKLDNNConvLayer.h b/paddle/gserver/layers/MKLDNNConvLayer.h
index 29c8735fbb91e7418797874238eb87759420f181..d399035ed3ae2f411587c1fcf1799bb71c8de63e 100644
--- a/paddle/gserver/layers/MKLDNNConvLayer.h
+++ b/paddle/gserver/layers/MKLDNNConvLayer.h
@@ -28,7 +28,7 @@ typedef mkldnn::convolution_backward_data conv_bwdData;
  * The config file api is mkldnn_conv
  */
 class MKLDNNConvLayer : public MKLDNNLayer {
-protected:
+ protected:
   // padding height and width
   int ph_, pw_;
   // stride height and width
@@ -59,7 +59,7 @@ protected:
   std::unique_ptr<Weight> weight_;
   std::unique_ptr<Weight> biases_;
 
-public:
+ public:
   explicit MKLDNNConvLayer(const LayerConfig& config)
       : MKLDNNLayer(config), hasInitedWgt_(false), caffeMode_(true) {}
 
@@ -92,7 +92,7 @@ public:
                        << ", sw: " << sw_ << ", dh: " << dh_ << ", dw: " << dw_;
   }
 
-protected:
+ protected:
   /**
    * load the dims settings of this conv
    */
diff --git a/paddle/gserver/layers/MKLDNNFcLayer.h b/paddle/gserver/layers/MKLDNNFcLayer.h
index 0d41a4379d677f86f672852fec09b1241009597b..a704066cc818a6b33bd0eed4612d62b674fa72ca 100644
--- a/paddle/gserver/layers/MKLDNNFcLayer.h
+++ b/paddle/gserver/layers/MKLDNNFcLayer.h
@@ -28,7 +28,7 @@ typedef mkldnn::inner_product_backward_data fc_bwdData;
  * The config file api is mkldnn_fc
  */
 class MKLDNNFcLayer : public MKLDNNLayer {
-protected:
+ protected:
   // input layer size, can not be change after init
   size_t iLayerSize_;  // == ic * ih * iw
 
@@ -42,7 +42,7 @@ protected:
   std::unique_ptr<Weight> weight_;
   std::unique_ptr<Weight> biases_;
 
-public:
+ public:
   explicit MKLDNNFcLayer(const LayerConfig& config)
       : MKLDNNLayer(config), hasInitedWgt_(false) {}
 
@@ -68,7 +68,7 @@ public:
 
   void convertWeightsToPaddle() override;
 
-protected:
+ protected:
   void resetFwdBuffers(MKLDNNMatrixPtr& in,
                        MKLDNNMatrixPtr& wgt,
                        MKLDNNMatrixPtr& bias,
diff --git a/paddle/gserver/layers/MKLDNNLRNLayer.h b/paddle/gserver/layers/MKLDNNLRNLayer.h
index b503ee55947294d7c44d1760058f8c26bceed142..028438f2c93b2182318c53cd348351376d491e79 100644
--- a/paddle/gserver/layers/MKLDNNLRNLayer.h
+++ b/paddle/gserver/layers/MKLDNNLRNLayer.h
@@ -27,7 +27,7 @@ typedef mkldnn::lrn_backward lrn_bwd;
  * The config file api is mkldnn_lrn
  */
 class MKLDNNLRNLayer : public MKLDNNLayer {
-protected:
+ protected:
   // save forward primitive_desc, which can be used in backward
   std::shared_ptr<lrn_fwd::primitive_desc> fwdPD_;
   // according to https://github.com/01org/mkl-dnn/blob/master/tests/gtests/
@@ -37,7 +37,7 @@ protected:
   int localSize_;
   float alpha_, beta_;  // scale and pow in paddle
 
-public:
+ public:
   explicit MKLDNNLRNLayer(const LayerConfig& config) : MKLDNNLayer(config) {}
 
   ~MKLDNNLRNLayer() {}
@@ -56,7 +56,7 @@ public:
                 std::vector<MKLDNNMatrixPtr>& inputs,
                 MKLDNNMatrixPtr& out) override;
 
-protected:
+ protected:
   void resetFwdBuffers(MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out);
   void resetFwdPD(std::shared_ptr<lrn_fwd::primitive_desc>& pd,
                   MKLDNNMatrixPtr in,
diff --git a/paddle/gserver/layers/MKLDNNLayer.h b/paddle/gserver/layers/MKLDNNLayer.h
index 4a7eb74ce3a13ed38be3548d8ce34382c594205a..2b164d0d3bc0e1446d7e4d82bb8a713195dbd927 100644
--- a/paddle/gserver/layers/MKLDNNLayer.h
+++ b/paddle/gserver/layers/MKLDNNLayer.h
@@ -33,7 +33,7 @@ typedef std::shared_ptr<MKLDNNLayer> MKLDNNLayerPtr;
  *
  */
 class MKLDNNLayer : public Layer {
-protected:
+ protected:
   // batch size
   int bs_;
   // their sizes are always from the first input layer
@@ -95,7 +95,7 @@ protected:
   // tmp input argument to save input grad, only used to merge grad
   Argument tmpInArg_;
 
-public:
+ public:
   explicit MKLDNNLayer(const LayerConfig& config)
       : Layer(config),
         ih_(0),
@@ -162,7 +162,7 @@ public:
    */
   void addOutputArgument(int deviceId) { Layer::addOutputArgument(deviceId); }
 
-protected:
+ protected:
   /**
    * Some layers may have different condition to reset the forward.
    * The function returns the condition that do not need reset forward.
@@ -233,7 +233,7 @@ protected:
    */
   void resetMergeGrad(MKLDNNMatrixPtr& out);
 
-protected:
+ protected:
   /**
    * Set deviceId of this layer.
    */
@@ -340,7 +340,7 @@ protected:
     }
   }
 
-private:
+ private:
   /**
    * clear all grad
    */
diff --git a/paddle/gserver/layers/MKLDNNPoolLayer.h b/paddle/gserver/layers/MKLDNNPoolLayer.h
index 12821cda7308602dd2fe834f52c614e6112b7cea..1eb0ee4ad946f61e32b7d4f4fd376dda89d6acf7 100644
--- a/paddle/gserver/layers/MKLDNNPoolLayer.h
+++ b/paddle/gserver/layers/MKLDNNPoolLayer.h
@@ -27,7 +27,7 @@ typedef mkldnn::pooling_backward pool_bwd;
  * The config file api is mkldnn_pool
  */
 class MKLDNNPoolLayer : public MKLDNNLayer {
-protected:
+ protected:
   // padding height and width
   int ph_, pw_;
   // stride height and width
@@ -44,7 +44,7 @@ protected:
   // test_pooling_forward.cpp, pool need workspace for backward
   std::shared_ptr<mkldnn::memory> workspace_;
 
-public:
+ public:
   explicit MKLDNNPoolLayer(const LayerConfig& config) : MKLDNNLayer(config) {}
 
   ~MKLDNNPoolLayer() {}
@@ -70,7 +70,7 @@ public:
                        << ", sw: " << sw_;
   }
 
-protected:
+ protected:
   void resetFwdBuffers(MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out);
   void resetFwdPD(std::shared_ptr<pool_fwd::primitive_desc>& pd,
                   MKLDNNMatrixPtr in,
diff --git a/paddle/gserver/layers/MKLPackedRecurrentLayer.h b/paddle/gserver/layers/MKLPackedRecurrentLayer.h
index 37eb362d45215edc736984f8da784fe74bb43f2b..441025a9c9d75786b17db84c74995a96b6a06ea8 100644
--- a/paddle/gserver/layers/MKLPackedRecurrentLayer.h
+++ b/paddle/gserver/layers/MKLPackedRecurrentLayer.h
@@ -29,7 +29,7 @@ namespace paddle {
  */
 
 class MKLPackedRecurrentLayer : public RecurrentLayer {
-public:
+ public:
   explicit MKLPackedRecurrentLayer(const LayerConfig& config)
       : RecurrentLayer(config) {}
 
@@ -38,7 +38,7 @@ public:
 
   void backward(const UpdateCallback& callback) override;
 
-protected:
+ protected:
   void forwardBatch(int batchSize,
                     size_t numSequences,
                     const int* starts) override;
@@ -47,7 +47,7 @@ protected:
                      size_t numSequences,
                      const int* starts) override;
 
-protected:
+ protected:
   /// packed_weight_ contains same data with
   /// RecurrentLayer::weight_ but is packed
   std::unique_ptr<MKLPackedWeight> packed_weight_;
diff --git a/paddle/gserver/layers/MKLPackedWeight.h b/paddle/gserver/layers/MKLPackedWeight.h
index 28b8a7db7cc3d2be12d6ce9291de1e415cf77bbc..b01a961d007a0e2e343db7b51e50fd3ee776435e 100644
--- a/paddle/gserver/layers/MKLPackedWeight.h
+++ b/paddle/gserver/layers/MKLPackedWeight.h
@@ -21,7 +21,7 @@ limitations under the License. */
 namespace paddle {
 
 class MKLPackedWeight {
-protected:
+ protected:
   /// The pointer of weight
   real *weight_;
   /// The pointer of cblas packed gemm to weight
@@ -30,7 +30,7 @@ protected:
   size_t width_;
   bool transW_;
 
-public:
+ public:
   explicit MKLPackedWeight(MatrixPtr weight, bool transW = false) {
     packedWeight_ = nullptr;
     weight_ = weight->getData();
@@ -59,7 +59,7 @@ public:
                         dst->getWidth());
   }
 
-protected:
+ protected:
   void pack_(real *src) {
     if (!packedWeight_) {
       packedWeight_ = cblas_sgemm_alloc(CblasBMatrix, 1, width_, height_);
diff --git a/paddle/gserver/layers/MaxIdLayer.cpp b/paddle/gserver/layers/MaxIdLayer.cpp
index 84e375d7441ce3ccd8a5df94df22d85d104b5d96..eecd4996e962857b09001a1bb36bc027cbaa4308 100644
--- a/paddle/gserver/layers/MaxIdLayer.cpp
+++ b/paddle/gserver/layers/MaxIdLayer.cpp
@@ -23,11 +23,11 @@ namespace paddle {
  * The config file api is maxid_layer.
  */
 class MaxIdLayer : public Layer {
-private:
+ private:
   /// a predetermined number of best states at each level
   size_t beamSize_;
 
-public:
+ public:
   explicit MaxIdLayer(const LayerConfig& config) : Layer(config) {}
 
   bool init(const LayerMap& layerMap,
diff --git a/paddle/gserver/layers/MaxLayer.h b/paddle/gserver/layers/MaxLayer.h
index 9dbc672652dc2670a775f02ecd3a9de9919c8ae0..e46f997c342ce5d6b724629dff6950c4f1680ce8 100644
--- a/paddle/gserver/layers/MaxLayer.h
+++ b/paddle/gserver/layers/MaxLayer.h
@@ -39,11 +39,11 @@ namespace paddle {
  */
 
 class MaxLayer : public SequencePoolLayer {
-protected:
+ protected:
   // maxIndex_[i][j] = k : the value at (i, j) is from input[k].
   IVectorPtr maxIndex_;
 
-public:
+ public:
   explicit MaxLayer(const LayerConfig& config) : SequencePoolLayer(config) {}
 
   bool init(const LayerMap& layerMap,
diff --git a/paddle/gserver/layers/MaxOutLayer.h b/paddle/gserver/layers/MaxOutLayer.h
index 1fb371836bacb9e02cc32eabfd21bf24165b0734..0eb8674b4c4f3f58b103c6b59ad13931a6992a1b 100644
--- a/paddle/gserver/layers/MaxOutLayer.h
+++ b/paddle/gserver/layers/MaxOutLayer.h
@@ -29,7 +29,7 @@ namespace paddle {
  */
 
 class MaxOutLayer : public Layer {
-protected:
+ protected:
   size_t groups_;
   size_t imgSizeH_, imgSizeW_;
   /// outputChannels_ = channels_ / groups_
@@ -38,7 +38,7 @@ protected:
   size_t featLen_;
   IVectorPtr maxoutId_;
 
-public:
+ public:
   /// return imgSizeH_ * imgSizeW_ * outputChannels_;
   size_t getSize();
 
diff --git a/paddle/gserver/layers/MaxPoolWithMaskLayer.h b/paddle/gserver/layers/MaxPoolWithMaskLayer.h
index 74cc8acf3515b10257ffb185061344fbcc94a337..c948364f6b83b0de1ee07cc185b69346f5cb1a7e 100644
--- a/paddle/gserver/layers/MaxPoolWithMaskLayer.h
+++ b/paddle/gserver/layers/MaxPoolWithMaskLayer.h
@@ -23,10 +23,10 @@ namespace paddle {
  * @brief Basic parent layer of different kinds of pooling
  */
 class MaxPoolWithMaskLayer : public PoolLayer {
-protected:
+ protected:
   Argument mask_;
 
-public:
+ public:
   explicit MaxPoolWithMaskLayer(const LayerConfig& config)
       : PoolLayer(config) {}
 
diff --git a/paddle/gserver/layers/MixedLayer.h b/paddle/gserver/layers/MixedLayer.h
index a1a43c52e4f503178a66ad8aa6c12bec89566081..43ee2bd81854f2dea837734f556c197613f6fdaf 100644
--- a/paddle/gserver/layers/MixedLayer.h
+++ b/paddle/gserver/layers/MixedLayer.h
@@ -30,7 +30,7 @@ namespace paddle {
  * The config file api is mixed_layer.
  */
 class MixedLayer : public Layer {
-public:
+ public:
   explicit MixedLayer(const LayerConfig& config) : Layer(config) {}
 
   ~MixedLayer() {}
@@ -52,7 +52,7 @@ public:
    */
   LayerStatePtr getState() override;
 
-protected:
+ protected:
   std::vector<std::unique_ptr<Projection>> projections_;
   std::vector<std::unique_ptr<Operator>> operators_;
   /// the matrix size of projection state
diff --git a/paddle/gserver/layers/MultiBoxLossLayer.h b/paddle/gserver/layers/MultiBoxLossLayer.h
index 9935da56446c1508549906becfd28548d5deecde..a358cded00bb01bfe5d02f9a6d8a24e4b2e51b74 100644
--- a/paddle/gserver/layers/MultiBoxLossLayer.h
+++ b/paddle/gserver/layers/MultiBoxLossLayer.h
@@ -41,7 +41,7 @@ namespace paddle {
  */
 
 class MultiBoxLossLayer : public CostLayer {
-public:
+ public:
   explicit MultiBoxLossLayer(const LayerConfig& config) : CostLayer(config) {}
 
   bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
@@ -54,7 +54,7 @@ public:
 
   void backwardImp(Matrix& outputValue, Argument& label, Matrix& outputGrad) {}
 
-protected:
+ protected:
   inline LayerPtr getPriorBoxLayer() { return inputLayers_[0]; }
   inline LayerPtr getLabelLayer() { return inputLayers_[1]; }
   inline LayerPtr getLocInputLayer(size_t index) {
@@ -64,7 +64,7 @@ protected:
     return inputLayers_[2 + inputNum_ + index];
   }
 
-protected:
+ protected:
   size_t numClasses_;
   real overlapThreshold_;
   real negPosRatio_;
diff --git a/paddle/gserver/layers/MultinomialSampler.h b/paddle/gserver/layers/MultinomialSampler.h
index 1f9e818ee5d21188e3bd39d1225912a1a2ae1598..8cbb229f157c0904e63a696f860ec6739d5167c4 100644
--- a/paddle/gserver/layers/MultinomialSampler.h
+++ b/paddle/gserver/layers/MultinomialSampler.h
@@ -29,7 +29,7 @@ namespace paddle {
  * The computational complexity of generate one sample is O(1).
  */
 class MultinomialSampler {
-public:
+ public:
   MultinomialSampler(const real* prob, int size);
 
   //! protobuf always using double.
@@ -53,7 +53,7 @@ public:
     return gen1([&g, this]() { return rand_(g); });
   }
 
-protected:
+ protected:
   /**
    * @brief Generation
    * @param[in] rand rand is a real random number distribution
diff --git a/paddle/gserver/layers/MultiplexLayer.cpp b/paddle/gserver/layers/MultiplexLayer.cpp
index 82857f8c3ef3e39ec451c1f26bac4996c12350a5..43ecc48cd97fb54d8dc4eb1d87ebf60f5aa040d8 100644
--- a/paddle/gserver/layers/MultiplexLayer.cpp
+++ b/paddle/gserver/layers/MultiplexLayer.cpp
@@ -37,7 +37,7 @@ namespace paddle {
  */
 
 class MultiplexLayer : public Layer {
-protected:
+ protected:
   /**
    * @brief A struct is used to save the copy information, includes input
    * layer index and copy size.
@@ -64,7 +64,7 @@ protected:
   /// Temporary matrix pointer to point to output data.
   MatrixPtr tmpDest_;
 
-public:
+ public:
   explicit MultiplexLayer(const LayerConfig& config) : Layer(config) {}
 
   ~MultiplexLayer() {}
@@ -75,7 +75,7 @@ public:
   void forward(PassType passType) override;
   void backward(const UpdateCallback& callback = nullptr) override;
 
-private:
+ private:
   /**
    * @brief Calculate copy info for input layers.
    */
diff --git a/paddle/gserver/layers/NCELayer.cpp b/paddle/gserver/layers/NCELayer.cpp
index d3d7b1fd9ac3c366d11c3060848e89c24a16a70b..cc48fe100f12446f9522078119ae2ead039a82cc 100644
--- a/paddle/gserver/layers/NCELayer.cpp
+++ b/paddle/gserver/layers/NCELayer.cpp
@@ -54,7 +54,7 @@ class NCELayer : public Layer {
 
   IVectorPtr labelIds_;
 
-public:
+ public:
   explicit NCELayer(const LayerConfig& config)
       : Layer(config),
         numClasses_(config.num_classes()),
diff --git a/paddle/gserver/layers/NormLayer.h b/paddle/gserver/layers/NormLayer.h
index c89cbbfce9d9e35a6dd300864ee094ef8f9e283a..3807584415f99a7110170748501589dac85eac52 100644
--- a/paddle/gserver/layers/NormLayer.h
+++ b/paddle/gserver/layers/NormLayer.h
@@ -27,7 +27,7 @@ namespace paddle {
  * @note Normalize the input in local region
  */
 class NormLayer : public Layer {
-public:
+ public:
   explicit NormLayer(const LayerConfig& config) : Layer(config) {}
 
   bool init(const LayerMap& layerMap,
@@ -49,12 +49,12 @@ public:
  * Need to implement in the futrue.
  */
 class ResponseNormLayer : public NormLayer {
-protected:
+ protected:
   size_t channels_, size_, outputX_, imgSize_, outputY_, imgSizeY_;
   real scale_, pow_;
   MatrixPtr denoms_;
 
-public:
+ public:
   explicit ResponseNormLayer(const LayerConfig& config) : NormLayer(config) {}
 
   bool init(const LayerMap& layerMap,
@@ -76,7 +76,7 @@ public:
  *    Cheng-Yang Fu, Alexander C. Berg. SSD: Single Shot MultiBox Detector
  */
 class CrossChannelNormLayer : public NormLayer {
-public:
+ public:
   explicit CrossChannelNormLayer(const LayerConfig& config)
       : NormLayer(config) {}
   bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
@@ -85,7 +85,7 @@ public:
   MatrixPtr createSampleMatrix(MatrixPtr data, size_t iter, size_t spatialDim);
   MatrixPtr createSpatialMatrix(MatrixPtr data, size_t iter, size_t spatialDim);
 
-protected:
+ protected:
   size_t channels_;
   std::unique_ptr<Weight> scale_;
   MatrixPtr scaleDiff_;
diff --git a/paddle/gserver/layers/NormProjectionLayer.h b/paddle/gserver/layers/NormProjectionLayer.h
index 898b5823a9011c4b66e045c54afba070dd5cf772..64803a1603599f2e393ec772a32d64f4d271fe71 100644
--- a/paddle/gserver/layers/NormProjectionLayer.h
+++ b/paddle/gserver/layers/NormProjectionLayer.h
@@ -28,7 +28,7 @@ class CMRProjectionNormLayer : public ResponseNormLayer {
   size_t imgSizeH_, imgSizeW_;
   size_t outputH_, outputW_;
 
-public:
+ public:
   explicit CMRProjectionNormLayer(const LayerConfig& config)
       : ResponseNormLayer(config) {}
 
@@ -41,7 +41,7 @@ public:
   void forward(PassType passType) override;
   void backward(const UpdateCallback& callback = nullptr) override;
 
-protected:
+ protected:
   TensorShape shape_;
 };
 }  // namespace paddle
diff --git a/paddle/gserver/layers/Operator.h b/paddle/gserver/layers/Operator.h
index a620926cccd3004d7bef57976047a190b4b566e2..42d525ef3e4534acea7512d5ecdbe8a0e1d110d9 100644
--- a/paddle/gserver/layers/Operator.h
+++ b/paddle/gserver/layers/Operator.h
@@ -34,7 +34,7 @@ namespace paddle {
  * @note: Operator can't have parameters.
  */
 class Operator {
-public:
+ public:
   static Operator* create(const OperatorConfig& config, bool useGpu);
 
   Operator(const OperatorConfig& config, bool useGpu)
@@ -81,7 +81,7 @@ public:
    */
   virtual LayerStatePtr getState() { return nullptr; }
 
-protected:
+ protected:
   /// Config of operator
   OperatorConfig config_;
   bool useGpu_;
diff --git a/paddle/gserver/layers/OuterProdLayer.cpp b/paddle/gserver/layers/OuterProdLayer.cpp
index 75f4abf93e5db11dc688f8f2e0b2a36bf70fbccc..11a910f3316114b309efe9007a156e842b3d6229 100644
--- a/paddle/gserver/layers/OuterProdLayer.cpp
+++ b/paddle/gserver/layers/OuterProdLayer.cpp
@@ -28,12 +28,12 @@ namespace paddle {
  */
 
 class OuterProdLayer : public Layer {
-protected:
+ protected:
   MatrixPtr tmpMtx0;
   MatrixPtr tmpRow0;
   MatrixPtr tmpRow1;
 
-public:
+ public:
   explicit OuterProdLayer(const LayerConfig& config) : Layer(config) {}
 
   ~OuterProdLayer() {}
diff --git a/paddle/gserver/layers/PadLayer.h b/paddle/gserver/layers/PadLayer.h
index 7e09d7f8a0d4dfd5300298ad0514b69781d87016..46b8a595978489c630b3ff2429ecb19d7c12521a 100644
--- a/paddle/gserver/layers/PadLayer.h
+++ b/paddle/gserver/layers/PadLayer.h
@@ -24,7 +24,7 @@ namespace paddle {
  *         the 4th dimenstion according padc_, padh_ and padw_.
  */
 class PadLayer : public Layer {
-public:
+ public:
   explicit PadLayer(const LayerConfig& config) : Layer(config) {}
 
   ~PadLayer() {}
@@ -34,7 +34,7 @@ public:
   void forward(PassType passType) override;
   void backward(const UpdateCallback& callback = nullptr) override;
 
-protected:
+ protected:
   void setOutDims(const size_t batchSize);
   void setTensorDim(const size_t batchSize);
 
diff --git a/paddle/gserver/layers/ParameterReluLayer.h b/paddle/gserver/layers/ParameterReluLayer.h
index 3725fa4a1199285b703590255af492ebffdaab2c..4553413fcdbecbc83e1f50e8ffbe874fdf05d828 100644
--- a/paddle/gserver/layers/ParameterReluLayer.h
+++ b/paddle/gserver/layers/ParameterReluLayer.h
@@ -36,7 +36,7 @@ namespace paddle {
  */
 
 class ParameterReluLayer : public Layer {
-protected:
+ protected:
   std::unique_ptr<Weight> weight_;
 
   /**
@@ -51,7 +51,7 @@ protected:
    */
   size_t partialSum_;
 
-public:
+ public:
   explicit ParameterReluLayer(const LayerConfig& config) : Layer(config) {}
 
   ~ParameterReluLayer() {}
diff --git a/paddle/gserver/layers/Pool3DLayer.h b/paddle/gserver/layers/Pool3DLayer.h
index 59ee73f7cb9fb4287c12f3c7d0cacfc812484770..32605f8b7028cfb4909c885e83017a8cffa79575 100644
--- a/paddle/gserver/layers/Pool3DLayer.h
+++ b/paddle/gserver/layers/Pool3DLayer.h
@@ -26,7 +26,7 @@ namespace paddle {
  * Pools the input within regions
  */
 class Pool3DLayer : public Layer {
-public:
+ public:
   explicit Pool3DLayer(const LayerConfig& config) : Layer(config) {}
   ~Pool3DLayer() {}
 
@@ -36,7 +36,7 @@ public:
   void backward(const UpdateCallback& callback) override;
   size_t getSize();
 
-protected:
+ protected:
   int channels_;
   int sizeX_, sizeY_, sizeZ_;
   int strideW_, strideH_, strideD_;
diff --git a/paddle/gserver/layers/PoolLayer.h b/paddle/gserver/layers/PoolLayer.h
index 58d5fb0a095e8326f9b6f9cb2a97bb88022ceed8..99f8f148e2eb00f7e431e7d8c5acbf9e27574017 100644
--- a/paddle/gserver/layers/PoolLayer.h
+++ b/paddle/gserver/layers/PoolLayer.h
@@ -26,7 +26,7 @@ namespace paddle {
  * Pools the input within regions
  */
 class PoolLayer : public Layer {
-protected:
+ protected:
   size_t channels_, sizeX_, stride_, outputX_, imgSize_;
   int confPadding_;
 
@@ -40,7 +40,7 @@ protected:
 
   bool excludeMode_;
 
-public:
+ public:
   explicit PoolLayer(const LayerConfig& config) : Layer(config) {}
 
   /**
diff --git a/paddle/gserver/layers/PoolProjection.h b/paddle/gserver/layers/PoolProjection.h
index c99287dbf0f4503c180b9b4e9e46abafa67bf64d..8004cc1550337160b7f022c97a23ed8eb9d43ca4 100644
--- a/paddle/gserver/layers/PoolProjection.h
+++ b/paddle/gserver/layers/PoolProjection.h
@@ -20,7 +20,7 @@ limitations under the License. */
 namespace paddle {
 
 class PoolProjection : public Projection {
-protected:
+ protected:
   size_t imgSizeY_, imgSize_;
   size_t outputY_, outputX_;
   size_t strideY_, stride_;
@@ -30,7 +30,7 @@ protected:
   std::string poolType_;
   bool excludeMode_;
 
-public:
+ public:
   PoolProjection(const ProjectionConfig& config,
                  ParameterPtr parameter,
                  bool useGpu);
@@ -45,7 +45,7 @@ public:
 };
 
 class MaxPoolProjection : public PoolProjection {
-public:
+ public:
   MaxPoolProjection(const ProjectionConfig& config,
                     ParameterPtr parameter,
                     bool useGpu)
@@ -56,7 +56,7 @@ public:
 };
 
 class AvgPoolProjection : public PoolProjection {
-public:
+ public:
   AvgPoolProjection(const ProjectionConfig& config,
                     ParameterPtr parameter,
                     bool useGpu)
diff --git a/paddle/gserver/layers/PoolProjectionLayer.h b/paddle/gserver/layers/PoolProjectionLayer.h
index 5a97a7769aaeebcfd4fe2c10d8ac0cc8892f68e3..9ad144cc2ad426caa522bf1061a750d47e64a755 100644
--- a/paddle/gserver/layers/PoolProjectionLayer.h
+++ b/paddle/gserver/layers/PoolProjectionLayer.h
@@ -24,13 +24,13 @@ namespace paddle {
  * @brief Basic parent layer of different kinds of pooling
  */
 class PoolProjectionLayer : public PoolLayer {
-protected:
+ protected:
   size_t imgSizeH_, imgSizeW_;
   size_t outputH_, outputW_;
   std::unique_ptr<PoolProjection> poolProjection_;
   ProjectionConfig projectionConfig_;
 
-public:
+ public:
   explicit PoolProjectionLayer(const LayerConfig& config) : PoolLayer(config) {
     PoolConfig* conf = projectionConfig_.mutable_pool_conf();
     *conf = config_.inputs(0).pool_conf();
diff --git a/paddle/gserver/layers/PowerLayer.cpp b/paddle/gserver/layers/PowerLayer.cpp
index 18f650fcdaded5ad7199510594b873fc18c3d7b5..7e8d60db8fe588026c6040099745c3aefd7237b5 100644
--- a/paddle/gserver/layers/PowerLayer.cpp
+++ b/paddle/gserver/layers/PowerLayer.cpp
@@ -32,10 +32,10 @@ namespace paddle {
  */
 
 class PowerLayer : public Layer {
-protected:
+ protected:
   MatrixPtr tmpMtx;
 
-public:
+ public:
   explicit PowerLayer(const LayerConfig& config) : Layer(config) {}
 
   ~PowerLayer() {}
diff --git a/paddle/gserver/layers/PrintLayer.cpp b/paddle/gserver/layers/PrintLayer.cpp
index 5a527d598dd5e11ae0b74a32c9b9884e73ed45a8..6fbcc447f92208439bddd14d421d62cab30d81f4 100644
--- a/paddle/gserver/layers/PrintLayer.cpp
+++ b/paddle/gserver/layers/PrintLayer.cpp
@@ -17,7 +17,7 @@ limitations under the License. */
 namespace paddle {
 
 class PrintLayer : public Layer {
-public:
+ public:
   explicit PrintLayer(const LayerConfig& config) : Layer(config) {}
 
   void forward(PassType passType) override {
diff --git a/paddle/gserver/layers/PriorBox.cpp b/paddle/gserver/layers/PriorBox.cpp
index af2cc05a954b3a6857c1015104a57339282840b8..39d2c2d737fa90737635efdb209610e156c8662f 100644
--- a/paddle/gserver/layers/PriorBox.cpp
+++ b/paddle/gserver/layers/PriorBox.cpp
@@ -28,7 +28,7 @@ namespace paddle {
  */
 
 class PriorBoxLayer : public Layer {
-public:
+ public:  // NOLINT
   explicit PriorBoxLayer(const LayerConfig& config) : Layer(config) {}
   bool init(const LayerMap& layerMap,
             const ParameterMap& parameterMap) override;
@@ -36,7 +36,7 @@ public:
   void forward(PassType passType) override;
   void backward(const UpdateCallback& callback) override {}
 
-protected:
+ protected:  // NOLINT
   int numPriors_;
   std::vector<int> minSize_;
   std::vector<int> maxSize_;
@@ -109,11 +109,18 @@ void PriorBoxLayer::forward(PassType passType) {
         real boxWidth = minSize;
         real boxHeight = minSize;
 
-        // priors with different aspect ratios
-        for (size_t r = 0; r < aspectRatio_.size(); r++) {
-          real ar = aspectRatio_[r];
-          boxWidth = minSize * sqrt(ar);
-          boxHeight = minSize / sqrt(ar);
+        // first prior: aspect_ratio == 1.0, compatible to old logic
+        tmpPtr[idx++] = (centerX - boxWidth / 2.) / imageWidth;
+        tmpPtr[idx++] = (centerY - boxHeight / 2.) / imageHeight;
+        tmpPtr[idx++] = (centerX + boxWidth / 2.) / imageWidth;
+        tmpPtr[idx++] = (centerY + boxHeight / 2.) / imageHeight;
+        // set the variance.
+        for (int t = 0; t < 4; t++) tmpPtr[idx++] = variance_[t];
+
+        if (maxSize_.size() > 0) {
+          // square prior with size sqrt(minSize * maxSize)
+          real maxSize = maxSize_[s];
+          boxWidth = boxHeight = sqrt(minSize * maxSize);
           tmpPtr[idx++] = (centerX - boxWidth / 2.) / imageWidth;
           tmpPtr[idx++] = (centerY - boxHeight / 2.) / imageHeight;
           tmpPtr[idx++] = (centerX + boxWidth / 2.) / imageWidth;
@@ -122,10 +129,14 @@ void PriorBoxLayer::forward(PassType passType) {
           for (int t = 0; t < 4; t++) tmpPtr[idx++] = variance_[t];
         }
 
-        if (maxSize_.size() > 0) {
-          // square prior with size sqrt(minSize * maxSize)
-          real maxSize = maxSize_[s];
-          boxWidth = boxHeight = sqrt(minSize * maxSize);
+        // priors with different aspect ratios
+        for (size_t r = 0; r < aspectRatio_.size(); r++) {
+          real ar = aspectRatio_[r];
+          if (fabs(ar - 1.0) < 1e-6) {
+            continue;
+          }
+          boxWidth = minSize * sqrt(ar);
+          boxHeight = minSize / sqrt(ar);
           tmpPtr[idx++] = (centerX - boxWidth / 2.) / imageWidth;
           tmpPtr[idx++] = (centerY - boxHeight / 2.) / imageHeight;
           tmpPtr[idx++] = (centerX + boxWidth / 2.) / imageWidth;
diff --git a/paddle/gserver/layers/Projection.h b/paddle/gserver/layers/Projection.h
index 1f0b96c79ec7313cd9c5ff9139a455b3269b222b..88a41355cfce711e1e9522655058d0f1198e4e76 100644
--- a/paddle/gserver/layers/Projection.h
+++ b/paddle/gserver/layers/Projection.h
@@ -37,7 +37,7 @@ namespace paddle {
  * to output Argument.
  */
 class Projection {
-public:
+ public:
   static Projection* create(const ProjectionConfig& config,
                             ParameterPtr parameter,
                             bool useGpu);
@@ -98,7 +98,7 @@ public:
    */
   size_t getOutputSize() const { return config_.output_size(); }
 
-protected:
+ protected:
   /**
    * Create layer function. Function is called in forward or backward.
    * \param function, Layer::forward_ or Layer::backward_
@@ -119,7 +119,7 @@ protected:
     func->init(config);
   }
 
-protected:
+ protected:
   /// Config of projection
   ProjectionConfig config_;
   /// Parameter of projection
diff --git a/paddle/gserver/layers/ROIPoolLayer.h b/paddle/gserver/layers/ROIPoolLayer.h
index b1735e9748dc3956aade010f33303b55d4f9f439..801a9b3aebe6d718ea38b76246a6056891d0b1f6 100644
--- a/paddle/gserver/layers/ROIPoolLayer.h
+++ b/paddle/gserver/layers/ROIPoolLayer.h
@@ -33,7 +33,7 @@ namespace paddle {
  */
 
 class ROIPoolLayer : public Layer {
-protected:
+ protected:
   size_t channels_;
   size_t width_;
   size_t height_;
@@ -44,7 +44,7 @@ protected:
   // Since there is no int matrix, use real maxtrix instead.
   MatrixPtr maxIdxs_;
 
-public:
+ public:
   explicit ROIPoolLayer(const LayerConfig& config) : Layer(config) {}
 
   bool init(const LayerMap& layerMap,
diff --git a/paddle/gserver/layers/RecurrentLayer.h b/paddle/gserver/layers/RecurrentLayer.h
index 8fd4fe6b78ae6474f3cfcec605f25b72af8295bb..94e633e65777aad540738ea67ea1b4e03dd75954 100644
--- a/paddle/gserver/layers/RecurrentLayer.h
+++ b/paddle/gserver/layers/RecurrentLayer.h
@@ -40,7 +40,7 @@ namespace paddle {
  */
 
 class RecurrentLayer : public Layer {
-public:
+ public:
   explicit RecurrentLayer(const LayerConfig& config) : Layer(config) {}
 
   bool init(const LayerMap& layerMap,
@@ -56,7 +56,7 @@ public:
 
   LayerStatePtr getState() override;
 
-protected:
+ protected:
   /**
    * @brief If user do not set --rnn_use_batch=true, it will
    * compute rnn forward one sequence by one sequence in default.
@@ -110,7 +110,7 @@ protected:
                              size_t numSequences,
                              const int* starts);
 
-protected:
+ protected:
   std::unique_ptr<Weight> weight_;
   std::unique_ptr<Weight> bias_;
 
diff --git a/paddle/gserver/layers/RecurrentLayerGroup.cpp b/paddle/gserver/layers/RecurrentLayerGroup.cpp
index 44b57185c5a5fa7703ca477b990a73cdad2c2aa1..6694e8f2996fdd2c98da1507e5fb3b90b271c850 100644
--- a/paddle/gserver/layers/RecurrentLayerGroup.cpp
+++ b/paddle/gserver/layers/RecurrentLayerGroup.cpp
@@ -27,7 +27,7 @@ namespace paddle {
  * between RecurrentLayerGroupBegin and RecurrentLayerGroupEnd.
  */
 class RecurrentLayerGroup : public Layer {
-public:
+ public:
   explicit RecurrentLayerGroup(const LayerConfig& config) : Layer(config) {}
 
   void initSubNetwork(NeuralNetwork* rootNetwork,
@@ -58,7 +58,7 @@ public:
     callback(*network_);
   }
 
-private:
+ private:
   std::unique_ptr<RecurrentGradientMachine> network_;
 };
 
diff --git a/paddle/gserver/layers/ResizeLayer.cpp b/paddle/gserver/layers/ResizeLayer.cpp
index 831f4c3b7e103bc51d870cfa44616980adca08e8..d4ae9945934a40719d253d4b53915530423448af 100644
--- a/paddle/gserver/layers/ResizeLayer.cpp
+++ b/paddle/gserver/layers/ResizeLayer.cpp
@@ -24,7 +24,7 @@ namespace paddle {
  * resize matrix: (height * width / size) * size
  */
 class ResizeLayer : public Layer {
-public:
+ public:
   explicit ResizeLayer(const LayerConfig& config) : Layer(config) {}
 
   bool init(const LayerMap& layerMap,
diff --git a/paddle/gserver/layers/RotateLayer.h b/paddle/gserver/layers/RotateLayer.h
index 3b619921ab741e1236a495e497e18e265bd6e110..7ecbff20167dd95f782f2d61dc34697ab3273934 100644
--- a/paddle/gserver/layers/RotateLayer.h
+++ b/paddle/gserver/layers/RotateLayer.h
@@ -32,7 +32,7 @@ namespace paddle {
  */
 
 class RotateLayer : public Layer {
-public:
+ public:
   explicit RotateLayer(const LayerConfig& config) : Layer(config) {}
 
   bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
@@ -40,7 +40,7 @@ public:
   void forward(PassType passType);
   void backward(const UpdateCallback& callback = nullptr);
 
-private:
+ private:
   int batchSize_;
   int size_;
   int height_;
diff --git a/paddle/gserver/layers/RowConvLayer.h b/paddle/gserver/layers/RowConvLayer.h
index ba0af1de68a5f77d9ffefac6ef5193bb9d1b4f83..3b74df0b1af5caef1a1abd3d3c5b3ae3b67c429b 100644
--- a/paddle/gserver/layers/RowConvLayer.h
+++ b/paddle/gserver/layers/RowConvLayer.h
@@ -22,7 +22,7 @@ namespace paddle {
  * \brief Row Convolution Layer.
  */
 class RowConvLayer : public Layer {
-public:
+ public:
   explicit RowConvLayer(const LayerConfig& config) : Layer(config) {}
 
   ~RowConvLayer() {}
@@ -32,7 +32,7 @@ public:
   void forward(PassType passType) override;
   void backward(const UpdateCallback& callback = nullptr) override;
 
-protected:
+ protected:
   // Row convolution weight, context_lenght_ * fan_out.
   // fan_out is the size of output feature.
   std::unique_ptr<Weight> weight_;
diff --git a/paddle/gserver/layers/RowL2NormLayer.cpp b/paddle/gserver/layers/RowL2NormLayer.cpp
index 7ff0c9bae927cae2bc6a332bc0bde013e07edd0a..d5e6e10a0276adb74ec31c13d9e8acc77414a85b 100644
--- a/paddle/gserver/layers/RowL2NormLayer.cpp
+++ b/paddle/gserver/layers/RowL2NormLayer.cpp
@@ -26,12 +26,12 @@ namespace paddle {
  */
 
 class RowL2NormLayer : public Layer {
-protected:
+ protected:
   MatrixPtr inSquare_;
   MatrixPtr l2NormReciprocal_;
   MatrixPtr dotSum_;
 
-public:
+ public:
   explicit RowL2NormLayer(const LayerConfig& config) : Layer(config) {}
 
   bool init(const LayerMap& layerMap,
diff --git a/paddle/gserver/layers/SamplingIdLayer.cpp b/paddle/gserver/layers/SamplingIdLayer.cpp
index 2edd915d226edfd7e48df1a066d5a6f51f259511..dbce63588126c012e3b9713e8be749e0001ddec7 100644
--- a/paddle/gserver/layers/SamplingIdLayer.cpp
+++ b/paddle/gserver/layers/SamplingIdLayer.cpp
@@ -31,7 +31,7 @@ class SamplingIdLayer : public Layer {
   std::uniform_real_distribution<double> rand1_;
   std::vector<Argument> tmpCpuInput_;
 
-public:
+ public:
   explicit SamplingIdLayer(const LayerConfig& config)
       : Layer(config), rand1_(0, 1) {}
 
diff --git a/paddle/gserver/layers/ScaleShiftLayer.cpp b/paddle/gserver/layers/ScaleShiftLayer.cpp
index 799d1fe51a65da10bef637894931627315daf0a2..8af78a2e27d2b50572f8bdd6e98696f3d1967eb1 100644
--- a/paddle/gserver/layers/ScaleShiftLayer.cpp
+++ b/paddle/gserver/layers/ScaleShiftLayer.cpp
@@ -30,11 +30,11 @@ namespace paddle {
  */
 
 class ScaleShiftLayer : public Layer {
-protected:
+ protected:
   std::unique_ptr<Weight> scale_;
   std::unique_ptr<Weight> offset_;
 
-public:
+ public:
   explicit ScaleShiftLayer(const LayerConfig& config) : Layer(config) {}
 
   bool init(const LayerMap& layerMap,
diff --git a/paddle/gserver/layers/ScaleSubRegionLayer.h b/paddle/gserver/layers/ScaleSubRegionLayer.h
index 6e861be4858cfc21a42ef7293652d5cdf81be5f5..fe431698bc6cd5e52e2c545756b40be8b307e644 100644
--- a/paddle/gserver/layers/ScaleSubRegionLayer.h
+++ b/paddle/gserver/layers/ScaleSubRegionLayer.h
@@ -29,7 +29,7 @@ namespace paddle {
  *                  region.
  */
 class ScaleSubRegionLayer : public Layer {
-public:
+ public:
   explicit ScaleSubRegionLayer(const LayerConfig& config) : Layer(config) {}
 
   ~ScaleSubRegionLayer() {}
@@ -40,7 +40,7 @@ public:
 
   void backward(const UpdateCallback& callback = nullptr);
 
-protected:
+ protected:
   TensorShape shape_;
   TensorShape indicesShape_;
   size_t imgH_;
diff --git a/paddle/gserver/layers/ScalingLayer.cpp b/paddle/gserver/layers/ScalingLayer.cpp
index 1d98a7373d172d40cddc9b4611cb00434f17e00b..15e07daebee194a789da52d37a192e031348300c 100644
--- a/paddle/gserver/layers/ScalingLayer.cpp
+++ b/paddle/gserver/layers/ScalingLayer.cpp
@@ -32,7 +32,7 @@ namespace paddle {
  */
 
 class ScalingLayer : public Layer {
-public:
+ public:
   explicit ScalingLayer(const LayerConfig& config) : Layer(config) {}
 
   ~ScalingLayer() {}
diff --git a/paddle/gserver/layers/ScalingProjection.cpp b/paddle/gserver/layers/ScalingProjection.cpp
index 99b5b68f543842d23f20b626fddd66b677ebe059..4d871cafc4d0194a61044d76a766236209c33d47 100644
--- a/paddle/gserver/layers/ScalingProjection.cpp
+++ b/paddle/gserver/layers/ScalingProjection.cpp
@@ -17,7 +17,7 @@ limitations under the License. */
 namespace paddle {
 
 class ScalingProjection : public Projection {
-public:
+ public:
   ScalingProjection(const ProjectionConfig& config,
                     const ParameterPtr& parameter,
                     bool useGpu)
@@ -48,7 +48,7 @@ public:
     }
   }
 
-protected:
+ protected:
   std::unique_ptr<Weight> weight_;
 };
 
diff --git a/paddle/gserver/layers/SelectiveFullyConnectedLayer.h b/paddle/gserver/layers/SelectiveFullyConnectedLayer.h
index 81564074185a5d9fc80d4d3a64af998098ab5472..4b32ce8b162c2a8b1a6c34adc0885a7701f5f91e 100644
--- a/paddle/gserver/layers/SelectiveFullyConnectedLayer.h
+++ b/paddle/gserver/layers/SelectiveFullyConnectedLayer.h
@@ -33,11 +33,11 @@ namespace paddle {
  * The config file api is selective_fc_layer.
  */
 class SelectiveFullyConnectedLayer : public Layer {
-protected:
+ protected:
   WeightList weights_;
   std::unique_ptr<Weight> biases_;
 
-private:
+ private:
   /**
    * Get selected columns each forward.
    */
@@ -60,7 +60,7 @@ private:
   /// if true, means output_.value is the same as Fc Layer
   bool fullOutput_;
 
-public:
+ public:
   explicit SelectiveFullyConnectedLayer(const LayerConfig& config)
       : Layer(config), selCols_(nullptr) {}
 
@@ -94,7 +94,7 @@ public:
   void forward(PassType passType) override;
   void backward(const UpdateCallback& callback = nullptr) override;
 
-private:
+ private:
   /**
    * @brief Make SelectiveFC act as FullyConnectedLayer
    */
diff --git a/paddle/gserver/layers/SequenceConcatLayer.cpp b/paddle/gserver/layers/SequenceConcatLayer.cpp
index cf573f3f33fcd70c6768b164f158cb1f545414fc..c84c3ce4f080cc19f4937f04585accb5b2b347f9 100644
--- a/paddle/gserver/layers/SequenceConcatLayer.cpp
+++ b/paddle/gserver/layers/SequenceConcatLayer.cpp
@@ -29,10 +29,10 @@ namespace paddle {
  */
 
 class SequenceConcatLayer : public Layer {
-protected:
+ protected:
   std::unique_ptr<Weight> biases_;
 
-public:
+ public:
   explicit SequenceConcatLayer(const LayerConfig& config) : Layer(config) {}
 
   ~SequenceConcatLayer() {}
diff --git a/paddle/gserver/layers/SequenceLastInstanceLayer.cpp b/paddle/gserver/layers/SequenceLastInstanceLayer.cpp
index 6c4ae775c16ac76e237fb8f8ee5ec9ed8f11802e..28d0a9296d4accd4152e886ccae12a776fdb8f7f 100644
--- a/paddle/gserver/layers/SequenceLastInstanceLayer.cpp
+++ b/paddle/gserver/layers/SequenceLastInstanceLayer.cpp
@@ -38,12 +38,12 @@ namespace paddle {
  */
 
 class SequenceLastInstanceLayer : public SequencePoolLayer {
-protected:
+ protected:
   MatrixPtr tmpSrc_;
   MatrixPtr tmpDest_;
   std::vector<int> instanceIds_;
 
-public:
+ public:
   explicit SequenceLastInstanceLayer(const LayerConfig& config)
       : SequencePoolLayer(config) {}
 
diff --git a/paddle/gserver/layers/SequencePoolLayer.h b/paddle/gserver/layers/SequencePoolLayer.h
index 254e4cc6b3aacf21565cb03e5bdb52a2beb9fea8..01183060afd58376bb718dda64d8106cce4899f9 100644
--- a/paddle/gserver/layers/SequencePoolLayer.h
+++ b/paddle/gserver/layers/SequencePoolLayer.h
@@ -41,7 +41,7 @@ namespace paddle {
  */
 
 class SequencePoolLayer : public Layer {
-protected:
+ protected:
   int type_;
   std::unique_ptr<Weight> biases_;
   enum SequenceLevel { kNonSeq = 0, kSeq = 1 };
@@ -51,7 +51,7 @@ protected:
   // Whether the input sequence is reversed or not.
   bool reversed_ = false;
 
-public:
+ public:
   explicit SequencePoolLayer(const LayerConfig& config) : Layer(config) {}
 
   bool init(const LayerMap& layerMap,
diff --git a/paddle/gserver/layers/SequenceReshapeLayer.cpp b/paddle/gserver/layers/SequenceReshapeLayer.cpp
index fb96669917236b98809f1cda0d023600f1e76731..319310af8c4ac3bdefd814ad05b7fde6070f2340 100644
--- a/paddle/gserver/layers/SequenceReshapeLayer.cpp
+++ b/paddle/gserver/layers/SequenceReshapeLayer.cpp
@@ -29,12 +29,12 @@ namespace paddle {
  */
 
 class SequenceReshapeLayer : public Layer {
-protected:
+ protected:
   std::unique_ptr<Weight> biases_;
 
   MatrixPtr reshapedOutputGrad;
 
-public:
+ public:
   explicit SequenceReshapeLayer(const LayerConfig& config) : Layer(config) {}
 
   bool init(const LayerMap& layerMap,
diff --git a/paddle/gserver/layers/SequenceSliceLayer.cpp b/paddle/gserver/layers/SequenceSliceLayer.cpp
index 1b7c33477ea64c1cdb7c8e85d7a5302b299d7552..a6d810b583aab6e44faa583795686f06e17beeb9 100644
--- a/paddle/gserver/layers/SequenceSliceLayer.cpp
+++ b/paddle/gserver/layers/SequenceSliceLayer.cpp
@@ -21,7 +21,7 @@ limitations under the License. */
 namespace paddle {
 
 class SequenceSliceLayer : public Layer {
-public:
+ public:
   explicit SequenceSliceLayer(const LayerConfig& config) : Layer(config) {}
 
   bool init(const LayerMap& layerMap,
@@ -30,7 +30,7 @@ public:
   void forward(PassType passType) override;
   void backward(const UpdateCallback& callback = nullptr) override;
 
-private:
+ private:
   /*
    * TODO(caoying)
    * In PaddePaddle, currently all matrices are real number types,
diff --git a/paddle/gserver/layers/SequenceToBatch.h b/paddle/gserver/layers/SequenceToBatch.h
index 8743a5ef10f61970d3d48b105b9da29bcd10ba83..5200e702d9bc947746567c19ca7d552750828131 100644
--- a/paddle/gserver/layers/SequenceToBatch.h
+++ b/paddle/gserver/layers/SequenceToBatch.h
@@ -39,7 +39,7 @@ namespace paddle {
  *
  */
 class SequenceToBatch {
-public:
+ public:
   explicit SequenceToBatch(bool useGpu) : useGpu_(useGpu) {}
 
   /* resize and calculate the batchIndex_ */
@@ -82,7 +82,7 @@ public:
     numBatch_ = seq2batch.numBatch_;
   }
 
-protected:
+ protected:
   void sequence2BatchCopy(Matrix &batch,
                           Matrix &sequence,
                           IVector &seq2BatchIdx,
diff --git a/paddle/gserver/layers/SliceProjection.cpp b/paddle/gserver/layers/SliceProjection.cpp
index 5627ad1eb3a49a73261bc2197cbd3735489509d2..b474f2db759adfad337f9485a5a38588b6839c54 100644
--- a/paddle/gserver/layers/SliceProjection.cpp
+++ b/paddle/gserver/layers/SliceProjection.cpp
@@ -44,14 +44,14 @@ namespace paddle {
  * The config file api is slice_projection.
  */
 class SliceProjection : public Projection {
-public:
+ public:
   SliceProjection(const ProjectionConfig& config,
                   const ParameterPtr& parameter,
                   bool useGpu);
   virtual void forward();
   virtual void backward(const UpdateCallback& callback);
 
-protected:
+ protected:
   std::vector<std::pair<size_t, size_t>> slices_;
 };
 
diff --git a/paddle/gserver/layers/SlopeInterceptLayer.cpp b/paddle/gserver/layers/SlopeInterceptLayer.cpp
index c94a07e5da7442bba1ce7e9c09c4ffea3e5cd4ac..f7f4735c1b72d4ac6540714573fd7e15ef99ea5b 100644
--- a/paddle/gserver/layers/SlopeInterceptLayer.cpp
+++ b/paddle/gserver/layers/SlopeInterceptLayer.cpp
@@ -36,7 +36,7 @@ namespace paddle {
  */
 
 class SlopeInterceptLayer : public Layer {
-public:
+ public:
   explicit SlopeInterceptLayer(const LayerConfig& config) : Layer(config) {}
 
   bool init(const LayerMap& layerMap,
diff --git a/paddle/gserver/layers/SpatialPyramidPoolLayer.h b/paddle/gserver/layers/SpatialPyramidPoolLayer.h
index 6cb5fdf83e2b88ce4adb392807a1fdbac253c51c..421bdfe09c46f656f500daff195c755274bf8bb7 100644
--- a/paddle/gserver/layers/SpatialPyramidPoolLayer.h
+++ b/paddle/gserver/layers/SpatialPyramidPoolLayer.h
@@ -29,7 +29,7 @@ namespace paddle {
  */
 
 class SpatialPyramidPoolLayer : public Layer {
-protected:
+ protected:
   size_t channels_;
   size_t imgSizeW_;
   size_t imgSizeH_;
@@ -40,7 +40,7 @@ protected:
   std::vector<Argument> projOutput_;
   std::vector<std::pair<size_t, size_t>> projCol_;
 
-public:
+ public:
   explicit SpatialPyramidPoolLayer(const LayerConfig& config) : Layer(config) {}
 
   bool init(const LayerMap& layerMap,
diff --git a/paddle/gserver/layers/SubNestedSequenceLayer.cpp b/paddle/gserver/layers/SubNestedSequenceLayer.cpp
index db240ab0c96510263d90b291f6396ac51a73fbbd..e2bb00bbfacb26dc736a63877119b379f22b5983 100644
--- a/paddle/gserver/layers/SubNestedSequenceLayer.cpp
+++ b/paddle/gserver/layers/SubNestedSequenceLayer.cpp
@@ -21,7 +21,7 @@ limitations under the License. */
 namespace paddle {
 
 class SubNestedSequenceLayer : public Layer {
-public:
+ public:
   explicit SubNestedSequenceLayer(const LayerConfig& config) : Layer(config) {}
 
   bool init(const LayerMap& layerMap,
@@ -30,7 +30,7 @@ public:
   void forward(PassType passType) override;
   void backward(const UpdateCallback& callback = nullptr) override;
 
-private:
+ private:
   /*
    * This functions generates the indices of rows in a batch according to the
    * indices of selected sub-sequence in each sequence.
diff --git a/paddle/gserver/layers/SubSequenceLayer.cpp b/paddle/gserver/layers/SubSequenceLayer.cpp
index 808627f09273950bb6f52a4a6e497bcb8ea170f7..ba49f5710f9d0bb985cf1e80d5c4a972d8f046a6 100644
--- a/paddle/gserver/layers/SubSequenceLayer.cpp
+++ b/paddle/gserver/layers/SubSequenceLayer.cpp
@@ -27,12 +27,12 @@ namespace paddle {
  */
 
 class SubSequenceLayer : public Layer {
-protected:
+ protected:
   std::unique_ptr<Weight> biases_;
   MatrixPtr tmpSrc_;
   MatrixPtr tmpDest_;
 
-public:
+ public:
   explicit SubSequenceLayer(const LayerConfig& config) : Layer(config) {}
 
   bool init(const LayerMap& layerMap,
diff --git a/paddle/gserver/layers/SumToOneNormLayer.cpp b/paddle/gserver/layers/SumToOneNormLayer.cpp
index ffbe14925300ad1ffbd33f43a6c0afadddd231e6..00764717e8b6be30230e44626974033e929352da 100644
--- a/paddle/gserver/layers/SumToOneNormLayer.cpp
+++ b/paddle/gserver/layers/SumToOneNormLayer.cpp
@@ -32,13 +32,13 @@ namespace paddle {
  */
 
 class SumToOneNormLayer : public Layer {
-protected:
+ protected:
   /// reciprocalRowSum_ = \f$1 / \sum_{k=1}^N in[k]\f$
   MatrixPtr reciprocalRowSum_;
   /// dotSum = output_.grad \f$.*\f$ output_.value
   MatrixPtr dotSum_;
 
-public:
+ public:
   explicit SumToOneNormLayer(const LayerConfig& config) : Layer(config) {}
 
   bool init(const LayerMap& layerMap,
diff --git a/paddle/gserver/layers/SwitchOrderLayer.h b/paddle/gserver/layers/SwitchOrderLayer.h
index 882437f4434c2e61a5b08328d2f79c1e7f589204..8a551a2bba698374841e73dc4dbad403034dd300 100644
--- a/paddle/gserver/layers/SwitchOrderLayer.h
+++ b/paddle/gserver/layers/SwitchOrderLayer.h
@@ -22,7 +22,7 @@ namespace paddle {
  * \brief  This layer calculate softmax in image channel dimension.
  */
 class SwitchOrderLayer : public Layer {
-public:
+ public:
   explicit SwitchOrderLayer(const LayerConfig& config) : Layer(config) {}
 
   ~SwitchOrderLayer() {}
@@ -34,7 +34,7 @@ public:
   void setInDims();
   void setOutDims();
 
-protected:
+ protected:
   std::vector<std::shared_ptr<FunctionBase>> nchw2nhwc_;
   std::vector<std::shared_ptr<FunctionBase>> nhwc2nchw_;
   TensorShape inDims_;
diff --git a/paddle/gserver/layers/TableProjection.h b/paddle/gserver/layers/TableProjection.h
index ffb05e68f068a7b9abb0db5cea6133e64300cb55..60286149f4227fbc758dca7864c6d1f67782c7ae 100644
--- a/paddle/gserver/layers/TableProjection.h
+++ b/paddle/gserver/layers/TableProjection.h
@@ -32,7 +32,7 @@ namespace paddle {
  * @note If \f$ids[i] = -1\f$, it will be ignored.
  */
 class TableProjection : public Projection {
-public:
+ public:
   TableProjection(const ProjectionConfig& config,
                   const ParameterPtr& parameter,
                   bool useGpu);
@@ -43,7 +43,7 @@ public:
   virtual void forward();
   virtual void backward(const UpdateCallback& callback);
 
-protected:
+ protected:
   std::unique_ptr<Weight> table_;
 };
 
diff --git a/paddle/gserver/layers/TensorLayer.h b/paddle/gserver/layers/TensorLayer.h
index 8a323aa15f6f3761c45b6ca7e3be8f15621a189e..5c1ee40ceda9387138a82368ec4edcbae4bd3419 100644
--- a/paddle/gserver/layers/TensorLayer.h
+++ b/paddle/gserver/layers/TensorLayer.h
@@ -37,11 +37,11 @@ namespace paddle {
  */
 
 class TensorLayer : public Layer {
-protected:
+ protected:
   WeightList weights_;
   std::unique_ptr<Weight> biases_;
 
-public:
+ public:
   explicit TensorLayer(const LayerConfig& config) : Layer(config) {}
 
   bool init(const LayerMap& layerMap,
diff --git a/paddle/gserver/layers/TransLayer.h b/paddle/gserver/layers/TransLayer.h
index 03d094862459c80aee8899c0352ffce732db08af..1cd8fd91f785d5a43fc7d7663e657702b32fa534 100644
--- a/paddle/gserver/layers/TransLayer.h
+++ b/paddle/gserver/layers/TransLayer.h
@@ -29,7 +29,7 @@ namespace paddle {
  * The config file api is trans_layer.
  */
 class TransLayer : public Layer {
-public:
+ public:
   explicit TransLayer(const LayerConfig& config) : Layer(config) {}
 
   bool init(const LayerMap& layerMap,
diff --git a/paddle/gserver/layers/TransposedFullMatrixProjection.cpp b/paddle/gserver/layers/TransposedFullMatrixProjection.cpp
index 755389f7074c252c0fad396e629c6ffedc74b531..45f59779896f993aface284e3485e1e3d801f4c5 100644
--- a/paddle/gserver/layers/TransposedFullMatrixProjection.cpp
+++ b/paddle/gserver/layers/TransposedFullMatrixProjection.cpp
@@ -24,14 +24,14 @@ namespace paddle {
  * The config file api is trans_full_matrix_projection.
  */
 class TransposedFullMatrixProjection : public Projection {
-public:
+ public:
   TransposedFullMatrixProjection(const ProjectionConfig& config,
                                  ParameterPtr parameter,
                                  bool useGPu);
   virtual void forward();
   virtual void backward(const UpdateCallback& callback);
 
-protected:
+ protected:
   std::unique_ptr<Weight> weight_;
 };
 
diff --git a/paddle/gserver/layers/UpsampleLayer.h b/paddle/gserver/layers/UpsampleLayer.h
index 25efbac5e9e6e92653f7c2b2f4dca9221737e5d6..c9d079c3141c37517866bfdad10d9b2cdb89f7d5 100644
--- a/paddle/gserver/layers/UpsampleLayer.h
+++ b/paddle/gserver/layers/UpsampleLayer.h
@@ -30,7 +30,7 @@ namespace paddle {
  */
 
 class UpsampleLayer : public Layer {
-public:
+ public:
   explicit UpsampleLayer(const LayerConfig& config) : Layer(config) {}
   ~UpsampleLayer() {}
 
@@ -42,7 +42,7 @@ public:
 
   size_t getOutputSize();
 
-protected:
+ protected:
   size_t scale_, scaleY_;
   size_t upsampleSize_, upsampleSizeY_;
   size_t padOutX_, padOutY_;
diff --git a/paddle/gserver/layers/ValidationLayer.h b/paddle/gserver/layers/ValidationLayer.h
index f412d685c0541537bd4318fec2dae06215c4afbe..be41128ef4530f32a63c757648c2f393fd118ea6 100644
--- a/paddle/gserver/layers/ValidationLayer.h
+++ b/paddle/gserver/layers/ValidationLayer.h
@@ -23,7 +23,7 @@ DECLARE_int32(trainer_id);
 namespace paddle {
 
 class ValidationLayer : public Layer {
-public:
+ public:
   explicit ValidationLayer(const LayerConfig& config) : Layer(config) {}
 
   bool init(const LayerMap& layerMap,
@@ -51,7 +51,7 @@ public:
  * AucValidation
  */
 class AucValidation : public ValidationLayer {
-public:
+ public:
   explicit AucValidation(const LayerConfig& config)
       : ValidationLayer(config),
         cpuOutput_(nullptr),
@@ -72,7 +72,7 @@ public:
   };
   std::vector<PredictionResult> predictArray_;
 
-private:
+ private:
   bool passBegin_;
   std::unique_ptr<Evaluator> evaluator_;
   MatrixPtr cpuOutput_;
@@ -84,7 +84,7 @@ private:
  * positive-negative pair rate Validation
  */
 class PnpairValidation : public ValidationLayer {
-public:
+ public:
   explicit PnpairValidation(const LayerConfig& config)
       : ValidationLayer(config) {}
 
@@ -95,7 +95,7 @@ public:
 
   void onPassEnd() override;
 
-private:
+ private:
   bool passBegin_;
   std::unique_ptr<Evaluator> evaluator_;
 };
diff --git a/paddle/gserver/layers/WarpCTCLayer.h b/paddle/gserver/layers/WarpCTCLayer.h
index 6f6be359c0aa46a4f3775f8405e1aa51ca1ae147..3017ca794ecc14f5a3cbd0b302a4953a191a5065 100644
--- a/paddle/gserver/layers/WarpCTCLayer.h
+++ b/paddle/gserver/layers/WarpCTCLayer.h
@@ -26,7 +26,7 @@ namespace paddle {
  * The config file api is warp_ctc_layer.
  */
 class WarpCTCLayer : public Layer {
-public:
+ public:
   explicit WarpCTCLayer(const LayerConfig& config) : Layer(config) {}
   ~WarpCTCLayer() {}
 
@@ -35,7 +35,7 @@ public:
   void forward(PassType passType) override;
   void backward(const UpdateCallback& callback) override;
 
-protected:
+ protected:
   /**
    * sequence matrix and batch matrix copy:
    * sequence (s0, s0, s0, s0; s1, s1; s2, s2, s2; s3)
@@ -49,7 +49,7 @@ protected:
                         const ICpuGpuVectorPtr& seqStartPositions,
                         bool normByTimes);
 
-protected:
+ protected:
   size_t numClasses_;
   size_t blank_;
   size_t maxSequenceLength_;
diff --git a/paddle/gserver/tests/MKLDNNTester.h b/paddle/gserver/tests/MKLDNNTester.h
index c1faa6fd90e06d8c742e97c9ce51eeba3c24a550..41ac46b70ab08d4071f4e6abfca94667268015d7 100644
--- a/paddle/gserver/tests/MKLDNNTester.h
+++ b/paddle/gserver/tests/MKLDNNTester.h
@@ -44,7 +44,7 @@ class MKLDNNTester {
     std::vector<VectorPtr> paraValues;
   };
 
-protected:
+ protected:
   std::vector<TestConfig> configs_;
   vector<string> layerNames_;
   vector<vector<DataLayerPtr>> dataLayers_;
@@ -65,7 +65,7 @@ protected:
   /// passType, PASS_TRAIN, PASS_TEST or PASS_GC (Gradient Check pass)
   PassType passType_;
 
-public:
+ public:
   explicit MKLDNNTester(size_t iter = 3, float epsilon = 1e-4) {
     iter_ = iter;
     eps_ = epsilon;
@@ -75,7 +75,7 @@ public:
 
   ~MKLDNNTester() {}
 
-public:
+ public:
   void run(const TestConfig& dnn,
            const TestConfig& ref,
            size_t batchSize,
@@ -97,7 +97,7 @@ public:
                            bool use_mkldnn,
                            size_t iter = 2);
 
-private:
+ private:
   void reset(const TestConfig& dnn, const TestConfig& ref, size_t batchSize);
   void setInputImgSize();
   void runOnce();
diff --git a/paddle/gserver/tests/test_MultinomialSampler.cpp b/paddle/gserver/tests/test_MultinomialSampler.cpp
index 4a295ea9d51788f988fe79f8439cc7769f661d8e..043025239e744601cbef3ca5c241509872963bd8 100644
--- a/paddle/gserver/tests/test_MultinomialSampler.cpp
+++ b/paddle/gserver/tests/test_MultinomialSampler.cpp
@@ -27,7 +27,7 @@ using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT
 
 class MultinomialSamplerTester : public MultinomialSampler {
-public:
+ public:
   MultinomialSamplerTester(real* prob, int size)
       : MultinomialSampler(prob, size) {}
 
diff --git a/paddle/gserver/tests/test_RecurrentGradientMachine.cpp b/paddle/gserver/tests/test_RecurrentGradientMachine.cpp
index 72324fcf29cc60867005da25b35a8075fd590a89..9770567b88a2af946b30439300540ed61694ba10 100644
--- a/paddle/gserver/tests/test_RecurrentGradientMachine.cpp
+++ b/paddle/gserver/tests/test_RecurrentGradientMachine.cpp
@@ -26,7 +26,7 @@ DECLARE_int32(seed);
 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT
 class TrainerForTest : public paddle::Trainer {
-public:
+ public:
   void startTrain() {
     GradientMachine& gm = *this->trainerInternal_.getGradientMachine();
     gm.start();
diff --git a/paddle/gserver/tests/test_RecurrentLayer.cpp b/paddle/gserver/tests/test_RecurrentLayer.cpp
index e5ce922f15749cb18b93f64e0e08f437c5633065..b54e37b7dbf8bffeb949f709e6a4f9ec86ea13c3 100644
--- a/paddle/gserver/tests/test_RecurrentLayer.cpp
+++ b/paddle/gserver/tests/test_RecurrentLayer.cpp
@@ -225,7 +225,7 @@ TEST(Layer, RecurrentLayer) {
 #include "paddle/gserver/layers/RecurrentLayer.h"
 template <class T>
 class TestRecurrentLayer {
-public:
+ public:
   LayerConfig config_;
   bool useGpu_;
   bool useBatch_;
diff --git a/paddle/math/Allocator.h b/paddle/math/Allocator.h
index ae60f6fe5fa142bdffeafc31b5816b8fcc94ad5c..c43a83891eb6b7eae278169736149ad1d89e950e 100644
--- a/paddle/math/Allocator.h
+++ b/paddle/math/Allocator.h
@@ -27,7 +27,7 @@ namespace paddle {
  * This is the base class of all Allocator class.
  */
 class Allocator {
-public:
+ public:
   virtual ~Allocator() {}
   virtual void* alloc(size_t size) = 0;
   virtual void free(void* ptr) = 0;
@@ -38,7 +38,7 @@ public:
  * @brief CPU allocator implementation.
  */
 class CpuAllocator : public Allocator {
-public:
+ public:
   ~CpuAllocator() {}
 
   /**
@@ -76,7 +76,7 @@ public:
  * @brief GPU allocator implementation.
  */
 class GpuAllocator : public Allocator {
-public:
+ public:
   ~GpuAllocator() {}
 
   /**
@@ -107,7 +107,7 @@ public:
  * @brief CPU pinned memory allocator implementation.
  */
 class CudaHostAllocator : public Allocator {
-public:
+ public:
   ~CudaHostAllocator() {}
 
   /**
diff --git a/paddle/math/BaseMatrix.h b/paddle/math/BaseMatrix.h
index 00ce5a19491048f3339d608ac37669816a9ad3f5..1958629aa0354fcc332b1e5677a64c29397e0d26 100644
--- a/paddle/math/BaseMatrix.h
+++ b/paddle/math/BaseMatrix.h
@@ -43,7 +43,7 @@ typedef bool_constant<bool, true> true_type;
   address += row * ld + col;
 
 class MatrixOffset {
-public:
+ public:
   size_t aCol_;
   size_t aRow_;
   size_t bCol_;
@@ -72,14 +72,14 @@ public:
 
 template <class T>
 class BaseMatrixT : public TensorExpression<BaseMatrixT<T>, T> {
-public:
+ public:
   size_t height_, width_;
   size_t stride_;
   T* data_;
   bool trans_;
   bool useGpu_;
 
-public:
+ public:
   virtual ~BaseMatrixT() {}
   BaseMatrixT(size_t height, size_t width, T* data, bool trans, bool useGpu)
       : height_(height),
diff --git a/paddle/math/CMakeLists.txt b/paddle/math/CMakeLists.txt
index 922fb5172273da24f9c48786961a6d850b1ed7c5..3c897b5f3e09cd53ddd5b767333ce4759250da71 100644
--- a/paddle/math/CMakeLists.txt
+++ b/paddle/math/CMakeLists.txt
@@ -51,10 +51,6 @@ else()
 endif()
 
 
-
-add_style_check_target(paddle_math ${MATH_SOURCES})
-add_style_check_target(paddle_math ${MATH_HEADERS})
-
 add_dependencies(paddle_math paddle_proto ${external_project_dependencies})  # depends
 if(WITH_TESTING)
     add_subdirectory(tests)
diff --git a/paddle/math/CpuSparseMatrix.h b/paddle/math/CpuSparseMatrix.h
index 22b6b71688bd555cf8bf8a29088ad01b092d67cf..172792c2950ce56281715cb7f3eb076da252d77e 100644
--- a/paddle/math/CpuSparseMatrix.h
+++ b/paddle/math/CpuSparseMatrix.h
@@ -22,7 +22,7 @@ limitations under the License. */
 namespace paddle {
 
 class CpuSparseMatrix : public Matrix {
-public:
+ public:
   CpuSparseMatrix(size_t height,
                   size_t width,
                   size_t nnz, /* used to allocate space */
@@ -291,10 +291,10 @@ public:
     LOG(FATAL) << "not supported!";
   }
 
-private:
+ private:
   MatrixPtr clone(size_t height = 0, size_t width = 0, bool useGpu = false);
 
-protected:
+ protected:
   void sparseResize();
   /*for csr , record row start position, for csc, record row index for every no
    * zero value*/
@@ -310,10 +310,10 @@ protected:
   static ThreadLocal<std::vector<CpuSparseMatrixPtr>> cpuLocalMats_;
 
   // BaseMatrixT interface
-public:
+ public:
   bool isSparse() const { return true; }
 
-private:
+ private:
   using Matrix::mul;
   using Matrix::copyFrom;
   using Matrix::rowMax;
@@ -329,7 +329,7 @@ private:
 namespace paddle {
 
 class CpuSparseMatrix : public Matrix {
-public:
+ public:
   CpuSparseMatrix(size_t height,
                   size_t width,
                   size_t nnz, /* used to allocate space */
diff --git a/paddle/math/ExecViaCpu.h b/paddle/math/ExecViaCpu.h
index 9b2a3c2b8accd384aac896e86ef8315a744633e1..ec2337545e9e3efdf31d3d786a096a67283715f2 100644
--- a/paddle/math/ExecViaCpu.h
+++ b/paddle/math/ExecViaCpu.h
@@ -31,17 +31,17 @@ namespace paddle {
 
 template <typename Arg>
 class CopyToCpu {
-public:
+ public:
   explicit CopyToCpu(Arg& arg) : arg_(arg) {}
   Arg& copiedArg() const { return arg_; }
 
-private:
+ private:
   Arg& arg_;
 };
 
 template <>
 class CopyToCpu<Matrix> {
-public:
+ public:
   explicit CopyToCpu(Matrix& arg) : arg_(arg) {
     if (arg.useGpu()) {
       CHECK(!arg.isTransposed()) << "Not supported";
@@ -59,14 +59,14 @@ public:
   }
   Matrix& copiedArg() const { return copied_ ? *copied_ : arg_; }
 
-private:
+ private:
   Matrix& arg_;
   MatrixPtr copied_;
 };
 
 template <>
 class CopyToCpu<const Matrix> {
-public:
+ public:
   explicit CopyToCpu(const Matrix& arg) : arg_(arg) {
     if (arg.useGpu()) {
       CHECK(!arg.isTransposed()) << "Not supported";
@@ -79,14 +79,14 @@ public:
   }
   const Matrix& copiedArg() const { return copied_ ? *copied_ : arg_; }
 
-private:
+ private:
   const Matrix& arg_;
   MatrixPtr copied_;
 };
 
 template <>
 class CopyToCpu<IVector> {
-public:
+ public:
   explicit CopyToCpu(IVector& arg) : arg_(arg) {
     if (arg.useGpu()) {
       copied_ = IVector::create(arg.getSize(), /* useGpu= */ false);
@@ -100,14 +100,14 @@ public:
   }
   IVector& copiedArg() const { return copied_ ? *copied_ : arg_; }
 
-private:
+ private:
   IVector& arg_;
   IVectorPtr copied_;
 };
 
 template <>
 class CopyToCpu<const IVector> {
-public:
+ public:
   explicit CopyToCpu(const IVector& arg) : arg_(arg) {
     if (arg.useGpu()) {
       copied_ = IVector::create(arg.getSize(), /* useGpu= */ false);
@@ -116,7 +116,7 @@ public:
   }
   const IVector& copiedArg() const { return copied_ ? *copied_ : arg_; }
 
-private:
+ private:
   const IVector& arg_;
   IVectorPtr copied_;
 };
@@ -128,7 +128,7 @@ class GpuFuncWrapperImp;
 
 template <typename F, typename R, typename... Args>
 class GpuFuncWrapperBase {
-public:
+ public:
   typedef R ResultType;
   R operator()(F&& f, Args... args) {
     return f(CopyToCpu<typename std::remove_reference<Args>::type>(args)
diff --git a/paddle/math/MKLDNNMatrix.h b/paddle/math/MKLDNNMatrix.h
index e1fb81679adf4658a58ceee73c8d5da6c0b61050..d4a78f3e54b73add3c00e17f13d91359839d3d14 100644
--- a/paddle/math/MKLDNNMatrix.h
+++ b/paddle/math/MKLDNNMatrix.h
@@ -35,7 +35,7 @@ typedef std::shared_ptr<MKLDNNMatrix> MKLDNNMatrixPtr;
  *
  */
 class MKLDNNMatrix : public CpuMatrix, public mkldnn::memory {
-public:
+ public:
   MKLDNNMatrix(CpuMatrixPtr m, mkldnn::memory::primitive_desc pd)
       : CpuMatrix(m->getData(), m->getHeight(), m->getWidth(), false),
         mkldnn::memory(pd, m->getData()),
@@ -107,7 +107,7 @@ public:
     dst.copyFrom(*m_);
   }
 
-public:
+ public:
   /**
    * Reorder this MKLDNNMatrix from other format.
    * Support inplace reorder.
@@ -226,7 +226,7 @@ public:
    */
   mkldnn::engine getEngine() { return getPrimitiveDesc().get_engine(); }
 
-protected:
+ protected:
   /**
    * Do reorder once.
    * Can support inplace.
@@ -248,7 +248,7 @@ protected:
     set_data_handle(data);
   }
 
-private:
+ private:
   // save the CpuMatrixPtr in case the buffer released outside
   CpuMatrixPtr m_;
 };
diff --git a/paddle/math/MathFunctions.cpp b/paddle/math/MathFunctions.cpp
index de404cad89fba8021b8645a40e25c1f5b7e86596..f48119aa511578b21602a225277f01b4c6a9e9a8 100644
--- a/paddle/math/MathFunctions.cpp
+++ b/paddle/math/MathFunctions.cpp
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "MathFunctions.h"
+#include "paddle/math/MathFunctions.h"
 #include "hl_matrix_apply.cuh"
 #include "hl_matrix_ops.cuh"
 #include "paddle/utils/DynamicLoader.h"
@@ -240,6 +240,36 @@ template <>
 void vAdd<double>(const int n, const double* a, const double* b, double* r) {
   vdAdd(n, a, b, r);
 }
+
+template <>
+void vTanh<float>(const int n, const float* a, float* r) {
+  vsTanh(n, a, r);
+}
+
+template <>
+void vTanh<double>(const int n, const double* a, double* r) {
+  vdTanh(n, a, r);
+}
+
+template <>
+void vInvSqrt<float>(const int n, const float* a, float* r) {
+  vsInvSqrt(n, a, r);
+}
+
+template <>
+void vInvSqrt<double>(const int n, const double* a, double* r) {
+  vdInvSqrt(n, a, r);
+}
+
+template <>
+void vLog1p<float>(const int n, const float* a, float* r) {
+  vsLog1p(n, a, r);
+}
+
+template <>
+void vLog1p<double>(const int n, const double* a, double* r) {
+  vdLog1p(n, a, r);
+}
 #else
 
 DEFINE_MATRIX_BINARY_OP(vExp, b = std::exp(a));
@@ -277,17 +307,6 @@ void vAdd(const int n, const T* a, const T* b, T* r) {
                                                      n);
 }
 
-template void vExp(const int n, const float* a, float* r);
-template void vExp(const int n, const double* a, double* r);
-template void vLog(const int n, const float* a, float* r);
-template void vLog(const int n, const double* a, double* r);
-template void vPow(const int n, const float* a, const float b, float* r);
-template void vPow(const int n, const double* a, const double b, double* r);
-template void vAdd(const int n, const float* a, const float* b, float* r);
-template void vAdd(const int n, const double* a, const double* b, double* r);
-
-#endif
-
 DEFINE_MATRIX_BINARY_OP(vInvSqrt, b = 1.0f / std::sqrt(a));
 template <class T>
 void vInvSqrt(const int n, const T* a, T* r) {
@@ -311,11 +330,19 @@ void vTanh(const int n, const T* a, T* r) {
       binary::vTanh<T>(), const_cast<T*>(a), r, 1, n, n, n);
 }
 
+template void vExp(const int n, const float* a, float* r);
+template void vExp(const int n, const double* a, double* r);
+template void vLog(const int n, const float* a, float* r);
+template void vLog(const int n, const double* a, double* r);
+template void vPow(const int n, const float* a, const float b, float* r);
+template void vPow(const int n, const double* a, const double b, double* r);
+template void vAdd(const int n, const float* a, const float* b, float* r);
+template void vAdd(const int n, const double* a, const double* b, double* r);
 template void vInvSqrt(const int n, const double* a, double* r);
 template void vInvSqrt(const int n, const float* a, float* r);
 template void vLog1p(const int n, const float* a, float* r);
 template void vLog1p(const int n, const double* a, double* r);
 template void vTanh(const int n, const float* a, float* r);
 template void vTanh(const int n, const double* a, double* r);
-
+#endif
 }  // namespace paddle
diff --git a/paddle/math/Matrix.h b/paddle/math/Matrix.h
index 04e9614eabc47c4c661ace2106e8ca96f45a1d49..4c3b2c95361065372f5969a2da73bce0eb9d123f 100644
--- a/paddle/math/Matrix.h
+++ b/paddle/math/Matrix.h
@@ -77,7 +77,7 @@ typedef std::shared_ptr<CpuSparseMatrix> CpuSparseMatrixPtr;
  * instead.
  */
 class Matrix : public BaseMatrix {
-protected:
+ protected:
   Matrix(MemoryHandlePtr memHandle,
          size_t height,
          size_t width,
@@ -95,11 +95,11 @@ protected:
 
   static ThreadLocal<MatrixPtr> tmpMat_;
 
-public:
+ public:
   size_t elementCnt_;  // maximal number of elements which can be held in data_
   MemoryHandlePtr memoryHandle_;
 
-public:
+ public:
   virtual ~Matrix() {}
 
   static MatrixPtr create(MemoryHandlePtr memHandle,
@@ -412,7 +412,7 @@ public:
     LOG(FATAL) << "Not implemented";
   }
 
-public:
+ public:
   /// Only set all variables to 0 or NULL but not free them.
   virtual void clear() {
     height_ = 0;
@@ -1228,7 +1228,7 @@ inline std::ostream& operator<<(std::ostream& os, const Matrix& mat) {
 }
 
 class GpuMatrix : public Matrix {
-public:
+ public:
   GpuMatrix();
 
   GpuMatrix(size_t height, size_t width, bool trans = false);
@@ -1660,11 +1660,11 @@ public:
 };
 
 class CpuMatrix : public Matrix {
-private:
+ private:
   MatrixPtr sftmaxSum_;
   MatrixPtr sftmaxDot_;
 
-public:
+ public:
   CpuMatrix(size_t height, size_t width, bool trans = false);
   CpuMatrix(real* data, size_t height, size_t width, bool trans = false)
       : Matrix(data, height, width, trans, false) {}
@@ -1892,7 +1892,7 @@ public:
   real* getRow(size_t row) { return BaseMatrix::rowBuf(row); }
   virtual real* getRowBuf(size_t row) { return getRow(row); }
 
-public:
+ public:
   /// add b to each sample of this.
   void addBias(Matrix& b, real scale);
   void addSharedBias(Matrix& b, real scale);
@@ -2128,7 +2128,7 @@ public:
 };
 
 class SharedCpuMatrix : public CpuMatrix {
-public:
+ public:
 #ifndef PADDLE_MOBILE_INFERENCE
   /* blockNum is number of partitions of the matrix  */
   SharedCpuMatrix(int blockNum, size_t height, size_t width, bool trans = false)
@@ -2160,12 +2160,12 @@ public:
 
   ~SharedCpuMatrix() {}
 
-public:
+ public:
   virtual void mul(CpuSparseMatrix* a, CpuMatrix* b, real scaleAB, real scaleT);
   virtual void add(Matrix& b, real p1, real p2);
   virtual void add(real p1, real p2);
 
-private:
+ private:
   using Matrix::mul;
   void initShared(int blockNum);
   void initBlock(int blockNum);
diff --git a/paddle/math/MatrixBitCode.cpp b/paddle/math/MatrixBitCode.cpp
index 61a9923bc2e6f358738f80de4a30d83c0cc00656..f7a949294b54a5a874e1239a13ca9dce3ba18e94 100644
--- a/paddle/math/MatrixBitCode.cpp
+++ b/paddle/math/MatrixBitCode.cpp
@@ -27,7 +27,7 @@ struct SimpleCode {
   inline bool calcBit(int bit) const { return c_ & (1 << bit); }
   inline int getLength() const { return findLastSet(c_) - 1; }
 
-private:
+ private:
   size_t c_;
 };
 
@@ -39,7 +39,7 @@ struct SimpleCodeTable {
   size_t size() const { return numClasses_; }
   int getMaxCodeLength() const { return findLastSet(numClasses_ - 1); }
 
-private:
+ private:
   size_t numClasses_;
   int maxCodeLength_;
 };
diff --git a/paddle/math/MemoryHandle.h b/paddle/math/MemoryHandle.h
index 03ee413c1218376635c4696ebb774c584aa67aa4..516e09dbed47ac6b039ccb094614c9588eeb3cd5 100644
--- a/paddle/math/MemoryHandle.h
+++ b/paddle/math/MemoryHandle.h
@@ -20,16 +20,16 @@ limitations under the License. */
 namespace paddle {
 
 class MemoryHandle {
-protected:
+ protected:
   explicit MemoryHandle(size_t size);
   virtual ~MemoryHandle() {}
 
-public:
+ public:
   void* getBuf() const { return buf_; }
   size_t getSize() const { return size_; }
   size_t getAllocSize() const { return allocSize_; }
 
-protected:
+ protected:
   PoolAllocator* allocator_;
   size_t size_;       // the requested size
   size_t allocSize_;  // the allocated size
@@ -43,7 +43,7 @@ protected:
  * The raw handle will be released at destructor
  */
 class GpuMemoryHandle : public MemoryHandle {
-public:
+ public:
   explicit GpuMemoryHandle(size_t size);
   virtual ~GpuMemoryHandle();
 };
@@ -54,7 +54,7 @@ public:
  * The raw handle will be released at destructor
  */
 class CpuMemoryHandle : public MemoryHandle {
-public:
+ public:
   explicit CpuMemoryHandle(size_t size);
   virtual ~CpuMemoryHandle();
 };
diff --git a/paddle/math/PoolAllocator.h b/paddle/math/PoolAllocator.h
index 90141fef3fd43fe221874cc50e688f6db9e2dee6..7239cf1c4494e207081e325a7e6067ba26a9c852 100644
--- a/paddle/math/PoolAllocator.h
+++ b/paddle/math/PoolAllocator.h
@@ -27,7 +27,7 @@ namespace paddle {
  * @brief Memory pool allocator implementation.
  */
 class PoolAllocator {
-public:
+ public:
   /**
    * @brief constructor.
    * @param allocator a Allocator object.
@@ -47,7 +47,7 @@ public:
   void free(void* ptr, size_t size);
   std::string getName() { return name_; }
 
-private:
+ private:
   void freeAll();
   void printAll();
   std::unique_ptr<Allocator> allocator_;
diff --git a/paddle/math/RowBuffer.h b/paddle/math/RowBuffer.h
index 2e4d11a86bf8bd1308b2972f549bc7c201044785..6950afaa21d60615b27c06a151b0afbb296653bf 100644
--- a/paddle/math/RowBuffer.h
+++ b/paddle/math/RowBuffer.h
@@ -26,7 +26,7 @@ namespace paddle {
  * If not set memory handler, then the data could be auto growth.
  */
 class RowBuffer {
-public:
+ public:
   /**
    * @brief RowBuffer create a auto-growth row buffer. The row length is width.
    * @param width the length of each row, a.k.a matrix width.
@@ -129,7 +129,7 @@ public:
    */
   inline size_t getWidth() const { return width_; }
 
-private:
+ private:
   //! TODO(yuyang18): Add resize method to CpuMemHandlePtr, then we can get rid
   //! of std::vector here.
   CpuMemHandlePtr preallocatedBuf_;
diff --git a/paddle/math/SparseMatrix.h b/paddle/math/SparseMatrix.h
index 7c525f4edf3d53544c195f8e253c27a03854a793..9181fa29233677d8f4fac503905cc31eb66cb6c1 100644
--- a/paddle/math/SparseMatrix.h
+++ b/paddle/math/SparseMatrix.h
@@ -25,7 +25,7 @@ namespace paddle {
 typedef std::shared_ptr<_hl_sparse_matrix_s> hl_sparse_matrix_s_ptr;
 
 class GpuSparseMatrix : public Matrix {
-public:
+ public:
   MemoryHandlePtr sMemoryHandle_;
   int* rows_;
   int* cols_;
@@ -36,7 +36,7 @@ public:
   SparseValueType valueType_;
   SparseFormat format_;
 
-public:
+ public:
   GpuSparseMatrix(size_t height,
                   size_t width,
                   size_t nnz, /* used to allocate space */
@@ -73,7 +73,7 @@ public:
                   bool trans,
                   MemoryHandlePtr sMemoryHandle);
 
-protected:
+ protected:
   struct Element {
     int row;
     int col;
@@ -82,7 +82,7 @@ protected:
         : row(rowIn), col(colIn), val(valIn) {}
   };
 
-public:
+ public:
   ~GpuSparseMatrix() {}
 
   void resize(size_t newHeight,
@@ -211,13 +211,13 @@ public:
    */
   void rowMax(IVector& maxIds, Matrix& maxVal);
 
-protected:
+ protected:
   void sparseResize();
 
   void copyRow(int offsets, size_t colNum, const sparse_non_value_t* row);
   void copyRow(int offsets, size_t colNum, const sparse_float_value_t* row);
 
-public:
+ public:
   void mul(const Matrix& a, const Matrix& b, real scaleAB, real scaleT);
 
   void copyFrom(CpuSparseMatrix& src, hl_stream_t stream);
@@ -228,10 +228,10 @@ public:
   void trimFromCSC(const CpuSparseMatrix& src);
 
   // BaseMatrixT interface
-public:
+ public:
   bool isSparse() const { return true; }
 
-private:
+ private:
   using Matrix::mul;
   using Matrix::copyFrom;
   using Matrix::rowMax;
@@ -248,7 +248,7 @@ private:
 namespace paddle {
 
 class GpuSparseMatrix : public Matrix {
-public:
+ public:
   GpuSparseMatrix(size_t height,
                   size_t width,
                   size_t nnz, /* used to allocate space */
diff --git a/paddle/math/SparseRowMatrix.h b/paddle/math/SparseRowMatrix.h
index 3920de32df7de925d6e22e17b93b15bff8785675..cf6779e8b0b1d6b0c13b21a08ffff5af76e57ba6 100644
--- a/paddle/math/SparseRowMatrix.h
+++ b/paddle/math/SparseRowMatrix.h
@@ -29,7 +29,7 @@ namespace paddle {
  * Sparse Row
  */
 class SparseRowCpuMatrix : public CpuMatrix {
-public:
+ public:
   struct IndexDict {
     // In the following, global id means the row id in the original matrix.
     // Local id means the row id in the local storage which only contains
@@ -53,7 +53,7 @@ public:
 
   virtual ~SparseRowCpuMatrix() {}
 
-public:
+ public:
   /**
    *  Get the row buf
    *
@@ -163,7 +163,7 @@ public:
     return indexDictHandle_->localIndices;
   }
 
-protected:
+ protected:
   template <typename Func>
   void apply(Func f) {
     f(buf_->data(), localIndices_->size() * width_);
@@ -204,7 +204,7 @@ class SyncThreadPool;
 
 /// For prefetching parameters from remote Parameter server
 class SparsePrefetchRowCpuMatrix : public SparseRowCpuMatrix {
-public:
+ public:
   SparsePrefetchRowCpuMatrix(CpuMemHandlePtr dataHandle,
                              size_t height,
                              size_t width,
@@ -229,13 +229,13 @@ public:
    */
   void setupIndices();
 
-protected:
+ protected:
   void addRows(const unsigned int* ids, size_t len);
   SyncThreadPool* pool_;
 };
 
 class SparseAutoGrowRowCpuMatrix : public SparseRowCpuMatrix {
-public:
+ public:
   SparseAutoGrowRowCpuMatrix(size_t height,
                              size_t width,
                              IndexDictPtr indexDictHandle = nullptr,
@@ -258,7 +258,7 @@ public:
 };
 
 class CacheRowCpuMatrix : public SparseAutoGrowRowCpuMatrix {
-public:
+ public:
   CacheRowCpuMatrix(size_t height,
                     size_t width,
                     IndexDictPtr indexDictHandle = nullptr,
@@ -287,7 +287,7 @@ public:
 
   virtual void mul(CpuSparseMatrix* a, CpuMatrix* b, real scaleAB, real scaleT);
 
-public:
+ public:
   CpuVectorPtr sourceDataVec_;
   real* sourceData_;
 };
@@ -299,7 +299,7 @@ public:
  * ids are hashed by worker thread id.
  */
 class SparseRowIdsCpuMatrix : public CpuMatrix {
-public:
+ public:
   SparseRowIdsCpuMatrix(CpuMemHandlePtr dataHandle,
                         size_t height,
                         size_t width,
@@ -310,7 +310,7 @@ public:
 
   std::vector<uint32_t>& getIds(size_t threadId) { return idsArray_[threadId]; }
 
-private:
+ private:
   std::vector<std::vector<uint32_t>> idsArray_;
 };
 
@@ -320,13 +320,13 @@ private:
 namespace paddle {
 
 class SparseRowCpuMatrix : public CpuMatrix {
-public:
+ public:
   void reserveStore() {}
   void clearIndices() {}
 };
 
 class SparsePrefetchRowCpuMatrix : public SparseRowCpuMatrix {
-public:
+ public:
   void setupIndices() {}
   void addRows(MatrixPtr input) {}
   void addRows(IVectorPtr ids) {}
diff --git a/paddle/math/Storage.h b/paddle/math/Storage.h
index ba8f4689a1e896304aa14821b40fc8ff0c304bb2..61a9aa2a07442d9e4ede80c961e17e079eb8b3ba 100644
--- a/paddle/math/Storage.h
+++ b/paddle/math/Storage.h
@@ -25,7 +25,7 @@ namespace paddle {
  * @brief Storage manager for multiple devices.
  */
 class StorageEngine {
-public:
+ public:
   /**
    * @return Storage singleton
    */
@@ -41,7 +41,7 @@ public:
    */
   PoolAllocator* getCpuAllocator();
 
-protected:
+ protected:
   StorageEngine();
   ~StorageEngine();
   RWLock lock_;
diff --git a/paddle/math/TensorApply.h b/paddle/math/TensorApply.h
index 7d79cae5a11851b190afbb9ac94efdf2ba2510b7..8b642047bffa33b47dfb8ffc8e3fd2a9b7dbae3a 100644
--- a/paddle/math/TensorApply.h
+++ b/paddle/math/TensorApply.h
@@ -21,7 +21,7 @@ namespace paddle {
  */
 template <typename Derived, class T>
 class TensorApply {
-public:
+ public:
   explicit INLINE TensorApply(const Derived& p)
       : data_(p.data_),
         stride_(p.stride_),
@@ -52,7 +52,7 @@ public:
  */
 template <typename Derived, class T>
 class TensorApply<const Derived, T> {
-public:
+ public:
   explicit INLINE TensorApply(const Derived& p)
       : data_(p.data_),
         stride_(p.stride_),
@@ -77,7 +77,7 @@ public:
 
 template <typename Derived, class T>
 class TensorApply<const TensorExpression<Derived, T>, T> {
-public:
+ public:
   explicit TensorApply(const TensorExpression<Derived, T>& expr)
       : expr_(expr.derived()) {}
 
@@ -97,7 +97,7 @@ public:
  */
 template <class OP, typename ArgType, class T>
 class TensorApply<const TensorUnaryOp<OP, ArgType, T>, T> {
-public:
+ public:
   explicit INLINE TensorApply(const TensorUnaryOp<OP, ArgType, T>& expr)
       : op_(expr.op_), expr_(expr.expr_) {}
 
@@ -118,7 +118,7 @@ public:
  */
 template <class OP, typename LhsType, typename RhsType, class T>
 class TensorApply<const TensorBinaryOp<OP, LhsType, RhsType, T>, T> {
-public:
+ public:
   explicit INLINE TensorApply(
       const TensorBinaryOp<OP, LhsType, RhsType, T>& expr)
       : op_(expr.op_), lhs_(expr.lhs_), rhs_(expr.rhs_) {
@@ -153,7 +153,7 @@ public:
  */
 template <typename ArgType1, typename ArgType2, typename ArgType3, class T>
 class TensorApply<const TensorTernaryOp<ArgType1, ArgType2, ArgType3, T>, T> {
-public:
+ public:
   explicit INLINE TensorApply(
       const TensorTernaryOp<ArgType1, ArgType2, ArgType3, T>& expr)
       : expr1_(expr.expr1_), expr2_(expr.expr2_), expr3_(expr.expr3_) {
@@ -192,7 +192,7 @@ public:
  */
 template <class OP, typename ArgType, class T>
 class TensorApply<const TensorConstant<OP, ArgType, T>, T> {
-public:
+ public:
   explicit INLINE TensorApply(const TensorConstant<OP, ArgType, T>& expr)
       : op_(expr.op_), expr_(expr.expr_) {}
 
diff --git a/paddle/math/TensorAssign.h b/paddle/math/TensorAssign.h
index 113d98c16b22b06971040b1a1ce52c696f6c3c14..7d4726ddba43202970c37dd1a08f842104b24ada 100644
--- a/paddle/math/TensorAssign.h
+++ b/paddle/math/TensorAssign.h
@@ -25,7 +25,7 @@ namespace paddle {
  */
 template <typename LhsType, typename RhsType, class T>
 class TensorAssignOp {
-public:
+ public:
   explicit TensorAssignOp(const LhsType& lhs, const RhsType& rhs)
       : lhs_(lhs), rhs_(rhs) {
 #ifndef __CUDA_ARCH__
@@ -49,7 +49,7 @@ public:
   }
   INLINE bool useGpu() const { return lhs_.useGpu(); }
 
-private:
+ private:
   TensorApply<LhsType, T> lhs_;
   TensorApply<const RhsType, T> rhs_;
 };
diff --git a/paddle/math/TensorExpression.h b/paddle/math/TensorExpression.h
index 83229ae65dd1f4ed6b885c3d6195b3758b8ba039..f6da9adfca50e49ca260e20313c8979a38e1b06b 100644
--- a/paddle/math/TensorExpression.h
+++ b/paddle/math/TensorExpression.h
@@ -40,7 +40,7 @@ class TensorAssignOp;
  */
 template <typename Derived, class T>
 class TensorExpression {
-public:
+ public:
   /**
    * Element wise unary expression.
    */
@@ -355,7 +355,7 @@ public:
     return TensorAssignOp<Derived, ExpressionType, T>(derived(), expr);
   }
 
-protected:
+ protected:
   const Derived& derived() const { return *static_cast<const Derived*>(this); }
 };
 
@@ -365,7 +365,7 @@ protected:
 template <class OP, typename ExprType, class T>
 class TensorUnaryOp
     : public TensorExpression<TensorUnaryOp<OP, ExprType, T>, T> {
-public:
+ public:
   explicit TensorUnaryOp(const OP op, const ExprType& expr)
       : op_(op), expr_(expr) {}
 
@@ -379,7 +379,7 @@ public:
 template <class OP, typename LhsType, typename RhsType, class T>
 class TensorBinaryOp
     : public TensorExpression<TensorBinaryOp<OP, LhsType, RhsType, T>, T> {
-public:
+ public:
   explicit TensorBinaryOp(const OP op, const LhsType& lhs, const RhsType& rhs)
       : op_(op), lhs_(lhs), rhs_(rhs) {}
 
@@ -395,7 +395,7 @@ template <typename ExprType1, typename ExprType2, typename ExprType3, class T>
 class TensorTernaryOp : public TensorExpression<
                             TensorTernaryOp<ExprType1, ExprType2, ExprType3, T>,
                             T> {
-public:
+ public:
   explicit TensorTernaryOp(const ExprType1& expr1,
                            const ExprType2& expr2,
                            const ExprType3& expr3)
@@ -412,7 +412,7 @@ public:
 template <class OP, typename ExprType, class T>
 class TensorConstant
     : public TensorExpression<TensorConstant<OP, ExprType, T>, T> {
-public:
+ public:
   explicit TensorConstant(const OP op, const ExprType& expr)
       : op_(op), expr_(expr) {}
 
diff --git a/paddle/math/Vector.h b/paddle/math/Vector.h
index 3efbc769dff5aa1dbc9d5015b0cbac313710d70d..964b42cae52af9b487ab17103bc5e999514e4dd1 100644
--- a/paddle/math/Vector.h
+++ b/paddle/math/Vector.h
@@ -40,13 +40,13 @@ class Matrix;
 
 template <class T>
 class BaseVector : public BaseMatrixT<T> {
-public:
+ public:
   BaseVector(size_t size, T* data, bool useGpu)
       : BaseMatrixT<T>(1, size, data, false, useGpu), size_(this->width_) {}
 
   ~BaseVector() {}
 
-protected:
+ protected:
   size_t& size_;
 };
 
@@ -57,7 +57,7 @@ protected:
  */
 template <class T>
 class VectorT : public BaseVector<T> {
-protected:
+ protected:
   VectorT(size_t size, MemoryHandlePtr memoryHandle, size_t offset, bool useGpu)
       : BaseVector<T>(size,
                       reinterpret_cast<T*>(memoryHandle->getBuf()) + offset,
@@ -71,7 +71,7 @@ protected:
   VectorT(size_t size, T* data, bool useGpu)
       : BaseVector<T>(size, data, useGpu) {}
 
-public:
+ public:
   virtual ~VectorT() {}
 
   static std::shared_ptr<VectorT<T>> create(size_t size, bool useGpu);
@@ -281,7 +281,7 @@ public:
     }
   }
 
-protected:
+ protected:
   friend class GpuVectorT<T>;
   friend class CpuVectorT<T>;
   virtual void copyTo(CpuVectorT<T>* dest) const = 0;
@@ -297,7 +297,7 @@ std::ostream& operator<<(std::ostream& os, const VectorT<T>& vec) {
 
 template <class T>
 class GpuVectorT : public VectorT<T> {
-public:
+ public:
   explicit GpuVectorT(size_t size);
   GpuVectorT(size_t size, GpuMemHandlePtr memHandle, size_t offset)
       : VectorT<T>(size, memHandle, offset, true) {}
@@ -343,14 +343,14 @@ public:
     TensorGpuApply<T>(*this, expr);
   }
 
-protected:
+ protected:
   virtual void copyTo(CpuVectorT<T>* dest) const;
   virtual void copyTo(GpuVectorT<T>* dest) const;
 };
 
 template <class T>
 class CpuVectorT : public VectorT<T> {
-public:
+ public:
   explicit CpuVectorT(size_t size);
   CpuVectorT(size_t size, MemoryHandlePtr memoryHandle, size_t offset)
       : VectorT<T>(size, memoryHandle, offset, false) {}
@@ -415,7 +415,7 @@ public:
 
 template <class T>
 class ParallelCpuVectorT : public CpuVectorT<T> {
-public:
+ public:
   ParallelCpuVectorT(size_t size, SyncThreadPool* pool)
       : CpuVectorT<T>(size), pool_(pool) {}
 
@@ -434,7 +434,7 @@ public:
 
   virtual void exec(SyncThreadPool::JobFunc jobFunc);
 
-private:
+ private:
   typedef std::function<void(CpuVectorT<T>& vec)> ExecFunc;
   void parallelExec(ExecFunc func);
   SyncThreadPool* pool_;
@@ -445,7 +445,7 @@ private:
  */
 template <class T>
 class CpuGpuVectorT {
-public:
+ public:
   /**
    * @brief An enum type of SyncedFlag using to
    *        mark data memory is in CPU or GPU.
@@ -670,7 +670,7 @@ public:
     setSync(flag);
   }
 
-protected:
+ protected:
   void resizeOrCreate(size_t size, bool useGpu);
 
   /**
diff --git a/paddle/math/tests/TensorCheck.h b/paddle/math/tests/TensorCheck.h
index f4332ede36356bc666612a240448c1be71e5170e..40ac04ef5d4baa0239bb03b04c3a6cce0fcac5a5 100644
--- a/paddle/math/tests/TensorCheck.h
+++ b/paddle/math/tests/TensorCheck.h
@@ -32,7 +32,7 @@ using paddle::CpuVectorT;
 using paddle::GpuVectorT;
 
 class AssertEqual {
-public:
+ public:
   AssertEqual(real err = 0) : err_(err) {}
 
   inline bool operator()(real a, real b) {
@@ -51,7 +51,7 @@ public:
     return true;
   }
 
-private:
+ private:
   real err_;
 };
 
@@ -60,71 +60,71 @@ class CopyToCpu;
 
 template <>
 class CopyToCpu<CpuMatrix> {
-public:
+ public:
   explicit CopyToCpu(const CpuMatrix& arg) : arg_(arg) {}
   const CpuMatrix& copiedArg() const { return arg_; }
 
-private:
+ private:
   const CpuMatrix& arg_;
 };
 
 template <>
 class CopyToCpu<GpuMatrix> {
-public:
+ public:
   explicit CopyToCpu(const GpuMatrix& arg)
       : arg_(arg.getHeight(), arg.getWidth()) {
     arg_.copyFrom(arg);
   }
   CpuMatrix& copiedArg() { return arg_; }
 
-private:
+ private:
   CpuMatrix arg_;
 };
 
 template <>
 class CopyToCpu<Matrix> {
-public:
+ public:
   explicit CopyToCpu(const Matrix& arg)
       : arg_(arg.getHeight(), arg.getWidth()) {
     arg_.copyFrom(arg);
   }
   CpuMatrix& copiedArg() { return arg_; }
 
-private:
+ private:
   CpuMatrix arg_;
 };
 
 template <typename T>
 class CopyToCpu<CpuVectorT<T>> {
-public:
+ public:
   explicit CopyToCpu(const CpuVectorT<T>& arg) : arg_(arg) {}
   const CpuVectorT<T>& copiedArg() const { return arg_; }
 
-private:
+ private:
   const CpuVectorT<T>& arg_;
 };
 
 template <typename T>
 class CopyToCpu<GpuVectorT<T>> {
-public:
+ public:
   explicit CopyToCpu(const GpuVectorT<T>& arg) : arg_(arg.getSize()) {
     arg_.copyFrom(arg);
   }
   CpuVectorT<T>& copiedArg() { return arg_; }
 
-private:
+ private:
   CpuVectorT<T> arg_;
 };
 
 template <typename T>
 class CopyToCpu<VectorT<T>> {
-public:
+ public:
   explicit CopyToCpu(const VectorT<T>& arg) : arg_(arg.getSize()) {
     arg_.copyFrom(arg);
   }
   CpuVectorT<T>& copiedArg() { return arg_; }
 
-private:
+ private:
   CpuVectorT<T> arg_;
 };
 
diff --git a/paddle/math/tests/TestUtils.h b/paddle/math/tests/TestUtils.h
index d2b9706432f84fa082e071eb09d2ffe7402a085f..e1966ec8a74747960420ec80fdfbb957f7cf177f 100644
--- a/paddle/math/tests/TestUtils.h
+++ b/paddle/math/tests/TestUtils.h
@@ -56,31 +56,31 @@ using paddle::GpuSparseMatrix;
 
 template <typename T1, typename T2>
 class ReplaceType {
-public:
+ public:
   typedef T1 type;
 };
 
 template <>
 class ReplaceType<BaseMatrix, CpuMatrix> {
-public:
+ public:
   typedef CpuMatrix type;
 };
 
 template <>
 class ReplaceType<BaseMatrix, GpuMatrix> {
-public:
+ public:
   typedef GpuMatrix type;
 };
 
 template <>
 class ReplaceType<Matrix, CpuMatrix> {
-public:
+ public:
   typedef CpuMatrix type;
 };
 
 template <>
 class ReplaceType<Matrix, GpuMatrix> {
-public:
+ public:
   typedef GpuMatrix type;
 };
 
@@ -180,25 +180,25 @@ R call(C& obj, R (FC::*f)(FArgs...), Args&&... args) {
 
 template <typename T>
 class ReturnType {
-public:
+ public:
   typedef T type;
 };
 
 template <>
 class ReturnType<CpuMatrix> {
-public:
+ public:
   typedef GpuMatrix type;
 };
 
 template <>
 class ReturnType<CpuIVector> {
-public:
+ public:
   typedef GpuIVector type;
 };
 
 template <>
 class ReturnType<CpuSparseMatrix> {
-public:
+ public:
   typedef GpuSparseMatrix type;
 };
 
@@ -234,7 +234,7 @@ GpuSparseMatrix autoArgs(CpuSparseMatrix& v) {
 }
 
 class AutoCompare {
-public:
+ public:
   /**
    * err is the allowed calculation error.
    * The smaller the value of err,
@@ -285,7 +285,7 @@ public:
     TensorCheck(compare, cpu, gpu);
   }
 
-protected:
+ protected:
   CpuMatrix cpu;
   GpuMatrix gpu;
   AssertEqual compare;
diff --git a/paddle/math/tests/test_ExecViaCpu.cpp b/paddle/math/tests/test_ExecViaCpu.cpp
index 513c7b440e0aa6f20cc8209a3624f32f4892225b..72256cb9d4c93159418d27c7ca0d4f8b9a412a64 100644
--- a/paddle/math/tests/test_ExecViaCpu.cpp
+++ b/paddle/math/tests/test_ExecViaCpu.cpp
@@ -39,7 +39,7 @@ real f(Matrix& mat1,
 }
 
 class Functor {
-public:
+ public:
   real operator()(Matrix& mat1,
                   const Matrix& mat2,
                   IVector& vec1,
@@ -49,7 +49,7 @@ public:
     return a_;
   }
 
-private:
+ private:
   real a_;
 };
 
diff --git a/paddle/math/tests/test_TrainingAlgorithm.cpp b/paddle/math/tests/test_TrainingAlgorithm.cpp
index fb146176ca8eb97a9cdbaf9ebd5c4997a8439718..fb58d26734cab5d7d7bbbbe1cf8a920e4195b4bb 100644
--- a/paddle/math/tests/test_TrainingAlgorithm.cpp
+++ b/paddle/math/tests/test_TrainingAlgorithm.cpp
@@ -28,14 +28,14 @@ DEFINE_double(max_diff, 1e-13, "max diff allowed");
 #endif
 
 class SetMaxDiff {
-public:
+ public:
   explicit SetMaxDiff(double max_diff) {
     max_diff_ = FLAGS_max_diff;
     FLAGS_max_diff = max_diff;
   }
   ~SetMaxDiff() { FLAGS_max_diff = max_diff_; }
 
-private:
+ private:
   double max_diff_;
 };
 
diff --git a/paddle/math/tests/test_perturbation.cpp b/paddle/math/tests/test_perturbation.cpp
index ef99dab60a874846d04c5ce07d38b2857640ad7b..969400666f12e4c6001f270be3ec144e7e4d0702 100644
--- a/paddle/math/tests/test_perturbation.cpp
+++ b/paddle/math/tests/test_perturbation.cpp
@@ -32,7 +32,7 @@ const int TGT_SIZE = 21;
 const int CHANNELS = 3;
 
 class PerturbationTest : public testing::Test {
-protected:
+ protected:
   virtual void SetUp() { generateTestImages(gpuImages_); }
 
   virtual void TearDown() {}
diff --git a/paddle/optimizer/CMakeLists.txt b/paddle/optimizer/CMakeLists.txt
index 25fc35311fc63988c64a445d72fc6255e49e8d4b..7c80faa48ce960a3a7eb7d88eda4f2b09756410e 100644
--- a/paddle/optimizer/CMakeLists.txt
+++ b/paddle/optimizer/CMakeLists.txt
@@ -7,6 +7,10 @@ set(OPITMIZER_SRCS
     sgd_optimizer.cc
   )
 
-cc_library(paddle_optimizer STATIC SRCS ${OPITMIZER_SRCS} DEPS paddle_proto glog)
-cc_test(serialization_test SRCS serialization_test.cc DEPS paddle_proto)
-cc_test(parameter_optimizer_test SRCS parameter_optimizer_test.cc DEPS paddle_optimizer)
+add_library(paddle_optimizer ${OPITMIZER_SRCS})
+target_link_libraries(paddle_optimizer paddle_proto glog)
+
+if (WITH_TESTING)
+    add_unittest(serialization_test serialization_test.cc)
+    add_unittest(parameter_optimizer_test parameter_optimizer_test.cc)
+endif()
diff --git a/paddle/optimizer/adadelta_optimizer.h b/paddle/optimizer/adadelta_optimizer.h
index 74df9d54be734fedec8aeddff5f50b1d1aefb1d3..5beb62295a83ba4826e9a6b9caf21de78d2e8ced 100644
--- a/paddle/optimizer/adadelta_optimizer.h
+++ b/paddle/optimizer/adadelta_optimizer.h
@@ -20,7 +20,7 @@ namespace paddle {
 namespace optimizer {
 
 class AdadeltaOptimizer : public ParameterOptimizer {
-public:
+ public:
   AdadeltaOptimizer(
       Tensor *parameter, LrPolicy *lr, double rho, double epsilon, double decay)
       : ParameterOptimizer(parameter, lr),
@@ -40,7 +40,7 @@ public:
   std::string SerializeState();
   void DeserializeState(const std::string &state);
 
-private:
+ private:
   Tensor *accum_gradient_;
   Tensor *accum_delta_;
   Tensor *update_delta_;
diff --git a/paddle/optimizer/adagrad_optimizer.h b/paddle/optimizer/adagrad_optimizer.h
index 1d58402d78ff9ada8b084a472d46c96580d01e5b..b6fc06739970984cf4bbd27d3e6e1e9066bc350f 100644
--- a/paddle/optimizer/adagrad_optimizer.h
+++ b/paddle/optimizer/adagrad_optimizer.h
@@ -20,7 +20,7 @@ namespace paddle {
 namespace optimizer {
 
 class AdagradOptimizer : public ParameterOptimizer {
-public:
+ public:
   AdagradOptimizer(Tensor *parameter,
                    LrPolicy *lr,
                    double epsilon,
@@ -36,7 +36,7 @@ public:
   std::string SerializeState();
   void DeserializeState(const std::string &state);
 
-private:
+ private:
   Tensor *accum_gradient_;
   double epsilon_;
   double decay_;
diff --git a/paddle/optimizer/adam_optimizer.h b/paddle/optimizer/adam_optimizer.h
index 7977226c8602745d5733021a51fc03d932b0921a..fce10960068364b40592b26a6b439494d75cfa03 100644
--- a/paddle/optimizer/adam_optimizer.h
+++ b/paddle/optimizer/adam_optimizer.h
@@ -20,7 +20,7 @@ namespace paddle {
 namespace optimizer {
 
 class AdamOptimizer : public ParameterOptimizer {
-public:
+ public:
   AdamOptimizer(Tensor *parameter,
                 LrPolicy *lr,
                 double beta_1,
@@ -42,7 +42,7 @@ public:
   std::string SerializeState();
   void DeserializeState(const std::string &state);
 
-private:
+ private:
   Tensor *momentums_;
   Tensor *velocitys_;
   double beta_1_;
diff --git a/paddle/optimizer/lr_policy.h b/paddle/optimizer/lr_policy.h
index 14422d1f42fc45d5e9a560c45259d4003a0b3d11..d639c9f22c8ad77267f68e2c3b35257211bf90df 100644
--- a/paddle/optimizer/lr_policy.h
+++ b/paddle/optimizer/lr_policy.h
@@ -20,7 +20,7 @@ namespace paddle {
 namespace optimizer {
 
 class LrPolicy {
-public:
+ public:
   virtual ~LrPolicy() {}
   virtual double LearningRate(const uint64_t num_sample_passed) = 0;
   virtual std::string SerializeState() = 0;
@@ -29,7 +29,7 @@ public:
 
 // constant learning rate policy
 class ConstLr final : public LrPolicy {
-public:
+ public:
   ConstLr(double lr) : learning_rate_(lr){};
   double LearningRate(const uint64_t num_sample_passed) {
     return learning_rate_;
@@ -45,12 +45,12 @@ public:
     learning_rate_ = state.learning_rate();
   }
 
-private:
+ private:
   double learning_rate_;
 };
 
 class LinearLr final : public LrPolicy {
-public:
+ public:
   LinearLr(double lr, double lr_decay_a, double lr_decay_b)
       : learning_rate_(lr), lr_decay_a_(lr_decay_a), lr_decay_b_(lr_decay_b) {}
   double LearningRate(const uint64_t num_sample_passed) {
@@ -72,7 +72,7 @@ public:
     lr_decay_b_ = state.lr_decay_b();
   }
 
-private:
+ private:
   double learning_rate_;
   double lr_decay_a_;
   double lr_decay_b_;
diff --git a/paddle/optimizer/parameter_optimizer.h b/paddle/optimizer/parameter_optimizer.h
index c7cf8db3ee05c75c171b68bcbcb06a5ae8fa5b48..d5abca82d55c12aed0f4fca0c4c1f21d20586155 100644
--- a/paddle/optimizer/parameter_optimizer.h
+++ b/paddle/optimizer/parameter_optimizer.h
@@ -26,7 +26,7 @@ namespace paddle {
 namespace optimizer {
 
 class ParameterOptimizer {
-public:
+ public:
   /**
    * @brief  update hook for algorithm need to traverse parameter more than
    * once.
@@ -45,7 +45,7 @@ public:
   virtual std::string SerializeState() = 0;
   virtual void DeserializeState(const std::string &state) = 0;
 
-protected:
+ protected:
   Tensor *parameter_;
   // learning rate policy
   LrPolicy *lr_policy_;
diff --git a/paddle/optimizer/parameter_optimizer_test.cc b/paddle/optimizer/parameter_optimizer_test.cc
index d663e2fd007febd3b9f0f43d213d63d2b20656b8..1d9572999e9e0f10092eecbc1b41369a89629da7 100644
--- a/paddle/optimizer/parameter_optimizer_test.cc
+++ b/paddle/optimizer/parameter_optimizer_test.cc
@@ -38,7 +38,7 @@ paddle::optimizer::Tensor* FixedTensor(size_t size) {
 }
 
 class OptimizerTest : public testing::Test {
-public:
+ public:
   virtual ~OptimizerTest() {}
   // init paddle::optimizer::Tensor shape
   const size_t kSize = 5;
@@ -115,7 +115,7 @@ public:
     }
   }
 
-private:
+ private:
   std::vector<paddle::optimizer::ParameterOptimizer*> opts_;
   paddle::OptimizerConfig config_;
 };
diff --git a/paddle/optimizer/sgd_optimizer.h b/paddle/optimizer/sgd_optimizer.h
index f504d98adb8a01fd69ff313075b4c417222c765e..a8957cde54abd6667143d2a8265d732c849294e3 100644
--- a/paddle/optimizer/sgd_optimizer.h
+++ b/paddle/optimizer/sgd_optimizer.h
@@ -20,7 +20,7 @@ namespace paddle {
 namespace optimizer {
 
 class SGDOptimizer : public ParameterOptimizer {
-public:
+ public:
   SGDOptimizer(Tensor* parameter, LrPolicy* lr, double m, double d, bool n)
       : ParameterOptimizer(parameter, lr),
         momentums_(nullptr),
@@ -39,7 +39,7 @@ public:
   std::string SerializeState();
   void DeserializeState(const std::string& state);
 
-private:
+ private:
   Tensor* momentums_;
   double momentum_;
   double decay_;
diff --git a/paddle/optimizer/tensor.h b/paddle/optimizer/tensor.h
index fd32398a237e7e08a198707347cd3c0a4ed77bb3..d2cef99074335be6f9852d60daa103b9b45a550d 100644
--- a/paddle/optimizer/tensor.h
+++ b/paddle/optimizer/tensor.h
@@ -26,7 +26,7 @@ namespace optimizer {
 
 template <class T>
 class TensorT {
-public:
+ public:
   TensorT(size_t size) : height_(1), width_(size) {
     // new T[size]() initializes all element to zero value.
     data_ptr_ = std::shared_ptr<T>(new T[size](), std::default_delete<T[]>());
@@ -54,7 +54,7 @@ public:
   // TODO: replace with tensorshape
   size_t size() const { return this->width_ * this->height_; }
 
-protected:
+ protected:
   size_t height_;
   size_t width_;
   std::shared_ptr<T> data_ptr_;
diff --git a/paddle/parameter/AverageOptimizer.h b/paddle/parameter/AverageOptimizer.h
index 4ad3c18d56abf16d1274c5b3b8e0347b85e64dea..f0fe2fd28e4be7df8ebc52fd9b9b5540f3d76949 100644
--- a/paddle/parameter/AverageOptimizer.h
+++ b/paddle/parameter/AverageOptimizer.h
@@ -21,7 +21,7 @@ namespace paddle {
 // After Optimization, parameter values are further averaged within
 // time range.
 class AverageOptimizer : public ParameterOptimizer {
-public:
+ public:
   // if *useParameterApply* set, use PARAMETER_APPLY to store averaged parameter
   // else use PARAMETER_VALUE, and value backup in PARAMETER_GRADIENT
   AverageOptimizer(const OptimizationConfig& optConfig,
@@ -65,7 +65,7 @@ public:
 
   virtual void setNoDecay() { optimizer_->setNoDecay(); }
 
-protected:
+ protected:
   std::unique_ptr<ParameterOptimizer> optimizer_;
   bool useApply_;
 
@@ -98,7 +98,7 @@ protected:
 
 // Average Optimizer with Sparse support.
 class AverageSparseOptimizer : public AverageOptimizer {
-public:
+ public:
   AverageSparseOptimizer(const OptimizationConfig& optConfig,
                          ParameterOptimizer* optimizer,
                          bool useParameterApply)
@@ -130,7 +130,7 @@ public:
     t0Vec_.assign(t0Vec_.size(), 0);
   }
 
-protected:
+ protected:
   /**
    *  counting batches, clear after catch up with
    *  t(timer_) is current time,
diff --git a/paddle/parameter/CMakeLists.txt b/paddle/parameter/CMakeLists.txt
index d2ae1c16c6b7316f1a6facdef4b933693d6ba818..19ae07e077e2b8f55ce4050566c9cf6aaa0efa0a 100644
--- a/paddle/parameter/CMakeLists.txt
+++ b/paddle/parameter/CMakeLists.txt
@@ -5,8 +5,6 @@ file(GLOB PARAMETERS_SOURCES . *.cpp)
 
 add_library(paddle_parameter STATIC
         ${PARAMETERS_SOURCES})
-add_style_check_target(paddle_parameter ${PARAMETERS_SOURCES})
-add_style_check_target(paddle_parameter ${PARAMETERS_HEADERS})
 add_dependencies(paddle_parameter paddle_proto ${external_project_dependencies})
 if(WITH_TESTING)
     add_subdirectory(tests)
diff --git a/paddle/parameter/FirstOrderOptimizer.h b/paddle/parameter/FirstOrderOptimizer.h
index 047989fcad52afc1d4d4c347258d0fb2f069f3d4..86b9a591aff7a58aafa194c64cb09cd6636d0454 100644
--- a/paddle/parameter/FirstOrderOptimizer.h
+++ b/paddle/parameter/FirstOrderOptimizer.h
@@ -22,7 +22,7 @@ namespace paddle {
 
 // Plain SGD optimization.
 class SgdOptimizer : public ParameterOptimizer {
-public:
+ public:
   explicit SgdOptimizer(const OptimizationConfig& optConfig)
       : ParameterOptimizer(optConfig) {
     addParameterType(PARAMETER_MOMENTUM);
@@ -77,7 +77,7 @@ class SparseMomentumParameterOptimizer : public ParameterOptimizer {
     \gamma_t: learning rate at the t'th step
   */
 
-public:
+ public:
   explicit SparseMomentumParameterOptimizer(
       const OptimizationConfig& optConfig);
   virtual void init(size_t numRows, const ParameterConfig* config);
@@ -89,7 +89,7 @@ public:
       const ParameterConfig& config) const;
   virtual void finishBatch();
 
-private:
+ private:
   real alpha_;
   real beta_;
   real tau_;
@@ -98,7 +98,7 @@ private:
   real momentum_;
   real decayRate_;
 
-protected:
+ protected:
   int64_t timer_;
   mutable std::vector<int64_t> t0Vec_;
   bool isParameterSparse_;
@@ -109,7 +109,7 @@ protected:
  * http://www.magicbroom.info/Papers/DuchiHaSi10.pdf
  */
 class AdagradParameterOptimizer : public ParameterOptimizer {
-public:
+ public:
   explicit AdagradParameterOptimizer(const OptimizationConfig& optConfig)
       : ParameterOptimizer(optConfig) {
     addParameterType(PARAMETER_MOMENTUM);
@@ -129,7 +129,7 @@ public:
   virtual TraverseCallback needSpecialTraversal(
       const ParameterConfig& config) const;
 
-protected:
+ protected:
   int64_t numUpdates_;
   static const int64_t kMaxNumAccumulates = 16384;
 };
@@ -139,7 +139,7 @@ protected:
  * http://www.matthewzeiler.com/pubs/googleTR2012/googleTR2012.pdf
  */
 class AdaDeltaParameterOptimizer : public ParameterOptimizer {
-public:
+ public:
   explicit AdaDeltaParameterOptimizer(const OptimizationConfig& optConfig)
       : ParameterOptimizer(optConfig) {
     addParameterType(PARAMETER_MOMENTUM);
@@ -158,14 +158,14 @@ public:
                       const ParameterConfig& config,
                       size_t sparseId) const;
 
-protected:
+ protected:
   real rou_;
   real epsilon_;
 };
 
 // RMSProp Parameter Optimization.
 class RMSPropParameterOptimizer : public ParameterOptimizer {
-public:
+ public:
   explicit RMSPropParameterOptimizer(const OptimizationConfig& optConfig)
       : ParameterOptimizer(optConfig) {
     addParameterType(PARAMETER_MOMENTUM);
@@ -191,7 +191,7 @@ public:
                       const ParameterConfig& config,
                       size_t sparseId) const;
 
-protected:
+ protected:
   real rou_;
   real epsilon_;
 
@@ -208,7 +208,7 @@ protected:
 
 // Decayed AdaGrad Optimization.
 class DecayedAdagradParameterOptimizer : public ParameterOptimizer {
-public:
+ public:
   explicit DecayedAdagradParameterOptimizer(const OptimizationConfig& optConfig)
       : ParameterOptimizer(optConfig) {
     addParameterType(PARAMETER_MOMENTUM);
@@ -233,7 +233,7 @@ public:
                       const ParameterConfig& config,
                       size_t sparseId) const;
 
-protected:
+ protected:
   real rou_;
   real epsilon_;
 
@@ -253,7 +253,7 @@ protected:
  * Reference Paper: http://arxiv.org/abs/1412.6980 Algorithm 1
  */
 class AdamParameterOptimizer : public ParameterOptimizer {
-public:
+ public:
   explicit AdamParameterOptimizer(const OptimizationConfig& optConfig)
       : ParameterOptimizer(optConfig),
         beta1_(optConfig.adam_beta1()),
@@ -275,7 +275,7 @@ public:
                       const ParameterConfig& config,
                       size_t sparseId) const;
 
-protected:
+ protected:
   real beta1_;
   real beta2_;
   real epsilon_;
@@ -288,7 +288,7 @@ protected:
  * Reference Paper: http://arxiv.org/abs/1412.6980 Algorithm 2
  */
 class AdamaxParameterOptimizer : public ParameterOptimizer {
-public:
+ public:
   explicit AdamaxParameterOptimizer(const OptimizationConfig& optConfig)
       : ParameterOptimizer(optConfig),
         beta1_(optConfig.adam_beta1()),
@@ -305,7 +305,7 @@ public:
                       const ParameterConfig& config,
                       size_t sparseId) const;
 
-protected:
+ protected:
   real beta1_;
   real beta2_;
   int64_t step_;
@@ -315,7 +315,7 @@ protected:
 // Used in pserver,
 // when PARAMETER_DELTA stores in PARAMETER_GRADIENT.
 class AddOptimizer : public ParameterOptimizer {
-public:
+ public:
   explicit AddOptimizer(const OptimizationConfig& optConfig)
       : ParameterOptimizer(optConfig) {}
 
@@ -333,7 +333,7 @@ public:
 
 // A optimizer which does nothing.
 class DummyOptimizer : public ParameterOptimizer {
-public:
+ public:
   explicit DummyOptimizer(const OptimizationConfig& optConfig)
       : ParameterOptimizer(optConfig) {}
 
@@ -344,7 +344,7 @@ public:
 
 // Do gradient clipping before sgd update
 class OptimizerWithGradientClipping : public ParameterOptimizer {
-public:
+ public:
   OptimizerWithGradientClipping(const OptimizationConfig& optConfig,
                                 ParameterOptimizer* optimizer)
       : ParameterOptimizer(optConfig), optimizer_(optimizer) {
@@ -374,7 +374,7 @@ public:
 
   virtual void setNoDecay() { optimizer_->setNoDecay(); }
 
-protected:
+ protected:
   std::unique_ptr<ParameterOptimizer> optimizer_;
 };
 
diff --git a/paddle/parameter/LearningRateScheduler.cpp b/paddle/parameter/LearningRateScheduler.cpp
index b6b58e3ddad6a0e8811bf56502c3f2f0c8728f5c..d57d2189a45dc8cbcea7a8a5f25c5ec7ac71cca3 100644
--- a/paddle/parameter/LearningRateScheduler.cpp
+++ b/paddle/parameter/LearningRateScheduler.cpp
@@ -28,20 +28,20 @@ LearningRateScheduler* LearningRateScheduler::create(
 // LRS stands for LearningRateScheduler
 
 class BaseLRS : public LearningRateScheduler {
-public:
+ public:
   explicit BaseLRS(const OptimizationConfig& config)
       : learningRate_(config.learning_rate()),
         a_(config.learning_rate_decay_a()),
         b_(config.learning_rate_decay_b()) {}
 
-protected:
+ protected:
   real learningRate_;
   real a_;
   real b_;
 };
 
 class ConstLRS : public BaseLRS {
-public:
+ public:
   explicit ConstLRS(const OptimizationConfig& config) : BaseLRS(config) {}
   virtual real calcLearningRate(int64_t numSamplesProcessed, int64_t pass) {
     return learningRate_;
@@ -50,7 +50,7 @@ public:
 REGISTER_LEARNING_RATE_SCHEDULER(constant, ConstLRS);
 
 class PolyLRS : public BaseLRS {
-public:
+ public:
   explicit PolyLRS(const OptimizationConfig& config) : BaseLRS(config) {}
   virtual real calcLearningRate(int64_t numSamplesProcessed, int64_t pass) {
     return learningRate_ * pow(1.0 + a_ * numSamplesProcessed, -b_);
@@ -59,7 +59,7 @@ public:
 REGISTER_LEARNING_RATE_SCHEDULER(poly, PolyLRS);
 
 class CaffePolyLRS : public BaseLRS {
-public:
+ public:
   explicit CaffePolyLRS(const OptimizationConfig& config) : BaseLRS(config) {}
   virtual real calcLearningRate(int64_t numSamplesProcessed, int64_t pass) {
     if (numSamplesProcessed > a_) {
@@ -78,7 +78,7 @@ public:
 REGISTER_LEARNING_RATE_SCHEDULER(caffe_poly, CaffePolyLRS);
 
 class ExpLRS : public BaseLRS {
-public:
+ public:
   explicit ExpLRS(const OptimizationConfig& config) : BaseLRS(config) {}
   virtual real calcLearningRate(int64_t numSamplesProcessed, int64_t pass) {
     double decayRatio = (double)numSamplesProcessed / b_;
@@ -88,7 +88,7 @@ public:
 REGISTER_LEARNING_RATE_SCHEDULER(exp, ExpLRS);
 
 class DiscreteExpLRS : public BaseLRS {
-public:
+ public:
   explicit DiscreteExpLRS(const OptimizationConfig& config) : BaseLRS(config) {}
   virtual real calcLearningRate(int64_t numSamplesProcessed, int64_t pass) {
     int numDecays = floor(numSamplesProcessed / b_);
@@ -98,7 +98,7 @@ public:
 REGISTER_LEARNING_RATE_SCHEDULER(discexp, DiscreteExpLRS);
 
 class LinearLRS : public BaseLRS {
-public:
+ public:
   explicit LinearLRS(const OptimizationConfig& config) : BaseLRS(config) {}
   virtual real calcLearningRate(int64_t numSamplesProcessed, int64_t pass) {
     return std::max(learningRate_ - a_ * numSamplesProcessed, b_);
@@ -113,7 +113,7 @@ REGISTER_LEARNING_RATE_SCHEDULER(linear, LinearLRS);
   then learning_rate = learning_rate_base * rate_i
 */
 class ManualLRS : public BaseLRS {
-public:
+ public:
   explicit ManualLRS(const OptimizationConfig& config)
       : BaseLRS(config), currentSegment_(0), lastNum_(0) {
     std::vector<std::string> pieces;
@@ -151,7 +151,7 @@ public:
     return learningRate_ * rates_.back();
   }
 
-protected:
+ protected:
   std::vector<real> rates_;
   std::vector<int64_t> segments_;
   size_t currentSegment_;
@@ -161,7 +161,7 @@ protected:
 REGISTER_LEARNING_RATE_SCHEDULER(manual, ManualLRS);
 
 class PassManualLRS : public ManualLRS {
-public:
+ public:
   explicit PassManualLRS(const OptimizationConfig& config)
       : ManualLRS(config) {}
   virtual real calcLearningRate(int64_t numSamplesProcessed, int64_t pass) {
diff --git a/paddle/parameter/LearningRateScheduler.h b/paddle/parameter/LearningRateScheduler.h
index aea99a1c204b46e937135cbde22360a12d087ae2..3fad97040248dcf8a22988c38153df31f267ed37 100644
--- a/paddle/parameter/LearningRateScheduler.h
+++ b/paddle/parameter/LearningRateScheduler.h
@@ -26,7 +26,7 @@ namespace paddle {
   })
 
 class LearningRateScheduler {
-public:
+ public:
   static LearningRateScheduler* create(const OptimizationConfig& config);
   virtual ~LearningRateScheduler() {}
   virtual real calcLearningRate(int64_t numSamplesProcessed, int64_t pass) = 0;
diff --git a/paddle/parameter/OptimizerWithRegularizer.h b/paddle/parameter/OptimizerWithRegularizer.h
index 7219d96d924dfa26d3ab52b8c6a2ce1249e4f45c..bd29b3966324b2e206cfe56cc15678539d1e870e 100644
--- a/paddle/parameter/OptimizerWithRegularizer.h
+++ b/paddle/parameter/OptimizerWithRegularizer.h
@@ -20,7 +20,7 @@ namespace paddle {
 
 // add regularizer for objective function to do optimization
 class OptimizerWithRegularizer : public ParameterOptimizer {
-public:
+ public:
   static ParameterOptimizer* create(const OptimizationConfig& optConfig,
                                     const ParameterConfig& paraConfig,
                                     bool isParameterSparse,
@@ -67,7 +67,7 @@ public:
     regularizer_->update(vecs, config, optimizer_->getLearningRate(), 0, 1);
   }
 
-protected:
+ protected:
   std::unique_ptr<ParameterOptimizer> optimizer_;
   Regularizer* regularizer_;
 
@@ -84,7 +84,7 @@ protected:
 // Regularized Loss function for every num of batches
 class OptimizerWithRegularizerEveryNumBatches
     : public OptimizerWithRegularizer {
-public:
+ public:
   OptimizerWithRegularizerEveryNumBatches(const OptimizationConfig& optConfig,
                                           ParameterOptimizer* optimizer,
                                           Regularizer* regularizer)
@@ -112,7 +112,7 @@ public:
   virtual TraverseCallback startCatchUpWith() const;
   virtual void finishCatchUpWith() { baseTimer_ = timer_; }
 
-protected:
+ protected:
   bool isRegularizationBatch(const ParameterConfig& config) const {
     return ((timer_ + 1) % config.num_batches_regularization() == 0);
   }
@@ -125,7 +125,7 @@ protected:
 
 // Regularized Loss function with Sparse support
 class OptimizerWithRegularizerSparse : public OptimizerWithRegularizer {
-public:
+ public:
   OptimizerWithRegularizerSparse(const OptimizationConfig& optConfig,
                                  ParameterOptimizer* optimizer,
                                  Regularizer* regularizer)
@@ -145,7 +145,7 @@ public:
     t0Vec_.assign(t0Vec_.size(), 0);
   }
 
-protected:
+ protected:
   /**
    *  t0Vec_ are last occur time of i rows
    *  if one block is update by multi threads,
diff --git a/paddle/parameter/Parameter.h b/paddle/parameter/Parameter.h
index 24ac10f3fe5977553332a9a8402d6795577b5ad8..ef519bf35a4f051b4477eb04b5eb2c5f0b5e29e8 100644
--- a/paddle/parameter/Parameter.h
+++ b/paddle/parameter/Parameter.h
@@ -58,7 +58,7 @@ class Parameter;
 typedef std::shared_ptr<Parameter> ParameterPtr;
 
 class Parameter {
-public:
+ public:
   Parameter(const ParameterConfig& config, bool useGpu, bool doInit = true);
   const std::string& getName() const { return config_.name(); }
 
@@ -311,7 +311,7 @@ public:
     }
   }
 
-protected:
+ protected:
   /**
    * @brief create matrix to matType.
    *
@@ -326,7 +326,7 @@ protected:
 
   void clearUpdate() { updateCounter_ = 0; }
 
-protected:
+ protected:
   ParameterConfig config_;
 
   bool useGpu_;
@@ -363,7 +363,7 @@ protected:
 
   std::vector<std::shared_ptr<IParameterUpdaterHook>> updaterHooks_;
 
-public:
+ public:
   void setSharedCount(int cnt) { sharedCount_ = cnt; }
   int getSharedCount() { return sharedCount_; }
 
diff --git a/paddle/parameter/ParameterOptimizer.h b/paddle/parameter/ParameterOptimizer.h
index a8d0ca72f21d04e0e65a9dd6a07e8f53b23e4223..019afa1358ae255fd096e84e5eb1d7b0b9d6859f 100644
--- a/paddle/parameter/ParameterOptimizer.h
+++ b/paddle/parameter/ParameterOptimizer.h
@@ -30,12 +30,12 @@ namespace paddle {
  *    may be called many times, should be no state change between calls.
  */
 class ParameterOptimizer {
-public:
+ public:
   typedef std::function<void(
       const VectorPtr vecs[], const ParameterConfig& config, size_t sparseId)>
       TraverseCallback;
 
-public:
+ public:
   explicit ParameterOptimizer(const OptimizationConfig& optConfig)
       : applyDecay_(true),
         optConfig_(optConfig),
@@ -175,7 +175,7 @@ public:
   static ParameterOptimizer* create(const OptimizationConfig& optConfig,
                                     bool inPserver = false);
 
-protected:
+ protected:
   typedef std::vector<ParameterOptimizer::TraverseCallback> TraverseCallbackVec;
 
   static TraverseCallback composeCallbacks(
diff --git a/paddle/parameter/ParameterUpdaterBase.h b/paddle/parameter/ParameterUpdaterBase.h
index 717e1c6721b6e4d3ff81172eb06213677c3bff98..493512886cad3ea9b74026d6dfcc4fc90f6aadb9 100644
--- a/paddle/parameter/ParameterUpdaterBase.h
+++ b/paddle/parameter/ParameterUpdaterBase.h
@@ -21,7 +21,7 @@ namespace paddle {
 class ParameterOptimizer;
 
 class ParameterUpdater {
-public:
+ public:
   ParameterUpdater() : parameterTypes_{PARAMETER_VALUE, PARAMETER_GRADIENT} {}
   virtual ~ParameterUpdater() {}
 
@@ -89,7 +89,7 @@ public:
   virtual void setForwardbackwardTime(uint64_t delta) {}
 #endif
 
-protected:
+ protected:
   virtual void updateImpl(Parameter* para) = 0;
 
   std::vector<ParameterType> parameterTypes_;
@@ -101,7 +101,7 @@ protected:
 // part of all Parameters. It's useful when we need different
 // update strategy for different Parameter.
 class ParameterUpdaterComposite : public ParameterUpdater {
-public:
+ public:
   ParameterUpdaterComposite() {}
   virtual ~ParameterUpdaterComposite() {}
 
@@ -173,7 +173,7 @@ public:
         [&](int tid, size_t numThreads) { updaters_[tid]->restore(); });
   }
 
-protected:
+ protected:
   virtual void updateImpl(Parameter* para) {}
   std::vector<std::unique_ptr<ParameterUpdater>> updaters_;
   std::unique_ptr<SyncThreadPool> syncThreadPool_;
diff --git a/paddle/parameter/ParameterUpdaterHook.cpp b/paddle/parameter/ParameterUpdaterHook.cpp
index e6aec3c34820764b3515f47f13a432961de1a673..989185b66a5b7785bb0572fba59a72adeef9797b 100644
--- a/paddle/parameter/ParameterUpdaterHook.cpp
+++ b/paddle/parameter/ParameterUpdaterHook.cpp
@@ -37,7 +37,7 @@ namespace paddle {
  */
 
 class StaticPruningHook : public IParameterUpdaterHook {
-public:
+ public:
   explicit StaticPruningHook(const ParameterUpdaterHookConfig &hookConfig)
       : initCount_(0) {
     sparsityRatio_ = hookConfig.sparsity_ratio();
@@ -96,7 +96,7 @@ public:
     paraVec->dotMul(*maskVec_);
   }
 
-private:
+ private:
   SameThreadChecker updateThreadChecker_;
   std::atomic<size_t> initCount_;
   VectorPtr maskVec_;
@@ -116,12 +116,12 @@ IParameterUpdaterHook::~IParameterUpdaterHook() {}
  * May be extracted to Util.h to unify the hasher.
  */
 class StringIntPairHasher {
-public:
+ public:
   size_t operator()(const std::pair<std::string, int> &k) const {
     return intHasher_(strHasher_(k.first) + k.second);
   }
 
-private:
+ private:
   std::hash<std::string> strHasher_;
   std::hash<int> intHasher_;
 };
diff --git a/paddle/parameter/ParameterUpdaterHook.h b/paddle/parameter/ParameterUpdaterHook.h
index d30530ec393c097bf77e5e376e3c4dc84b321ed8..cb96e4cf007572e9688c11719017a9d2771ecd51 100644
--- a/paddle/parameter/ParameterUpdaterHook.h
+++ b/paddle/parameter/ParameterUpdaterHook.h
@@ -29,7 +29,7 @@ class Parameter;
  * parameter optimization.
  */
 class IParameterUpdaterHook {
-public:
+ public:
   virtual ~IParameterUpdaterHook();
 
   /**
@@ -53,7 +53,7 @@ public:
    */
   virtual void init(Parameter* para) = 0;
 
-protected:
+ protected:
   /**
    * Ctor.
    */
diff --git a/paddle/parameter/Regularizer.h b/paddle/parameter/Regularizer.h
index 6bed7b0ddfe7b72c697af60f5243f9037999d54a..fa5384e23251b918cc914df36c16ad790a5c59c5 100644
--- a/paddle/parameter/Regularizer.h
+++ b/paddle/parameter/Regularizer.h
@@ -20,7 +20,7 @@ namespace paddle {
 
 // Regularizer function for parameter, e.g. L1/L2
 class Regularizer {
-public:
+ public:
   virtual void update(const VectorPtr vecs[],
                       const ParameterConfig& paraConfig,
                       real learningRate,  // learningrate from optimizer
diff --git a/paddle/parameter/Weight.h b/paddle/parameter/Weight.h
index 7314c29d0db92db06d5b921c09de39d3b0029ef3..113dd6530c82fe1e831ad4a35e9cbcb9880b9243 100644
--- a/paddle/parameter/Weight.h
+++ b/paddle/parameter/Weight.h
@@ -23,12 +23,12 @@ limitations under the License. */
 namespace paddle {
 
 class Weight {
-private:
+ private:
   MatrixPtr weight_;
   MatrixPtr weightGrad_;
   ParameterPtr parameter_;
 
-public:
+ public:
   Weight(size_t height, size_t width, ParameterPtr parameter);
   Weight(size_t height, size_t width, ParameterPtr parameter, size_t offset);
 
diff --git a/paddle/parameter/tests/test_common.cpp b/paddle/parameter/tests/test_common.cpp
index 6e10becabbbbb8861095fed5aab9ac1e05bcac91..89dcc6c751eb2ec07bfe8297c93d56c824086211 100644
--- a/paddle/parameter/tests/test_common.cpp
+++ b/paddle/parameter/tests/test_common.cpp
@@ -24,7 +24,7 @@ limitations under the License. */
 using namespace paddle;  // NOLINT
 
 class CommonTest : public ::testing::Test {
-protected:
+ protected:
   CommonTest() : testStat_("test") {}
   virtual ~CommonTest() {}
   virtual void SetUp() {
@@ -51,7 +51,7 @@ protected:
 
   virtual void TreaDown() { LOG(INFO) << "All Test Finished."; }
 
-protected:
+ protected:
   std::vector<std::pair<real, real>> valueUint_;
   std::vector<size_t> sizeVec_;
   real learningRate_;
diff --git a/paddle/pserver/BaseClient.h b/paddle/pserver/BaseClient.h
index a932d34712f56de1cbbf84a9db4476f862febca0..d50230e73a3a7d128cbfd1d70517fddd228fb1bb 100644
--- a/paddle/pserver/BaseClient.h
+++ b/paddle/pserver/BaseClient.h
@@ -32,7 +32,7 @@ namespace paddle {
  * connections.
  */
 class BaseClient {
-protected:
+ protected:
   typedef std::unique_ptr<std::thread> ThreadPtr;
   typedef std::vector<std::vector<iovec>> InputIovs;
   typedef std::vector<SendParameterRequest> SendRequest;
@@ -49,7 +49,7 @@ protected:
     SendDataRequestVec parallelDataRequests;
   };
 
-public:
+ public:
   explicit BaseClient(bool separate = false, int numPorts = FLAGS_ports_num);
 
   virtual ~BaseClient();
@@ -141,7 +141,7 @@ public:
     return dataType;
   }
 
-protected:
+ protected:
   /// for a > 0, b > 0:
   /// return the smallest x s.t. b*x >= a
   static int divup(int a, int b) { return (a + b - 1) / b; }
@@ -264,7 +264,7 @@ protected:
    */
   virtual void recv(int threadId) = 0;
 
-protected:
+ protected:
   bool stopping_;
   /// nodes * ports that means the number of real pservers
   int serviceNum_;
diff --git a/paddle/pserver/CMakeLists.txt b/paddle/pserver/CMakeLists.txt
index f75475a88f7224ee3889827795088c8aa920b63b..0ae9c6ef6afc6ec5a99a685b08883def0db51cf1 100644
--- a/paddle/pserver/CMakeLists.txt
+++ b/paddle/pserver/CMakeLists.txt
@@ -14,9 +14,6 @@ set(NETWORK_HEADERS
 add_library(paddle_network STATIC
     ${NETWORK_SOURCES})
 
-add_style_check_target(paddle_network ${NETWORK_SOURCES})
-add_style_check_target(paddle_network ${NETWORK_HEADERS})
-
 add_dependencies(paddle_network paddle_proto ${external_project_dependencies})
 
 ################### paddle_pserver ######################
@@ -37,9 +34,6 @@ set(PSERVER_HEADERS
 add_library(paddle_pserver STATIC
     ${PSERVER_SOURCES})
 
-add_style_check_target(paddle_pserver ${PSERVER_SOURCES})
-add_style_check_target(paddle_pserver ${PSERVER_HEADERS})
-
 add_dependencies(paddle_pserver paddle_proto ${external_project_dependencies})
 
 set(PSERVER_MAIN_SOURCES
diff --git a/paddle/pserver/LightNetwork.h b/paddle/pserver/LightNetwork.h
index 2aaa26a5c708f9c01f006136619f599bcfe0db71..bcfc9655e989e80e08e9dce9b8734c0643cbf661 100644
--- a/paddle/pserver/LightNetwork.h
+++ b/paddle/pserver/LightNetwork.h
@@ -41,7 +41,7 @@ class SocketServer : public Thread {
   // rdmaCpu controls the cpu affinity of RDMA server daemon,
   // which could benifit performance. rdmaCpu = -1 means TCP
   // is used instead of RDMA transport.
-public:
+ public:
   SocketServer(const std::string& addr, int port, int rdmaCpu);
   ~SocketServer();
 
@@ -50,7 +50,7 @@ public:
   typedef std::function<void(const std::vector<iovec>& outputIovs)>
       ResponseCallback;
 
-protected:
+ protected:
   //
   // The derived class needs to implement this function
   // to handle the request received by SocketWorker
@@ -70,13 +70,13 @@ protected:
 
   friend class SocketWorker;
 
-private:
+ private:
   void rdmaServer();
   void tcpServer();
 
   void detach() {}  // detach accept thread is forbidden
 
-protected:
+ protected:
   enum ChannelType tcpRdma_;
   // for rdma
   int rdmaCpu_;
@@ -96,7 +96,7 @@ protected:
  * @note  all parameter processing will run in the context of this worker
  */
 class SocketWorker : public Thread {
-public:
+ public:
   SocketWorker(std::unique_ptr<SocketChannel>&& channel, SocketServer* server)
       : channel_(std::move(channel)), server_(server) {}
 
@@ -104,7 +104,7 @@ public:
 
   virtual void run();
 
-protected:
+ protected:
   std::unique_ptr<SocketChannel> channel_;
   SocketServer* server_;
   enum ChannelType tcpRdma_;
@@ -118,12 +118,12 @@ protected:
  *        single cpu core for better load balance performance
  */
 class RdmaClientDaemons {
-private:
+ private:
   RdmaClientDaemons();
 
   static std::unique_ptr<RdmaClientDaemons> daemons_;
 
-public:
+ public:
   static RdmaClientDaemons* get() {
     std::call_once(RdmaClientDaemons::initDataFlag_,
                    &RdmaClientDaemons::getInstance);
@@ -141,10 +141,10 @@ public:
 
   ~RdmaClientDaemons();
 
-public:
+ public:
   friend class SocketClient;
 
-private:
+ private:
   static std::once_flag initDataFlag_;
   static void getInstance() {
     if (!daemons_.get()) daemons_.reset(new RdmaClientDaemons());
@@ -162,19 +162,19 @@ private:
  *        read data
  */
 class SocketClient {
-public:
+ public:
   SocketClient(const std::string& serverAddr,
                int serverPort,
                enum ChannelType channelType);
 
   SocketChannel* getChannel() { return channel_.get(); }
 
-protected:
+ protected:
   std::unique_ptr<SocketChannel> channel_;
   struct sxi_socket* socketDaemon_;
   enum ChannelType tcpRdma_;
 
-private:
+ private:
   void RdmaClient(const std::string& serverAddr, int serverPort);
   void TcpClient(const std::string& serverAddr, int serverPort);
 };
diff --git a/paddle/pserver/ParameterClient2.h b/paddle/pserver/ParameterClient2.h
index d63273ccbc8ed30d9df50d9f8b1a4d1e4fba6720..c96bb787151a525556c8217629109de201762cff 100644
--- a/paddle/pserver/ParameterClient2.h
+++ b/paddle/pserver/ParameterClient2.h
@@ -50,11 +50,11 @@ struct PServerVector {
  * @brief A class to help to prepare server-side operations.
  */
 class PreparedOperations {
-protected:
+ protected:
   class ResultsAdder;
   struct LocalOperationResult;
 
-public:
+ public:
   /**
    * Offers an easy way to prepare operations that will be performed on
    * server-side.
@@ -93,7 +93,7 @@ public:
     return ResultsAdder(&localResults_.back());
   }
 
-protected:
+ protected:
   void addOperationHelper(Operation* op) {}
 
   /**
@@ -151,7 +151,7 @@ protected:
    * @brief ResultsAdder offers easy ways to quickly store operation results.
    */
   class ResultsAdder {
-  public:
+   public:
     explicit ResultsAdder(LocalOperationResult* localResult)
         : localResult_(localResult) {}
     template <typename... Args>
@@ -172,11 +172,11 @@ protected:
       addResult(args...);
     }
 
-  protected:
+   protected:
     LocalOperationResult* localResult_;
   };
 
-protected:
+ protected:
   DoOperationRequest request_;
   std::vector<iovec> inputIovs_;
   struct LocalOperationResult {
@@ -214,7 +214,7 @@ struct ParameterSegments {
  * waiting until all parameters are received to CPU host end.
  */
 class ParameterClient2 : public BaseClient {
-public:
+ public:
   /** Constructor.
    * @param separate True if sending and recieving activities are separated
    *                 into 2 threads, otherwise false.
@@ -232,7 +232,7 @@ public:
   static int calcParameterBlockSize(const std::vector<ParameterPtr>& parameters,
                                     size_t serviceNum);
 
-public:
+ public:
   bool init(const std::vector<ParameterPtr>& parameters);
 
   /// service functions
@@ -514,7 +514,7 @@ public:
   void setForwardbackwardTime(uint64_t delta) { forwardbackwordTime_ = delta; }
 #endif
 
-protected:
+ protected:
   template <typename ProtoIn, typename ProtoOut>
   void multiCall(const char* funcName,
                  const ProtoIn& request,
@@ -529,7 +529,7 @@ protected:
     }
   }
 
-private:
+ private:
   void destroy();
 
   /**
@@ -573,7 +573,7 @@ private:
   /// start necessary threads for threadPool
   void initThreads();
 
-protected:
+ protected:
   /// start port number of pserver
   /// it deduce all ports for dense and sparse with some rules
   int port_;
diff --git a/paddle/pserver/ParameterServer2.h b/paddle/pserver/ParameterServer2.h
index 3ed06b6b045802bcfd48bcff6bd0c1b34e9bbb86..0b8ef5c170c01ec8a5d53f01db9888f82ca68eec 100644
--- a/paddle/pserver/ParameterServer2.h
+++ b/paddle/pserver/ParameterServer2.h
@@ -71,7 +71,7 @@ namespace paddle {
  * to prevent from being polluted.
  */
 class ParameterServer2 : public ProtoServer {
-protected:
+ protected:
   /// parameter_ mutex.
   RWLock parameterMutex_;
 
@@ -169,7 +169,7 @@ protected:
   template <typename T, size_t AlignBytes>
   class ReadWriteBuffer
       : public std::vector<T, AlignedAllocator<T, AlignBytes>> {
-  public:
+   public:
     static_assert(sizeof(T) % AlignBytes == 0 || AlignBytes % sizeof(T) == 0,
                   "Type T must be able to aligned.");
 
@@ -229,7 +229,7 @@ protected:
       return r;
     }
 
-  private:
+   private:
     size_t curOffset_;
   };
 
@@ -298,17 +298,17 @@ protected:
   /// barrier performance tuning sync-sgd required
   std::atomic<int64_t> batchId_;
 
-public:
+ public:
   struct Buffer {
     real* base;
     size_t size;
   };
 
-protected:
+ protected:
   /// async gradient commit control
   bool asyncGrdientCommitCheckAndStat(const SendParameterRequest& request);
 
-public:
+ public:
   /// disable default parameter for overloading
   /// @rdmaCpu:the id of cpu core hosting RDMA server(0-N)
   /// -1 means using TCP transport instead of RDMA
@@ -437,7 +437,7 @@ public:
   void saveValueVector(const SaveValueRequest& request,
                        ProtoResponseCallback callback);
 
-public:
+ public:
   /**
    * @brief initialize parameter server
    */
@@ -512,7 +512,7 @@ public:
                           SendParameterResponse* response,
                           std::vector<Buffer>* outputBuffers);
 
-protected:
+ protected:
   void mergeSegments(BlockSegments* segments);
 
   /// set the unused segments to zero
@@ -641,7 +641,7 @@ protected:
                      const VectorPtr vecs[],
                      const ParameterOptimizer::TraverseCallback& callback);
 
-public:
+ public:
   typedef void (ParameterServer2::*OperatorFunction)(const Operation& operation,
                                                      OperationResult* result);
 
diff --git a/paddle/pserver/ParameterServerController.h b/paddle/pserver/ParameterServerController.h
index 3a9bc74edf240a12fe1f7bd266f0311555349311..1308d62fb1787f19123fe37d49f8e14039c5a39a 100644
--- a/paddle/pserver/ParameterServerController.h
+++ b/paddle/pserver/ParameterServerController.h
@@ -28,7 +28,7 @@ namespace paddle {
  * by gflags or proto.
  */
 class ParameterServerController final {
-public:
+ public:
   DISABLE_COPY(ParameterServerController);
 
   /**
@@ -67,7 +67,7 @@ public:
    */
   void wait();
 
-private:
+ private:
   std::vector<std::unique_ptr<ParameterServer2>> parameterServers_;
 };
 
diff --git a/paddle/pserver/ProtoServer.h b/paddle/pserver/ProtoServer.h
index 3f78799dbfe1d4b80249e8cb27f269e6358903dd..2943867de5885ab1af1aa0f69e93a931092b28e3 100644
--- a/paddle/pserver/ProtoServer.h
+++ b/paddle/pserver/ProtoServer.h
@@ -34,7 +34,7 @@ namespace paddle {
  * for single NIC hardward with --port=N(N>1) for small cluster job.
  */
 class ProtoServer : public SocketServer {
-public:
+ public:
   /// rdmaCpu controls the cpu affinity of RDMA server daemon,
   /// which could benifit performance. rdmaCpu = -1 means TCP
   /// is used instead of RDMA transport.
@@ -87,7 +87,7 @@ public:
                          std::unique_ptr<MsgReader> msgReader,
                          ProtoResponseCallbackEx callback)> func);
 
-protected:
+ protected:
   /**
    * @brief handle rpc request
    * @param[in] msgReader  Message reader for reading data from connection
@@ -111,7 +111,7 @@ protected:
   void registerServiceFunctionImp(const std::string& funcName,
                                   ServiceFunction func);
 
-protected:
+ protected:
   /// Tuning bare network overhead: the beginning of receiving request
   ThreadLocal<struct timeval> handleRequestBegin_;
 
@@ -120,7 +120,7 @@ protected:
 };
 
 class ProtoClient : public SocketClient {
-public:
+ public:
   ProtoClient(const std::string& serverAddr,
               int serverPort,
               enum ChannelType channelType = F_TCP)
diff --git a/paddle/pserver/SocketChannel.h b/paddle/pserver/SocketChannel.h
index c0f30d0db760045a8c0cb001fcadaae8f0c03f9d..8b45ac56090ef82e77514566e7df6b366958655e 100644
--- a/paddle/pserver/SocketChannel.h
+++ b/paddle/pserver/SocketChannel.h
@@ -33,7 +33,7 @@ enum ChannelType {
 
 /// reading a set of blocks of data from SocketChannel.
 class MsgReader {
-public:
+ public:
   MsgReader(SocketChannel* channel, size_t numIovs);
   ~MsgReader() {
     /// ensure all data blocks have been processed
@@ -75,7 +75,7 @@ public:
   void readBlocks(const std::vector<void*>& bufs);
   void readNextBlock(void* buf);
 
-protected:
+ protected:
   SocketChannel* channel_;
   std::vector<size_t> blockLengths_;
   size_t currentBlockIndex_;
@@ -84,7 +84,7 @@ protected:
 /// APIs for reading and writing byte stream data or naive iov data
 /// from the APIs both RDMA and TCP exhibits byte stream style
 class SocketChannel {
-public:
+ public:
   SocketChannel(int socket, const std::string& peerName)
       : tcpSocket_(socket), peerName_(peerName) {
     tcpRdma_ = F_TCP;
@@ -137,7 +137,7 @@ public:
   /// return null to indicate socket is closed
   std::unique_ptr<MsgReader> readMessage();
 
-protected:
+ protected:
   struct MessageHeader {
     int64_t totalLength;  /// include the header
     int64_t numIovs;
diff --git a/paddle/pserver/SparseParameterDistribution.h b/paddle/pserver/SparseParameterDistribution.h
index 13f199548d56262e77e91e45052f3e435dea407c..e168f36c75e9452fff547f139a67a553cc6b796a 100644
--- a/paddle/pserver/SparseParameterDistribution.h
+++ b/paddle/pserver/SparseParameterDistribution.h
@@ -31,7 +31,7 @@ namespace paddle {
  * if unbalanced distribution exhibts by default.
  */
 class SparseParameterDistribution {
-public:
+ public:
   /// serviceNum means the number of ParameterServers
   explicit SparseParameterDistribution(size_t serviceNum);
   ~SparseParameterDistribution() {}
@@ -39,7 +39,7 @@ public:
   void probeDistribution(int serverId, size_t data);
   void checkAndResetDistribution();
 
-private:
+ private:
   std::vector<size_t> data_;
   std::atomic<size_t> totBytes_;
 
diff --git a/paddle/pserver/test/SocketTest.cpp b/paddle/pserver/test/SocketTest.cpp
index 6019dccaadf7fab5a1db7183c07cbbd9562dab2e..206cd17c379f529579c103893cfb492524bc6f8d 100644
--- a/paddle/pserver/test/SocketTest.cpp
+++ b/paddle/pserver/test/SocketTest.cpp
@@ -30,12 +30,12 @@ struct MessageHeader {
 };
 
 class Thread {
-public:
+ public:
   void start();
   virtual void run() = 0;
   virtual ~Thread() {}
 
-protected:
+ protected:
   std::unique_ptr<std::thread> thread_;
 };
 
@@ -44,13 +44,13 @@ void Thread::start() {
 }
 
 class SocketChannel {
-public:
+ public:
   explicit SocketChannel(int socket) : socket_(socket) {}
   int getSocketFd() const { return socket_; }
   uint64_t readAll(void* buf, size_t size);
   uint64_t writeAll(const void* buf, size_t size);
 
-protected:
+ protected:
   int socket_;
 };
 
@@ -79,7 +79,7 @@ uint64_t SocketChannel::writeAll(const void* buf, size_t size) {
 }
 
 class SocketWorker : public Thread {
-public:
+ public:
   explicit SocketWorker(int socket) : channel_(socket) {}
   virtual void run();
 
@@ -88,19 +88,19 @@ public:
 
   // write n bytes
 
-protected:
+ protected:
   SocketChannel channel_;
   std::string buffer_;
 };
 
 class SocketServer : public Thread {
-public:
+ public:
   explicit SocketServer(int port)
       : port_(port), socket_(0), maxPendingConnections_(100) {}
 
   virtual void run();
 
-protected:
+ protected:
   int port_;
   int socket_;
   int maxPendingConnections_;
@@ -161,11 +161,11 @@ void SocketWorker::run() {
 }
 
 class SocketClient {
-public:
+ public:
   SocketClient(const std::string& serverAddr, int serverPort);
   SocketChannel* getChannel() const { return channel_.get(); }
 
-protected:
+ protected:
   std::unique_ptr<SocketChannel> channel_;
 };
 
diff --git a/paddle/pserver/test/test_ParameterServer2.cpp b/paddle/pserver/test/test_ParameterServer2.cpp
index e742cd0871da865e02a60a125a936eea8f15e575..01d179258dffaf996a57022801ee3bd60a268f77 100644
--- a/paddle/pserver/test/test_ParameterServer2.cpp
+++ b/paddle/pserver/test/test_ParameterServer2.cpp
@@ -26,7 +26,7 @@ DEFINE_string(server_addr, "127.0.0.1", "assign server address");
 DEFINE_int32(server_cpu, 0, "assign server cpu");
 
 class ParameterServer2Tester : public ParameterServer2 {
-public:
+ public:
   ParameterServer2Tester(std::string serverAddr,
                          int port,
                          int rdmaCpu = -1,
@@ -88,7 +88,7 @@ public:
   void waitPassFinishTest();
   void synchronizeTest();
 
-protected:
+ protected:
   ParameterClient2 client_;
   vector<ParameterConfig> clientConfigs_;
   vector<ParameterPtr> parameters_;
diff --git a/paddle/pserver/test/test_ProtoServer.cpp b/paddle/pserver/test/test_ProtoServer.cpp
index d68a8d2180cc3081346106132799498f6dc3fa20..a66b14a1cc58d11988e4936a9c35d98b8bf5edc1 100644
--- a/paddle/pserver/test/test_ProtoServer.cpp
+++ b/paddle/pserver/test/test_ProtoServer.cpp
@@ -28,7 +28,7 @@ DEFINE_bool(benchmark, false, "Do benchmark. Skip some tests");
 using namespace paddle;  // NOLINT
 
 class MyServer : public ProtoServer {
-public:
+ public:
   explicit MyServer(int port, int rdmaCpu = -1)
       : ProtoServer(FLAGS_server_addr, port, rdmaCpu),
         status_(PSERVER_STATUS_NOT_SET) {
@@ -62,7 +62,7 @@ public:
     callback(response);
   }
 
-protected:
+ protected:
   PServerStatus status_;
   std::string buffer_;
 };
diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh
deleted file mode 100755
index 7e00bd38487902227c3b4521db20cdbe314059be..0000000000000000000000000000000000000000
--- a/paddle/scripts/docker/build.sh
+++ /dev/null
@@ -1,259 +0,0 @@
-#!/bin/bash
-
-function cmake_gen() {
-    mkdir -p /paddle/build
-    cd /paddle/build
-
-    # build script will not fail if *.deb does not exist
-    rm *.deb 2>/dev/null || true
-    # delete previous built whl packages
-    rm -rf /paddle/paddle/dist 2>/dev/null || true
-
-    # Support build for all python versions, currently
-    # including cp27-cp27m and cp27-cp27mu.
-    PYTHON_FLAGS=""
-    if [ "$1" != "" ]; then
-        echo "using python abi: $1"
-        if [ "$1" == "cp27-cp27m" ]; then
-            export LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs2/lib:${LD_LIBRARY_PATH#/opt/_internal/cpython-2.7.11-ucs4/lib:}
-            export PATH=/opt/python/cp27-cp27m/bin/:${PATH}
-            PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/python/cp27-cp27m/bin/python
-        -DPYTHON_INCLUDE_DIR:PATH=/opt/python/cp27-cp27m/include/python2.7
-        -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-2.7.11-ucs2/lib/libpython2.7.so"
-        elif [ "$1" == "cp27-cp27mu" ]; then
-            export LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs4/lib:${LD_LIBRARY_PATH#/opt/_internal/cpython-2.7.11-ucs2/lib:}
-            export PATH=/opt/python/cp27-cp27mu/bin/:${PATH}
-            PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/python/cp27-cp27mu/bin/python
-        -DPYTHON_INCLUDE_DIR:PATH=/opt/python/cp27-cp27mu/include/python2.7
-        -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-2.7.11-ucs4/lib/libpython2.7.so"
-        fi
-    fi
-
-    cat <<EOF
-    ========================================
-    Configuring cmake in /paddle/build ...
-        -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:-Release}
-        ${PYTHON_FLAGS}
-        -DWITH_DSO=ON
-        -DWITH_DOC=${WITH_DOC:-OFF}
-        -DWITH_GPU=${WITH_GPU:-OFF}
-        -DWITH_AMD_GPU=${WITH_AMD_GPU:-OFF}
-        -DWITH_DISTRIBUTE=${WITH_DISTRIBUTE:-OFF}
-        -DWITH_MKL=${WITH_MKL:-ON}
-        -DWITH_AVX=${WITH_AVX:-OFF}
-        -DWITH_GOLANG=${WITH_GOLANG:-OFF}
-        -DCUDA_ARCH_NAME=${CUDA_ARCH_NAME:-All}
-        -DWITH_SWIG_PY=ON
-        -DWITH_C_API=${WITH_C_API:-OFF}
-        -DWITH_PYTHON=${WITH_PYTHON:-ON}
-        -DWITH_SWIG_PY=${WITH_SWIG_PY:-ON}
-        -DCUDNN_ROOT=/usr/
-        -DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-ON}
-        -DWITH_TESTING=${WITH_TESTING:-ON}
-        -DWITH_FAST_BUNDLE_TEST=ON
-        -DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake
-        -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
-        -DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF}
-    ========================================
-EOF
-    # Disable UNITTEST_USE_VIRTUALENV in docker because
-    # docker environment is fully controlled by this script.
-    # See /Paddle/CMakeLists.txt, UNITTEST_USE_VIRTUALENV option.
-    cmake .. \
-        -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:-Release} \
-        ${PYTHON_FLAGS} \
-        -DWITH_DSO=ON \
-        -DWITH_DOC=${WITH_DOC:-OFF} \
-        -DWITH_GPU=${WITH_GPU:-OFF} \
-        -DWITH_AMD_GPU=${WITH_AMD_GPU:-OFF} \
-        -DWITH_DISTRIBUTE=${WITH_DISTRIBUTE:-OFF} \
-        -DWITH_MKL=${WITH_MKL:-ON} \
-        -DWITH_AVX=${WITH_AVX:-OFF} \
-        -DWITH_GOLANG=${WITH_GOLANG:-OFF} \
-        -DCUDA_ARCH_NAME=${CUDA_ARCH_NAME:-All} \
-        -DWITH_SWIG_PY=${WITH_SWIG_PY:-ON} \
-        -DWITH_C_API=${WITH_C_API:-OFF} \
-        -DWITH_PYTHON=${WITH_PYTHON:-ON} \
-        -DCUDNN_ROOT=/usr/ \
-        -DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-ON} \
-        -DWITH_TESTING=${WITH_TESTING:-ON} \
-        -DWITH_FAST_BUNDLE_TEST=ON \
-        -DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake \
-        -DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF} \
-        -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
-}
-
-function run_build() {
-    cat <<EOF
-    ============================================
-    Building in /paddle/build ...
-    ============================================
-EOF
-    make clean
-    make -j `nproc`
-}
-
-function run_test() {
-    if [ ${WITH_TESTING:-ON} == "ON" ] && [ ${RUN_TEST:-OFF} == "ON" ] ; then
-    cat <<EOF
-    ========================================
-    Running unit tests ...
-    ========================================
-EOF
-        ctest --output-on-failure
-        # make install should also be test when unittest
-        make install -j `nproc`
-        pip install /usr/local/opt/paddle/share/wheels/*.whl
-        if [[ ${WITH_FLUID_ONLY:-OFF} == "OFF" ]] ; then
-            paddle version
-        fi
-    fi
-}
-
-
-function gen_docs() {
-    if [[ ${WITH_DOC:-OFF} == "ON" ]]; then
-        cat <<EOF
-    ========================================
-    Building documentation ...
-    In /paddle/build_doc
-    ========================================
-EOF
-        mkdir -p /paddle/build_doc
-        pushd /paddle/build_doc
-        cmake .. \
-            -DWITH_DOC=ON \
-            -DWITH_GPU=OFF \
-            -DWITH_AVX=${WITH_AVX:-ON} \
-            -DWITH_SWIG_PY=ON \
-            -DWITH_STYLE_CHECK=OFF
-
-        make -j `nproc` paddle_docs paddle_apis
-        popd
-    fi
-
-
-    if [[ ${WOBOQ:-OFF} == 'ON' ]]; then
-        cat <<EOF
-    ========================================
-    Converting C++ source code into HTML ...
-    ========================================
-EOF
-        export WOBOQ_OUT=/paddle/build/woboq_out
-        mkdir -p $WOBOQ_OUT
-        cp -rv /woboq/data $WOBOQ_OUT/../data
-        /woboq/generator/codebrowser_generator \
-            -b /paddle/build \
-            -a \
-            -o $WOBOQ_OUT \
-            -p paddle:/paddle
-        /woboq/indexgenerator/codebrowser_indexgenerator $WOBOQ_OUT
-    fi
-}
-
-
-function gen_dockerfile() {
-    # Set BASE_IMAGE according to env variables
-    if [[ ${WITH_GPU} == "ON" ]]; then
-    BASE_IMAGE="nvidia/cuda:8.0-cudnn7-devel-ubuntu16.04"
-    else
-    BASE_IMAGE="ubuntu:16.04"
-    fi
-
-    DOCKERFILE_GPU_ENV=""
-    DOCKERFILE_CUDNN_DSO=""
-    if [[ ${WITH_GPU:-OFF} == 'ON' ]]; then
-        DOCKERFILE_GPU_ENV="ENV LD_LIBRARY_PATH /usr/lib/x86_64-linux-gnu:\${LD_LIBRARY_PATH}"
-        DOCKERFILE_CUDNN_DSO="RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.7 /usr/lib/x86_64-linux-gnu/libcudnn.so"
-    fi
-
-    cat <<EOF
-    ========================================
-    Generate /paddle/build/Dockerfile ...
-    ========================================
-EOF
-
-    cat > /paddle/build/Dockerfile <<EOF
-    FROM ${BASE_IMAGE}
-    MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
-    ENV HOME /root
-EOF
-
-    if [[ ${WITH_GPU} == "ON"  ]]; then
-        NCCL_DEPS="apt-get install -y libnccl2=2.1.2-1+cuda8.0 libnccl-dev=2.1.2-1+cuda8.0 &&"
-    else
-        NCCL_DEPS=""
-    fi
-
-    if [[ ${WITH_FLUID_ONLY:-OFF} == "OFF" ]]; then
-        PADDLE_VERSION="paddle version"
-        CMD='"paddle", "version"'
-    else
-        PADDLE_VERSION="true"
-        CMD='"true"'
-    fi
-
-    cat >> /paddle/build/Dockerfile <<EOF
-    ADD python/dist/*.whl /
-    # run paddle version to install python packages first
-    RUN apt-get update &&\
-        ${NCCL_DEPS}\
-        apt-get install -y wget python-pip dmidecode python-tk && pip install -U pip==9.0.3 && \
-        pip install /*.whl; apt-get install -f -y && \
-        apt-get clean -y && \
-        rm -f /*.whl && \
-        ${PADDLE_VERSION} && \
-        ldconfig
-    ${DOCKERFILE_CUDNN_DSO}
-    ${DOCKERFILE_GPU_ENV}
-    ENV NCCL_LAUNCH_MODE PARALLEL
-EOF
-    if [[ ${WITH_GOLANG:-OFF} == "ON" ]]; then
-        cat >> /paddle/build/Dockerfile <<EOF
-        ADD go/cmd/pserver/pserver /usr/bin/
-        ADD go/cmd/master/master /usr/bin/
-EOF
-    fi
-    cat >> /paddle/build/Dockerfile <<EOF
-    # default command shows the paddle version and exit
-    CMD [${CMD}]
-EOF
-}
-
-function gen_capi_package() {
-  if [[ ${WITH_C_API} == "ON" ]]; then
-    install_prefix="/paddle/build/capi_output"
-    rm -rf $install_prefix
-    make DESTDIR="$install_prefix" install
-    cd $install_prefix/usr/local
-    ls | egrep -v "^Found.*item$" | xargs tar -cf /paddle/build/paddle.tgz
-  fi
-}
-
-function gen_fluid_inference_lib() {
-    if [ ${WITH_C_API:-OFF} == "OFF" ] ; then
-    cat <<EOF
-    ========================================
-    Deploying fluid inference library ...
-    ========================================
-EOF
-        make -j `nproc` inference_lib_dist
-    fi
-}
-
-set -xe
-
-cmake_gen ${PYTHON_ABI:-""}
-run_build
-run_test
-gen_docs
-gen_dockerfile
-gen_capi_package
-gen_fluid_inference_lib
-
-if [[ ${WITH_C_API:-OFF} == "ON" ]]; then
-  printf "PaddlePaddle C-API libraries was generated on build/paddle.tgz\n"
-else
-  printf "If you need to install PaddlePaddle in develop docker image,"
-  printf "please make install or pip install build/python/dist/*.whl.\n"
-fi
diff --git a/paddle/scripts/docker/build_android.sh b/paddle/scripts/docker/build_android.sh
deleted file mode 100644
index 3d5e775fafb6b94a3429dbf3368a8949bca3d612..0000000000000000000000000000000000000000
--- a/paddle/scripts/docker/build_android.sh
+++ /dev/null
@@ -1,89 +0,0 @@
-#!/bin/bash
-
-set -xe
-
-if [ $ANDROID_ABI == "arm64-v8a" ]; then
-  ANDROID_ARCH=arm64
-  if [ $ANDROID_API -lt 21 ]; then
-    echo "Warning: arm64-v8a requires ANDROID_API >= 21."
-    ANDROID_API=21
-  fi
-else # armeabi, armeabi-v7a
-  ANDROID_ARCH=arm
-fi
-
-ANDROID_STANDALONE_TOOLCHAIN=$ANDROID_TOOLCHAINS_DIR/$ANDROID_ARCH-android-$ANDROID_API
-
-cat <<EOF
-============================================
-Generating the standalone toolchain ...
-${ANDROID_NDK_HOME}/build/tools/make-standalone-toolchain.sh
-      --arch=$ANDROID_ARCH
-      --platform=android-$ANDROID_API
-      --install-dir=${ANDROID_STANDALONE_TOOLCHAIN}
-============================================
-EOF
-${ANDROID_NDK_HOME}/build/tools/make-standalone-toolchain.sh \
-      --arch=$ANDROID_ARCH \
-      --platform=android-$ANDROID_API \
-      --install-dir=$ANDROID_STANDALONE_TOOLCHAIN
-
-BUILD_ROOT=/paddle/build_android
-DEST_ROOT=/paddle/install_android
-
-mkdir -p $BUILD_ROOT
-cd $BUILD_ROOT
-
-if [ $ANDROID_ABI == "armeabi-v7a" ]; then
-  cmake -DCMAKE_SYSTEM_NAME=Android \
-        -DANDROID_STANDALONE_TOOLCHAIN=$ANDROID_STANDALONE_TOOLCHAIN \
-        -DANDROID_ABI=$ANDROID_ABI \
-        -DANDROID_ARM_NEON=ON \
-        -DANDROID_ARM_MODE=ON \
-        -DHOST_C_COMPILER=/usr/bin/gcc \
-        -DHOST_CXX_COMPILER=/usr/bin/g++ \
-        -DCMAKE_INSTALL_PREFIX=$DEST_ROOT \
-        -DCMAKE_BUILD_TYPE=MinSizeRel \
-        -DUSE_EIGEN_FOR_BLAS=ON \
-        -DWITH_C_API=ON \
-        -DWITH_SWIG_PY=OFF \
-        -DWITH_STYLE_CHECK=OFF \
-        ..
-elif [ $ANDROID_ABI == "arm64-v8a" ]; then
-  cmake -DCMAKE_SYSTEM_NAME=Android \
-        -DANDROID_STANDALONE_TOOLCHAIN=$ANDROID_STANDALONE_TOOLCHAIN \
-        -DANDROID_ABI=$ANDROID_ABI \
-        -DANDROID_ARM_MODE=ON \
-        -DHOST_C_COMPILER=/usr/bin/gcc \
-        -DHOST_CXX_COMPILER=/usr/bin/g++ \
-        -DCMAKE_INSTALL_PREFIX=$DEST_ROOT \
-        -DCMAKE_BUILD_TYPE=MinSizeRel \
-        -DUSE_EIGEN_FOR_BLAS=OFF \
-        -DWITH_C_API=ON \
-        -DWITH_SWIG_PY=OFF \
-        -DWITH_STYLE_CHECK=OFF \
-        ..
-elif [ $ANDROID_ABI == "armeabi" ]; then
-  cmake -DCMAKE_SYSTEM_NAME=Android \
-        -DANDROID_STANDALONE_TOOLCHAIN=$ANDROID_STANDALONE_TOOLCHAIN \
-        -DANDROID_ABI=$ANDROID_ABI \
-        -DANDROID_ARM_MODE=ON \
-        -DHOST_C_COMPILER=/usr/bin/gcc \
-        -DHOST_CXX_COMPILER=/usr/bin/g++ \
-        -DCMAKE_INSTALL_PREFIX=$DEST_ROOT \
-        -DCMAKE_BUILD_TYPE=MinSizeRel \
-        -DWITH_C_API=ON \
-        -DWITH_SWIG_PY=OFF \
-        -DWITH_STYLE_CHECK=OFF \
-        ..
-else
-  echo "Invalid ANDROID_ABI: $ANDROID_ABI"
-fi
-
-cat <<EOF
-============================================
-Building in $BUILD_ROOT ...
-============================================
-EOF
-make -j `nproc`
-make install -j `nproc`
diff --git a/paddle/scripts/docker/entrypoint b/paddle/scripts/docker/entrypoint
deleted file mode 100755
index bc194bd909aa308fd5fe920c9319f62a0ec2dac7..0000000000000000000000000000000000000000
--- a/paddle/scripts/docker/entrypoint
+++ /dev/null
@@ -1,4 +0,0 @@
-#!/bin/bash
-
-/usr/sbin/sshd -D &
-jupyter notebook --ip=0.0.0.0 /paddle/book/
diff --git a/paddle/scripts/docker/test.sh b/paddle/scripts/docker/test.sh
deleted file mode 100755
index 8180737a8f431d6eb8bab4b2ef7bdcc50cce41f3..0000000000000000000000000000000000000000
--- a/paddle/scripts/docker/test.sh
+++ /dev/null
@@ -1,30 +0,0 @@
-#!/bin/bash
-set -e
-
-# the number of process to run tests
-NUM_PROC=6
-
-# calculate and set the memory usage for each process
-MEM_USAGE=$(printf "%.2f" `echo "scale=5; 1.0 / $NUM_PROC" | bc`)
-export FLAGS_fraction_of_gpu_memory_to_use=$MEM_USAGE
-
-# get the CUDA device count
-CUDA_DEVICE_COUNT=$(nvidia-smi -L | wc -l)
-
-for (( i = 0; i < $NUM_PROC; i++ )); do
-    cuda_list=()
-    for (( j = 0; j < $CUDA_DEVICE_COUNT; j++ )); do
-        s=$[i+j]
-        n=$[s%CUDA_DEVICE_COUNT]
-        if [ $j -eq 0 ]; then
-            cuda_list=("$n")
-        else
-            cuda_list="$cuda_list,$n"
-        fi
-    done
-    echo $cuda_list
-    # CUDA_VISIBLE_DEVICES http://acceleware.com/blog/cudavisibledevices-masking-gpus
-    # ctest -I https://cmake.org/cmake/help/v3.0/manual/ctest.1.html?highlight=ctest
-    env CUDA_VISIBLE_DEVICES=$cuda_list ctest -I $i,,$NUM_PROC --output-on-failure &
-done
-wait
diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index 1595cc9e8aad4d143ca62f84f812dbc791dc1d26..55959197e7cd82253fb0c604604b4302ca0a3dc7 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -20,19 +20,15 @@
 #=================================================
 
 function print_usage() {
-    RED='\033[0;31m'
-    BLUE='\033[0;34m'
-    BOLD='\033[1m'
-    NONE='\033[0m'
-    
     echo -e "\n${RED}Usage${NONE}:
-    ${BOLD}$0${NONE} [OPTION]"
+    ${BOLD}${SCRIPT_NAME}${NONE} [OPTION]"
     
     echo -e "\n${RED}Options${NONE}:
     ${BLUE}build${NONE}: run build for x86 platform
     ${BLUE}build_android${NONE}: run build for android platform
     ${BLUE}build_ios${NONE}: run build for ios platform
     ${BLUE}test${NONE}: run all unit tests
+    ${BLUE}single_test${NONE}: run a single unit test
     ${BLUE}bind_test${NONE}: parallel tests bind to different GPU
     ${BLUE}doc${NONE}: generate paddle documents
     ${BLUE}html${NONE}: convert C++ source code into HTML
@@ -45,7 +41,15 @@ function print_usage() {
 }
 
 function init() {
+    RED='\033[0;31m'
+    BLUE='\033[0;34m'
+    BOLD='\033[1m'
+    NONE='\033[0m'
+
     PADDLE_ROOT="$( cd "$( dirname "${BASH_SOURCE[0]}")/../../" && pwd )"
+    if [ -z "${SCRIPT_NAME}" ]; then
+        SCRIPT_NAME=$0
+    fi
 }
 
 function cmake_gen() {
@@ -91,17 +95,17 @@ function cmake_gen() {
         -DWITH_AVX=${WITH_AVX:-OFF}
         -DWITH_GOLANG=${WITH_GOLANG:-OFF}
         -DCUDA_ARCH_NAME=${CUDA_ARCH_NAME:-All}
-        -DWITH_SWIG_PY=ON
         -DWITH_C_API=${WITH_C_API:-OFF}
         -DWITH_PYTHON=${WITH_PYTHON:-ON}
         -DWITH_SWIG_PY=${WITH_SWIG_PY:-ON}
         -DCUDNN_ROOT=/usr/
-        -DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-ON}
         -DWITH_TESTING=${WITH_TESTING:-ON}
         -DWITH_FAST_BUNDLE_TEST=ON
         -DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake
         -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
         -DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF}
+        -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
+        -DWITH_CONTRIB=${WITH_CONTRIB:-ON}
     ========================================
 EOF
     # Disable UNITTEST_USE_VIRTUALENV in docker because
@@ -123,12 +127,12 @@ EOF
         -DWITH_C_API=${WITH_C_API:-OFF} \
         -DWITH_PYTHON=${WITH_PYTHON:-ON} \
         -DCUDNN_ROOT=/usr/ \
-        -DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-ON} \
         -DWITH_TESTING=${WITH_TESTING:-ON} \
         -DWITH_FAST_BUNDLE_TEST=ON \
         -DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake \
         -DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF} \
-        -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
+        -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
+        -DWITH_CONTRIB=${WITH_CONTRIB:-ON}
 }
 
 function abort(){
@@ -141,19 +145,17 @@ function check_style() {
     trap 'abort' 0
     set -e
 
-    # install glide
-    curl https://glide.sh/get | bash
-    eval "$(GIMME_GO_VERSION=1.8.3 gimme)"
+    if [ -x "$(command -v gimme)" ]; then
+    	eval "$(GIMME_GO_VERSION=1.8.3 gimme)"
+    fi
 
     # set up go environment for running gometalinter
     mkdir -p $GOPATH/src/github.com/PaddlePaddle/
     ln -sf ${PADDLE_ROOT} $GOPATH/src/github.com/PaddlePaddle/Paddle
-    cd  $GOPATH/src/github.com/PaddlePaddle/Paddle/go; glide install; cd -
-
-    go get github.com/alecthomas/gometalinter
-    gometalinter --install
+    mkdir -p ./build/go
+    cp go/glide.* build/go
+    cd build/go; glide install; cd -
 
-    cd ${PADDLE_ROOT}
     export PATH=/usr/bin:$PATH
     pre-commit install
     clang-format --version
@@ -179,7 +181,7 @@ function build() {
     ============================================
 EOF
     make clean
-    make -j `nproc`
+    make install -j `nproc`
 }
 
 function build_android() {
@@ -228,7 +230,6 @@ EOF
             -DUSE_EIGEN_FOR_BLAS=ON \
             -DWITH_C_API=ON \
             -DWITH_SWIG_PY=OFF \
-            -DWITH_STYLE_CHECK=OFF \
             ..
     elif [ $ANDROID_ABI == "arm64-v8a" ]; then
       cmake -DCMAKE_SYSTEM_NAME=Android \
@@ -242,7 +243,6 @@ EOF
             -DUSE_EIGEN_FOR_BLAS=OFF \
             -DWITH_C_API=ON \
             -DWITH_SWIG_PY=OFF \
-            -DWITH_STYLE_CHECK=OFF \
             ..
     elif [ $ANDROID_ABI == "armeabi" ]; then
       cmake -DCMAKE_SYSTEM_NAME=Android \
@@ -255,7 +255,6 @@ EOF
             -DCMAKE_BUILD_TYPE=MinSizeRel \
             -DWITH_C_API=ON \
             -DWITH_SWIG_PY=OFF \
-            -DWITH_STYLE_CHECK=OFF \
             ..
     else
       echo "Invalid ANDROID_ABI: $ANDROID_ABI"
@@ -284,7 +283,6 @@ function build_ios() {
           -DUSE_EIGEN_FOR_BLAS=ON \
           -DWITH_TESTING=OFF \
           -DWITH_SWIG_PY=OFF \
-          -DWITH_STYLE_CHECK=OFF \
           -DCMAKE_BUILD_TYPE=Release
     
     make -j 2
@@ -309,6 +307,25 @@ EOF
     fi
 }
 
+function single_test() {
+    TEST_NAME=$1
+    if [ -z "${TEST_NAME}" ]; then
+        echo -e "${RED}Usage:${NONE}"
+        echo -e "${BOLD}${SCRIPT_NAME}${NONE} ${BLUE}single_test${NONE} [test_name]"
+        exit 1
+    fi
+    mkdir -p ${PADDLE_ROOT}/build
+    cd ${PADDLE_ROOT}/build
+    if [ ${WITH_TESTING:-ON} == "ON" ] ; then
+    cat <<EOF
+    ========================================
+    Running ${TEST_NAME} ...
+    ========================================
+EOF
+        ctest --output-on-failure -R ${TEST_NAME}
+    fi
+}
+
 function bind_test() {
     # the number of process to run tests
     NUM_PROC=6
@@ -353,8 +370,7 @@ EOF
         -DCMAKE_BUILD_TYPE=Release \
         -DWITH_DOC=ON \
         -DWITH_GPU=OFF \
-        -DWITH_MKL=OFF \
-        -DWITH_STYLE_CHECK=OFF
+        -DWITH_MKL=OFF
 
     make -j `nproc` paddle_docs paddle_apis
 
@@ -383,22 +399,26 @@ EOF
 
 function gen_dockerfile() {
     # Set BASE_IMAGE according to env variables
+    CUDA_MAJOR="$(echo $CUDA_VERSION | cut -d '.' -f 1).$(echo $CUDA_VERSION | cut -d '.' -f 2)"
+    CUDNN_MAJOR=$(echo $CUDNN_VERSION | cut -d '.' -f 1)
     if [[ ${WITH_GPU} == "ON" ]]; then
-    BASE_IMAGE="nvidia/cuda:8.0-cudnn5-runtime-ubuntu16.04"
+        BASE_IMAGE="nvidia/cuda:${CUDA_MAJOR}-cudnn${CUDNN_MAJOR}-runtime-ubuntu16.04"
     else
-    BASE_IMAGE="ubuntu:16.04"
+        BASE_IMAGE="ubuntu:16.04"
     fi
 
     DOCKERFILE_GPU_ENV=""
     DOCKERFILE_CUDNN_DSO=""
+    DOCKERFILE_CUBLAS_DSO=""
     if [[ ${WITH_GPU:-OFF} == 'ON' ]]; then
         DOCKERFILE_GPU_ENV="ENV LD_LIBRARY_PATH /usr/lib/x86_64-linux-gnu:\${LD_LIBRARY_PATH}"
-        DOCKERFILE_CUDNN_DSO="RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.5 /usr/lib/x86_64-linux-gnu/libcudnn.so"
+        DOCKERFILE_CUDNN_DSO="RUN ln -sf /usr/lib/x86_64-linux-gnu/libcudnn.so.${CUDNN_MAJOR} /usr/lib/x86_64-linux-gnu/libcudnn.so"
+        DOCKERFILE_CUBLAS_DSO="RUN ln -sf /usr/local/cuda/targets/x86_64-linux/lib/libcublas.so.${CUDA_MAJOR} /usr/lib/x86_64-linux-gnu/libcublas.so"
     fi
 
     cat <<EOF
     ========================================
-    Generate /paddle/build/Dockerfile ...
+    Generate ${PADDLE_ROOT}/build/Dockerfile ...
     ========================================
 EOF
 
@@ -409,7 +429,7 @@ EOF
 EOF
 
     if [[ ${WITH_GPU} == "ON"  ]]; then
-        NCCL_DEPS="apt-get install -y libnccl2=2.1.2-1+cuda8.0 libnccl-dev=2.1.2-1+cuda8.0 &&"
+        NCCL_DEPS="apt-get install -y --allow-downgrades libnccl2=2.1.2-1+cuda${CUDA_MAJOR} libnccl-dev=2.1.2-1+cuda${CUDA_MAJOR} &&"
     else
         NCCL_DEPS=""
     fi
@@ -422,22 +442,29 @@ EOF
         CMD='"true"'
     fi
 
-    cat >> /paddle/build/Dockerfile <<EOF
+    cat >> ${PADDLE_ROOT}/build/Dockerfile <<EOF
     ADD python/dist/*.whl /
     # run paddle version to install python packages first
     RUN apt-get update &&\
         ${NCCL_DEPS}\
-        apt-get install -y wget python-pip dmidecode python-tk && pip install -U pip==9.0.3 && \
+        apt-get install -y wget python-pip python-opencv libgtk2.0-dev dmidecode python-tk && easy_install -U pip && \
         pip install /*.whl; apt-get install -f -y && \
         apt-get clean -y && \
         rm -f /*.whl && \
         ${PADDLE_VERSION} && \
         ldconfig
     ${DOCKERFILE_CUDNN_DSO}
+    ${DOCKERFILE_CUBLAS_DSO}
     ${DOCKERFILE_GPU_ENV}
     ENV NCCL_LAUNCH_MODE PARALLEL
-    ADD go/cmd/pserver/pserver /usr/bin/
-    ADD go/cmd/master/master /usr/bin/
+EOF
+    if [[ ${WITH_GOLANG:-OFF} == "ON" ]]; then
+        cat >> ${PADDLE_ROOT}/build/Dockerfile <<EOF
+        ADD go/cmd/pserver/pserver /usr/bin/
+        ADD go/cmd/master/master /usr/bin/
+EOF
+    fi
+    cat >> ${PADDLE_ROOT}/build/Dockerfile <<EOF
     # default command shows the paddle version and exit
     CMD [${CMD}]
 EOF
@@ -462,17 +489,22 @@ function gen_fluid_inference_lib() {
     Deploying fluid inference library ...
     ========================================
 EOF
-        make inference_lib_dist
-    fi
+        make -j `nproc` inference_lib_dist
+        cd ${PADDLE_ROOT}/build
+        mv fluid_install_dir fluid
+        tar -cf fluid.tgz fluid
+      fi
 }
 
 function main() {
+    set -e
     local CMD=$1
     init
     case $CMD in
       build)
         cmake_gen ${PYTHON_ABI:-""}
         build
+        gen_dockerfile
         ;;
       build_android)
         build_android
@@ -483,6 +515,9 @@ function main() {
       test)
         run_test
         ;;
+      single_test)
+        single_test $2
+        ;;
       bind_test)
         bind_test
         ;;
@@ -497,6 +532,7 @@ function main() {
         ;;
       capi)
         cmake_gen ${PYTHON_ABI:-""}
+        build
         gen_capi_package
         ;;
       fluid_inference_lib)
diff --git a/paddle/scripts/paddle_docker_build.sh b/paddle/scripts/paddle_docker_build.sh
index ddae9f19a21b404c9b24b47d433e07db4603dc8b..3462deb9c2f88b6da643d6aa833449ed5f4a9b34 100755
--- a/paddle/scripts/paddle_docker_build.sh
+++ b/paddle/scripts/paddle_docker_build.sh
@@ -14,25 +14,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-function container_running() {
-    name=$1
-    docker ps -a --format "{{.Names}}" | grep "${name}" > /dev/null
-    return $?
-}
-
 function start_build_docker() {
     docker pull $IMG
 
-    if container_running "${CONTAINER_ID}"; then
-        docker stop "${CONTAINER_ID}" 1>/dev/null
-        docker rm -f "${CONTAINER_ID}" 1>/dev/null
-    fi
-
     apt_mirror='s#http://archive.ubuntu.com/ubuntu#mirror://mirrors.ubuntu.com/mirrors.txt#g'
     DOCKER_ENV=$(cat <<EOL
         -e FLAGS_fraction_of_gpu_memory_to_use=0.15 \
         -e CTEST_OUTPUT_ON_FAILURE=1 \
-        -e CTEST_PARALLEL_LEVEL=5 \
+        -e CTEST_PARALLEL_LEVEL=1 \
         -e APT_MIRROR=${apt_mirror} \
         -e WITH_GPU=ON \
         -e CUDA_ARCH_NAME=Auto \
@@ -59,10 +48,10 @@ EOL
     if [ ! -d "${HOME}/.ccache" ]; then
         mkdir ${HOME}/.ccache
     fi
-    set -x
+    set -ex
     ${DOCKER_CMD} run -it \
-        --name $CONTAINER_ID \
         ${DOCKER_ENV} \
+        -e SCRIPT_NAME=$0 \
         -v $PADDLE_ROOT:/paddle \
         -v ${HOME}/.ccache:/root/.ccache \
         -w /paddle \
@@ -74,10 +63,8 @@ EOL
 function main() {
     DOCKER_REPO="paddlepaddle/paddle"
     VERSION="latest-dev"
-    CONTAINER_ID="${USER}_paddle_dev"
     PADDLE_ROOT="$( cd "$( dirname "${BASH_SOURCE[0]}")/../../" && pwd )"
     if [ "$1" == "build_android" ]; then
-        CONTAINER_ID="${USER}_paddle_dev_android"
         VERSION="latest-dev-android"
     fi
     IMG=${DOCKER_REPO}:${VERSION}
diff --git a/paddle/scripts/travis/build_doc.sh b/paddle/scripts/travis/build_doc.sh
deleted file mode 100755
index d7527d99482bfe93a06e0de150a6c1ece36addde..0000000000000000000000000000000000000000
--- a/paddle/scripts/travis/build_doc.sh
+++ /dev/null
@@ -1,16 +0,0 @@
-#!/bin/bash
-set -e
-
-# Create the build directory for CMake.
-mkdir -p $TRAVIS_BUILD_DIR/build
-cd $TRAVIS_BUILD_DIR/build
-
-# Compile Documentation only.
-cmake .. -DCMAKE_BUILD_TYPE=Release -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_DOC=ON -DWITH_STYLE_CHECK=OFF
-
-make -j `nproc` paddle_docs paddle_apis
-
-# check websites for broken links
-linkchecker doc/v2/en/html/index.html
-linkchecker doc/v2/cn/html/index.html
-linkchecker doc/v2/api/en/html/index.html
diff --git a/paddle/scripts/travis/build_ios.sh b/paddle/scripts/travis/build_ios.sh
deleted file mode 100755
index dee7cf7cbbcccffd727002108ae7f6b6ee2fbba8..0000000000000000000000000000000000000000
--- a/paddle/scripts/travis/build_ios.sh
+++ /dev/null
@@ -1,20 +0,0 @@
-#!/bin/bash
-set -e
-
-# Create the build directory for CMake.
-mkdir -p $TRAVIS_BUILD_DIR/build_ios
-cd $TRAVIS_BUILD_DIR/build_ios
-
-# Compile paddle binaries
-cmake -DCMAKE_SYSTEM_NAME=iOS \
-      -DIOS_PLATFORM=OS \
-      -DCMAKE_OSX_ARCHITECTURES="arm64" \
-      -DWITH_C_API=ON \
-      -DUSE_EIGEN_FOR_BLAS=ON \
-      -DWITH_TESTING=OFF \
-      -DWITH_SWIG_PY=OFF \
-      -DWITH_STYLE_CHECK=OFF \
-      -DCMAKE_BUILD_TYPE=Release \
-      ..
-
-make -j 2
diff --git a/paddle/scripts/travis/check_style.sh b/paddle/scripts/travis/check_style.sh
deleted file mode 100755
index e71d243efa2041cc0624b8273e1bfabaa03ce106..0000000000000000000000000000000000000000
--- a/paddle/scripts/travis/check_style.sh
+++ /dev/null
@@ -1,35 +0,0 @@
-#!/bin/bash
-function abort(){
-    echo "Your change doesn't follow PaddlePaddle's code style." 1>&2
-    echo "Please use pre-commit to check what is wrong." 1>&2
-    exit 1
-}
-
-trap 'abort' 0
-set -e
-
-# install glide
-curl https://glide.sh/get | bash
-eval "$(GIMME_GO_VERSION=1.8.3 gimme)"
-
-# set up go environment for running gometalinter
-mkdir -p $GOPATH/src/github.com/PaddlePaddle/
-ln -sf $TRAVIS_BUILD_DIR $GOPATH/src/github.com/PaddlePaddle/Paddle
-cd  $GOPATH/src/github.com/PaddlePaddle/Paddle/go; glide install; cd -
-
-go get github.com/alecthomas/gometalinter
-gometalinter --install
-
-cd $TRAVIS_BUILD_DIR
-export PATH=/usr/bin:$PATH
-pre-commit install
-clang-format --version
-
-
-
-if ! pre-commit run -a ; then
-    git diff
-    exit 1
-fi
-
-trap : 0
diff --git a/paddle/scripts/travis/deploy_key.enc b/paddle/scripts/travis/deploy_key.enc
deleted file mode 100644
index b0aa45c5ac626c735735fd8541a43bf8b099d0a0..0000000000000000000000000000000000000000
Binary files a/paddle/scripts/travis/deploy_key.enc and /dev/null differ
diff --git a/paddle/trainer/CMakeLists.txt b/paddle/trainer/CMakeLists.txt
index 72911695bd4959d73d783897b0c5e674454c30bc..6192de4388c8c3f5165fb88b443d372748f7a17e 100644
--- a/paddle/trainer/CMakeLists.txt
+++ b/paddle/trainer/CMakeLists.txt
@@ -36,17 +36,12 @@ endif()
 add_library(paddle_trainer_lib STATIC
     ${TRAINER_SOURCES})
 
-add_style_check_target(paddle_trainer_lib
-    ${TRAINER_SOURCES})
-add_style_check_target(paddle_trainer_lib
-    ${TRAINER_HEADERS})
 add_dependencies(paddle_trainer_lib
     paddle_proto
     ${external_project_dependencies})
 
 macro(add_paddle_exe TARGET_NAME)
   add_executable(${TARGET_NAME} ${ARGN})
-  add_style_check_target(${TARGET_NAME} ${ARGN})
   link_paddle_exe(${TARGET_NAME})
 endmacro()
 
diff --git a/paddle/trainer/NewRemoteParameterUpdater.h b/paddle/trainer/NewRemoteParameterUpdater.h
index 6223ba427c9b94494c2bee8f0847442f1b0574c9..02693c675e6f5cb574e52e9681963a5904676028 100644
--- a/paddle/trainer/NewRemoteParameterUpdater.h
+++ b/paddle/trainer/NewRemoteParameterUpdater.h
@@ -29,7 +29,7 @@ namespace paddle {
  * New remote parameter updater for dense parameters that use cclient of go.
  */
 class NewRemoteParameterUpdater : public ParameterUpdater {
-public:
+ public:
   NewRemoteParameterUpdater(const OptimizationConfig& config,
                             const std::string pserverSpec);
   NewRemoteParameterUpdater(const OptimizationConfig& config,
@@ -61,13 +61,13 @@ public:
   virtual void startPass();
   virtual bool finishPass();
 
-protected:
+ protected:
   /**
    * work need to do after finishBatch
    */
   virtual void updateImpl(Parameter* para);
 
-private:
+ private:
   int parameterSize() { return (int)parameters_.size(); }
 
   /**
@@ -104,7 +104,7 @@ private:
     }
   }
 
-protected:
+ protected:
   const OptimizationConfig& trainerConfig_;
   /// internal parameter client object for exchanging data with pserver
   paddle_pserver_client parameterClient_;
diff --git a/paddle/trainer/ParamUtil.h b/paddle/trainer/ParamUtil.h
index 2e05595848760c9abd7d916003656c8103151abf..10746b4d58e3a82c081987a6aaad9e0b42272a03 100644
--- a/paddle/trainer/ParamUtil.h
+++ b/paddle/trainer/ParamUtil.h
@@ -56,7 +56,7 @@ struct ParameterUtilConfig {
  * Utility class for loading and saving parameters
  */
 class ParameterUtil {
-public:
+ public:
   /**
    * Ctor.
    *
@@ -115,7 +115,7 @@ public:
     }
   }
 
-private:
+ private:
   std::shared_ptr<TrainerConfigHelper> config_;
   std::unique_ptr<ParameterUtilConfig> intConfig_;
   GradientMachinePtr gserver_;
diff --git a/paddle/trainer/ParameterUpdater.h b/paddle/trainer/ParameterUpdater.h
index 9e9e948b8856d2712f8894b3d14db9c795d5f694..ef7ab92eca77bab2a8481561713f8034d2b8505d 100644
--- a/paddle/trainer/ParameterUpdater.h
+++ b/paddle/trainer/ParameterUpdater.h
@@ -36,7 +36,7 @@ namespace paddle {
  * @brief Parameter Updater for SGD, and local(not cluster) run.
  */
 class SgdLocalUpdater : public ParameterUpdater {
-public:
+ public:
   /**
    * @brief Ctor. Initialize optimizer locally by optConfig.
    * @param optConfig optimization config.
@@ -131,7 +131,7 @@ public:
     }
   }
 
-protected:
+ protected:
   /**
    * @brief update method. Update value from gradient.
    * @param para parameter that will be updated.
@@ -159,7 +159,7 @@ protected:
  * @deprecated
  */
 class SgdCpuUpdater : public SgdLocalUpdater, public Deprecated {
-public:
+ public:
   explicit SgdCpuUpdater(const OptimizationConfig& optConfig)
       : SgdLocalUpdater(optConfig),
         Deprecated(
@@ -178,7 +178,7 @@ public:
     optimizer_->finishBatch();
   }
 
-protected:
+ protected:
   /**
    * @brief do nothing.
    * @param para
@@ -192,7 +192,7 @@ protected:
  * It will do model average in cpu to reduce gpu memory comsuption.
  */
 class SgdUpdaterWithCpuAverager : public SgdLocalUpdater {
-public:
+ public:
   /**
    * @brief Ctor.
    *
@@ -233,12 +233,12 @@ public:
    */
   virtual void restore();
 
-protected:
+ protected:
   virtual void updateImpl(Parameter* para);
 
   void updateFunc(Parameter* para);
 
-protected:
+ protected:
   std::unique_ptr<ParameterOptimizer> averager_;
 
   /**
diff --git a/paddle/trainer/RemoteParameterUpdater.h b/paddle/trainer/RemoteParameterUpdater.h
index 5e82c944751629632ea8d16992bd8f4178a2fbd5..3a40a46354efd6b92278884c8f5b72504a3ff283 100644
--- a/paddle/trainer/RemoteParameterUpdater.h
+++ b/paddle/trainer/RemoteParameterUpdater.h
@@ -53,7 +53,7 @@ namespace paddle {
  * backward and communication is not supported.
  */
 class RemoteParameterUpdater : public ParameterUpdater {
-public:
+ public:
   RemoteParameterUpdater(
       const OptimizationConfig& config,
       int expectedPassCount,
@@ -101,7 +101,7 @@ public:
   virtual void apply();
   virtual void restore();
 
-protected:
+ protected:
   /**
    * control all pservers with all trainers for sync-sgd
    */
@@ -128,7 +128,7 @@ protected:
    */
   void copyParametersFromDevice(ParameterType parameterType);
 
-protected:
+ protected:
   /// Optimization config used to guide initialization and finishBatch
   OptimizationConfig config_;
   /// internal parameter client object for exchanging data with pserver
@@ -178,7 +178,7 @@ protected:
  * It contains separate send and recv thread for pipeline usage.
  */
 class ConcurrentRemoteParameterUpdater : public RemoteParameterUpdater {
-public:
+ public:
   ConcurrentRemoteParameterUpdater(
       OptimizationConfig config,
       int expectedPassCount,
@@ -194,7 +194,7 @@ public:
    */
   virtual void finishBatch(real cost);
 
-protected:
+ protected:
   virtual void updateImpl(Parameter* para);
   /// internal thread called in send thread
   void send(Parameter* para);  // para == NULL indicate end of a minibatch
@@ -221,7 +221,7 @@ protected:
     return (numBatches_ + 1) % config_.num_batches_per_send_parameter() == 0;
   }
 
-private:
+ private:
   /// send thread used for overlapping
   std::unique_ptr<std::thread> sendThread_;
   /// recv thread used for overlapping
@@ -263,7 +263,7 @@ private:
  * to encapsulate sparse specified message for all pservers.
  */
 class SparseRemoteParameterUpdater : public ParameterUpdater {
-public:
+ public:
   SparseRemoteParameterUpdater(const OptimizationConfig& config,
                                int expectedPassCount,
                                bool testing);
@@ -303,7 +303,7 @@ public:
   }
 #endif
 
-protected:
+ protected:
   /// update implimentation, not implemented
   virtual void updateImpl(Parameter* para) {}
 
@@ -313,7 +313,7 @@ protected:
   /// start controller thread
   void startController();
 
-protected:
+ protected:
   /// optimization config
   OptimizationConfig config_;
   /// internal parameter client
@@ -335,7 +335,7 @@ protected:
  * it directly call internal dense and sparse udpater individually.
  */
 class SparseRemoteParameterUpdaterComposite : public ParameterUpdaterComposite {
-public:
+ public:
   enum {
     UPDATER_SPARSE_REMOTE = 0,  // execute in sync thread pool(tid:0)
     UPDATER_NORMAL = 1,         // execute in Owner thread(tid:1)
@@ -364,7 +364,7 @@ public:
 };
 
 class ParameterUpdaterCreators {
-public:
+ public:
   /**
    * @brief add a creator to create custom ParameterUpdater while training.
    *        The creator is a function with type (alogrithm, optConfig, isLocal,
@@ -407,7 +407,7 @@ public:
     return nullptr;
   }
 
-private:
+ private:
   static std::vector<std::function<ParameterUpdater*(
       const std::string&, const OptimizationConfig&, bool, size_t)>>
       constructors_;
diff --git a/paddle/trainer/Tester.h b/paddle/trainer/Tester.h
index e892744db278586f2fd5b3cb527aa7c17752c477..801c77e3116369732bf4b03107adce6a71dc2184 100644
--- a/paddle/trainer/Tester.h
+++ b/paddle/trainer/Tester.h
@@ -38,7 +38,7 @@ namespace paddle {
  * It is a private class for Trainer.
  */
 class Tester {
-public:
+ public:
   /**
    * Ctor
    * @param config Trainer Config.
@@ -87,7 +87,7 @@ public:
    */
   void test();
 
-protected:
+ protected:
   std::shared_ptr<ParameterClient2> testParameterClient_;
   std::shared_ptr<TrainerConfigHelper> config_;
   std::unique_ptr<TesterConfig> intconfig_;
@@ -107,7 +107,7 @@ protected:
     real cost;
   } testContext_;
 
-private:
+ private:
   /**
    * Test one batch by batchId. It is only used for testOnePass.
    *
diff --git a/paddle/trainer/ThreadParameterUpdater.h b/paddle/trainer/ThreadParameterUpdater.h
index bc08a9e9f0eda1cab7776ba76c67e88add1028a9..b5e6a7ce3c8457364b10c921bca3386fbb6f6cbf 100644
--- a/paddle/trainer/ThreadParameterUpdater.h
+++ b/paddle/trainer/ThreadParameterUpdater.h
@@ -39,7 +39,7 @@ namespace paddle {
    class.
  */
 class SgdThreadUpdater : public ParameterUpdater {
-public:
+ public:
   explicit SgdThreadUpdater(const OptimizationConfig& optConfig);
   virtual ~SgdThreadUpdater() {}
 
@@ -57,7 +57,7 @@ public:
   virtual void apply();
   virtual void restore();
 
-protected:
+ protected:
   // This is the function that will be eventualy called by the GradientMachine.
   // used only for GPU update.
   virtual void updateImpl(Parameter* para);
diff --git a/paddle/trainer/Trainer.h b/paddle/trainer/Trainer.h
index fac589d1d711affcd008f90edf87d865c8362f69..78127b7be5cef34f51a4b540852c139625b571dd 100644
--- a/paddle/trainer/Trainer.h
+++ b/paddle/trainer/Trainer.h
@@ -41,7 +41,7 @@ namespace paddle {
  * train/test a NeuralNetwork.
  */
 class Trainer {
-public:
+ public:
   /**
    * Ctor.
    * @return
@@ -138,7 +138,7 @@ public:
    */
   ParameterUtil* getParameterUtilPtr();
 
-protected:
+ protected:
   /**
    * Train one pass of data.
    *
@@ -159,10 +159,10 @@ protected:
 
   void createTester();
 
-private:
+ private:
   std::unique_ptr<TesterConfig> createTesterConfig();
 
-protected:
+ protected:
   std::shared_ptr<TrainerConfigHelper> config_;
   std::shared_ptr<TrainerStats> stats_;
 
diff --git a/paddle/trainer/TrainerConfigHelper.h b/paddle/trainer/TrainerConfigHelper.h
index f1366cc041b0d983e65a1bf5b02ec2128324c5a8..b21dda964e70fce6e5e9672cc131595ad5af3bbc 100644
--- a/paddle/trainer/TrainerConfigHelper.h
+++ b/paddle/trainer/TrainerConfigHelper.h
@@ -37,7 +37,7 @@ class DataConfig;
  * Define a macro to unify 'final' keyword
  */
 class TrainerConfigHelper /*final*/ {
-public:
+ public:
   DISABLE_COPY(TrainerConfigHelper);
 
   /**
@@ -193,7 +193,7 @@ public:
    */
   static std::shared_ptr<TrainerConfigHelper> createFromFlagConfig();
 
-private:
+ private:
   static std::string getConfigNameFromPassId(int passId,
                                              const std::string& modelPath);
 
diff --git a/paddle/trainer/TrainerInternal.h b/paddle/trainer/TrainerInternal.h
index 7018faab24744f7a087a53130acc56ec6314101e..48ee53a5e60f950bfc3cc299c754b0e72601c818 100644
--- a/paddle/trainer/TrainerInternal.h
+++ b/paddle/trainer/TrainerInternal.h
@@ -34,7 +34,7 @@ namespace paddle {
  * the core training class for driving training logic
  */
 class TrainerInternal {
-public:
+ public:
   struct ParaStat {
     real maxAbsGrad;
     real avgAbsGrad;
@@ -126,7 +126,7 @@ public:
                                     UpdateCallback updateCallback,
                                     bool doPipelineUpdate);
 
-protected:
+ protected:
   std::shared_ptr<ParameterUpdater> parameterUpdater_;
   GradientMachinePtr gradientMachine_;
   std::shared_ptr<TrainerConfigHelper> config_;
diff --git a/paddle/trainer/TrainerInternalConfig.h b/paddle/trainer/TrainerInternalConfig.h
index b47692720efc2ed4f2db84f61ca81fcb52d234c0..43aae381029784278ad58c9398f64af24dffa1df 100644
--- a/paddle/trainer/TrainerInternalConfig.h
+++ b/paddle/trainer/TrainerInternalConfig.h
@@ -37,7 +37,7 @@ namespace paddle {
  * through one mini-batch.
  */
 class TrainerStats {
-public:
+ public:
   /**
    * @brief reset all stats.
    *
@@ -147,7 +147,7 @@ public:
     return os.str();
   }
 
-private:
+ private:
   int64_t numProcessed_;
   real totalCost_;
   real currentCost_;
diff --git a/paddle/trainer/tests/picojson.h b/paddle/trainer/tests/picojson.h
index eaa8b9baf6e4e753a441ab77811f494cbdab80cf..75349537b1c7f10d23bae788e8414a753c7ccab0 100644
--- a/paddle/trainer/tests/picojson.h
+++ b/paddle/trainer/tests/picojson.h
@@ -125,7 +125,7 @@ enum { INDENT_WIDTH = 2 };
 struct null {};
 
 class value {
-public:
+ public:
   typedef std::vector<value> array;
   typedef std::map<std::string, value> object;
   union _storage {
@@ -139,11 +139,11 @@ public:
     object* object_;
   };
 
-protected:
+ protected:
   int type_;
   _storage u_;
 
-public:
+ public:
   value();
   value(int type, bool);
   explicit value(bool b);
@@ -179,7 +179,7 @@ public:
   void serialize(Iter os, bool prettify = false) const;
   std::string serialize(bool prettify = false) const;
 
-private:
+ private:
   template <typename T>
   value(const T*);  // intentionally defined to block implicit conversion of
                     // pointer to bool
@@ -588,13 +588,13 @@ inline std::string value::_serialize(int indent) const {
 
 template <typename Iter>
 class input {
-protected:
+ protected:
   Iter cur_, end_;
   int last_ch_;
   bool ungot_;
   int line_;
 
-public:
+ public:
   input(const Iter& first, const Iter& last)
       : cur_(first), end_(last), last_ch_(-1), ungot_(false), line_(1) {}
   int getc() {
@@ -873,7 +873,7 @@ inline bool _parse(Context& ctx, input<Iter>& in) {
 }
 
 class deny_parse_context {
-public:
+ public:
   bool set_null() { return false; }
   bool set_bool(bool) { return false; }
 #ifdef PICOJSON_USE_INT64
@@ -898,10 +898,10 @@ public:
 };
 
 class default_parse_context {
-protected:
+ protected:
   value* out_;
 
-public:
+ public:
   default_parse_context(value* out) : out_(out) {}
   bool set_null() {
     *out_ = value();
@@ -949,18 +949,18 @@ public:
     return _parse(ctx, in);
   }
 
-private:
+ private:
   default_parse_context(const default_parse_context&);
   default_parse_context& operator=(const default_parse_context&);
 };
 
 class null_parse_context {
-public:
+ public:
   struct dummy_str {
     void push_back(int) {}
   };
 
-public:
+ public:
   null_parse_context() {}
   bool set_null() { return true; }
   bool set_bool(bool) { return true; }
@@ -985,7 +985,7 @@ public:
     return _parse(*this, in);
   }
 
-private:
+ private:
   null_parse_context(const null_parse_context&);
   null_parse_context& operator=(const null_parse_context&);
 };
diff --git a/paddle/trainer/tests/test_TrainerOnePass.cpp b/paddle/trainer/tests/test_TrainerOnePass.cpp
index b2a93d4d5eea37ad716b59427f2aa4409d2f537d..de12c4d649c6041f497c0eeac0904ebfc0d5bf97 100644
--- a/paddle/trainer/tests/test_TrainerOnePass.cpp
+++ b/paddle/trainer/tests/test_TrainerOnePass.cpp
@@ -38,7 +38,7 @@ DECLARE_int32(num_passes);
 DECLARE_int32(saving_period);
 
 class TrainerForTest : public paddle::Trainer {
-public:
+ public:
   inline const std::shared_ptr<ParameterUpdater>& getParameterUpdaterForTest() {
     return this->trainerInternal_.getParameterUpdater();
   }
diff --git a/paddle/utils/CMakeLists.txt b/paddle/utils/CMakeLists.txt
index 6292e7fa52cd86c71724d9fe84ea622e98ff1e08..b42b2bae968a10c581c594054f853347eb5d5445 100644
--- a/paddle/utils/CMakeLists.txt
+++ b/paddle/utils/CMakeLists.txt
@@ -14,9 +14,6 @@ add_library(paddle_utils STATIC
         ${UTIL_SOURCES}
         ${UTIL_ARCH_SOURCES}
         ${UTIL_RES})
-add_style_check_target(paddle_utils ${UTIL_HEADERS})
-add_style_check_target(paddle_utils ${UTIL_SOURCES}
-    ${UTIL_ARCH_SOURCES})
 add_dependencies(paddle_utils paddle_proto ${external_project_dependencies})
 if(WITH_TESTING)
     add_subdirectory(tests)
diff --git a/paddle/utils/ClassRegistrar.h b/paddle/utils/ClassRegistrar.h
index 1ac27bafabd1945d1d01e3bead22b0dd200d8688..5f40a0b25e92c7adcfe3f8c4be96016be801da3b 100644
--- a/paddle/utils/ClassRegistrar.h
+++ b/paddle/utils/ClassRegistrar.h
@@ -41,7 +41,7 @@ namespace paddle {
  */
 template <class BaseClass, typename... CreateArgs>
 class ClassRegistrar {
-public:
+ public:
   typedef std::function<BaseClass*(CreateArgs...)> ClassCreator;
 
   // Register a class using a creation function.
@@ -74,7 +74,7 @@ public:
     }
   }
 
-protected:
+ protected:
   std::map<std::string, ClassCreator> creatorMap_;
 };
 
diff --git a/paddle/utils/CpuId.h b/paddle/utils/CpuId.h
index 869be5be541dafd699a87a8e8893aadadf59b711..ed58211d13ac1e0f80d6728950f0b88dc0ae625f 100644
--- a/paddle/utils/CpuId.h
+++ b/paddle/utils/CpuId.h
@@ -35,7 +35,7 @@ enum simd_t {
 // clang-format on
 
 class SIMDFlags final {
-public:
+ public:
   DISABLE_COPY(SIMDFlags);
 
   SIMDFlags();
@@ -46,7 +46,7 @@ public:
     return !((simd_flags_ & flags) ^ flags);
   }
 
-private:
+ private:
   int simd_flags_ = SIMD_NONE;
 };
 
diff --git a/paddle/utils/CustomStackTrace.h b/paddle/utils/CustomStackTrace.h
index 52a6df94979fd3d8d7d540ed0e3898bb3375d975..b60077ea2d946366910780eeb773635972211e04 100644
--- a/paddle/utils/CustomStackTrace.h
+++ b/paddle/utils/CustomStackTrace.h
@@ -49,7 +49,7 @@ namespace paddle {
  */
 template <typename T>
 class CustomStackTrace {
-public:
+ public:
   /**
    * @brief Pop out an item from the top of the stack if item == top.
    *        Else, just set status to popping.
@@ -136,7 +136,7 @@ public:
     p.push(item);
   }
 
-private:
+ private:
   /**
    * Get thread local attribute, and save them into a map (threadId => TYPE*)
    *
@@ -174,7 +174,7 @@ private:
     return this->getThreadLocal(this->isPushing_, this->pushingBuffers_);
   }
 
-private:
+ private:
   mutable std::mutex mtx_;
 
   std::unordered_map<std::thread::id, std::stack<T>*> stackBuffers_;
diff --git a/paddle/utils/Error.h b/paddle/utils/Error.h
index 7cde98306026ca1de76089749aaea265d151da33..1fc8482e3a1bef869d4df147bbd3cab6e62ccf49 100644
--- a/paddle/utils/Error.h
+++ b/paddle/utils/Error.h
@@ -95,7 +95,7 @@ namespace paddle {
  * log(FATAL) and CHECK in Paddle, 'check' method will be removed.
  */
 class Error {
-public:
+ public:
   /**
    * Construct a no-error value.
    */
@@ -138,7 +138,7 @@ public:
    */
   bool isOK() const { return msg_ == nullptr; }
 
-private:
+ private:
   std::shared_ptr<std::string> msg_;
 };
 
diff --git a/paddle/utils/GlobalConstants.h b/paddle/utils/GlobalConstants.h
index 0ec1c28dfbb2a7db9fa84c9eb2bc4dad806b78e9..3f45e82268435e4c22d1879e909b0c90838d6693 100644
--- a/paddle/utils/GlobalConstants.h
+++ b/paddle/utils/GlobalConstants.h
@@ -78,7 +78,7 @@ enum ParameterType {
 using namespace enumeration_wrapper;  // NOLINT
 
 class TrainAlgorithm {
-public:
+ public:
   static const std::string SGD;
   static const std::string AsyncSGD;
   static const std::string OWLQN;
diff --git a/paddle/utils/Locks.h b/paddle/utils/Locks.h
index e87abb9139f1c3f250f8b8fe1afdd8883f682647..65f983685f5e178345a6a875a79a6573ce1ccca1 100644
--- a/paddle/utils/Locks.h
+++ b/paddle/utils/Locks.h
@@ -42,7 +42,7 @@ namespace paddle {
  * Use unlock() to unlock the lock.
  */
 class RWLock {
-public:
+ public:
   RWLock() { pthread_rwlock_init(&rwlock_, NULL); }
   ~RWLock() { pthread_rwlock_destroy(&rwlock_); }
   RWLock(const RWLock&) = delete;
@@ -62,7 +62,7 @@ public:
   void lock_shared() { pthread_rwlock_rdlock(&rwlock_); }
   void unlock() { pthread_rwlock_unlock(&rwlock_); }
 
-protected:
+ protected:
   pthread_rwlock_t rwlock_;
 };
 
@@ -71,7 +71,7 @@ protected:
  * using RAII management mechanism.
  */
 class ReadLockGuard {
-public:
+ public:
   /**
    * @brief Construct Function. Lock on rwlock in read mode.
    */
@@ -86,7 +86,7 @@ public:
    */
   ~ReadLockGuard() { rwlock_->unlock(); }
 
-protected:
+ protected:
   RWLock* rwlock_;
 };
 
@@ -98,7 +98,7 @@ protected:
  */
 class SpinLockPrivate;
 class SpinLock {
-public:
+ public:
   DISABLE_COPY(SpinLock);
   SpinLock();
   ~SpinLock();
@@ -107,7 +107,7 @@ public:
   void lock();
   void unlock();
 
-private:
+ private:
   SpinLockPrivate* m;
 };
 
@@ -116,7 +116,7 @@ private:
  */
 class SemaphorePrivate;
 class Semaphore {
-public:
+ public:
   //! Disable copy & assign
   Semaphore(const Semaphore& other) = delete;
   Semaphore& operator=(const Semaphore&& other) = delete;
@@ -124,7 +124,7 @@ public:
   //! Enable move.
   Semaphore(Semaphore&& other) : m(std::move(other.m)) {}
 
-public:
+ public:
   /**
    * @brief Construct Function.
    * @param[in] initValue the initial value of the
@@ -156,7 +156,7 @@ public:
    */
   void post();
 
-private:
+ private:
   SemaphorePrivate* m;
 };
 
@@ -166,7 +166,7 @@ private:
  */
 class ThreadBarrierPrivate;
 class ThreadBarrier {
-public:
+ public:
   DISABLE_COPY(ThreadBarrier);
 
   /**
@@ -184,7 +184,7 @@ public:
    */
   void wait();
 
-private:
+ private:
   ThreadBarrierPrivate* m;
 };
 
@@ -192,7 +192,7 @@ private:
  * A wrapper for condition variable with mutex.
  */
 class LockedCondition : public std::condition_variable {
-public:
+ public:
   /**
    * @brief execute op and notify one thread which was blocked.
    * @param[in] op a thread can do something in op before notify.
@@ -235,7 +235,7 @@ public:
    */
   std::mutex* mutex() { return &mutex_; }
 
-protected:
+ protected:
   std::mutex mutex_;
 };
 
diff --git a/paddle/utils/PythonUtil.h b/paddle/utils/PythonUtil.h
index daebaffc855518425ae43942c22ec150d2e327f0..6f8d7e09309503e47aca7ae2d20774c748703b21 100644
--- a/paddle/utils/PythonUtil.h
+++ b/paddle/utils/PythonUtil.h
@@ -55,12 +55,12 @@ std::string callPythonFunc(const std::string& moduleName,
  * NOTE: the lock of this guard is reentrant or recursive.
  */
 class PyGuard {
-public:
+ public:
   PyGuard();
   PyGuard(const PyGuard& other) = delete;
   PyGuard& operator=(const PyGuard& other) = delete;
 
-private:
+ private:
   std::lock_guard<std::recursive_mutex> guard_;
 };
 
@@ -133,7 +133,7 @@ std::string getPyCallStack();
  * Implements getAttr method for object.
  */
 class ObjectHelper {
-public:
+ public:
   explicit ObjectHelper(const PyObjectPtr& obj) : obj_(obj) {}
 
   /**
@@ -192,7 +192,7 @@ public:
     return PyObject_IsTrue(tmp.get());
   }
 
-private:
+ private:
   const PyObjectPtr& obj_;
 };
 
@@ -202,7 +202,7 @@ private:
  * The python sequence means list or tuple.
  */
 class SequenceHelper {
-public:
+ public:
   explicit SequenceHelper(const PyObjectPtr& seq) : seq_(seq.get()) {
     CHECK(PySequence_Check(seq_));
   }
@@ -248,12 +248,12 @@ public:
     }
   }
 
-private:
+ private:
   PyObject* seq_;
 };
 
 class DictHelper {
-public:
+ public:
   explicit DictHelper(PyObject* d) : dict_(d) {}
 
   explicit DictHelper(const PyObjectPtr& d) : dict_(d.get()) {}
@@ -275,7 +275,7 @@ public:
     this->set(key, list);
   }
 
-private:
+ private:
   inline void checkDict() { CHECK(PyDict_Check(this->dict_)); }
 
   PyObject* dict_;
@@ -289,7 +289,7 @@ inline static bool isCallable(const PyObjectPtr& obj) {
  * Wrap a callable object.
  */
 class CallableHelper {
-public:
+ public:
   explicit CallableHelper(const PyObjectPtr& obj) : obj_(obj) {
     CHECK(py::isCallable(obj_));
   }
@@ -315,7 +315,7 @@ public:
     return PyObject_Call(obj_.get(), args.get(), kwargs.get());
   }
 
-private:
+ private:
   const PyObjectPtr& obj_;
   PyObjectPtr args;
   PyObjectPtr kwargs;
diff --git a/paddle/utils/Queue.h b/paddle/utils/Queue.h
index f054738f87c02d2d749eec8d6c7bb55b506a6d91..189e1a14f7b2d133408a50418d96431164248f0e 100644
--- a/paddle/utils/Queue.h
+++ b/paddle/utils/Queue.h
@@ -56,7 +56,7 @@ namespace paddle {
  */
 template <class T>
 class Queue {
-public:
+ public:
   /**
    * @brief Construct Function. Default capacity of Queue is zero.
    */
@@ -147,7 +147,7 @@ public:
     });
   }
 
-private:
+ private:
   std::deque<T> elements_;
   int numElements_;
   std::mutex queueLock_;
@@ -185,7 +185,7 @@ private:
  */
 template <typename T>
 class BlockingQueue {
-public:
+ public:
   /**
    * @brief Construct Function.
    * @param[in] capacity the max numer of elements the queue can have.
@@ -244,7 +244,7 @@ public:
     return queue_.empty();
   }
 
-private:
+ private:
   std::mutex mutex_;
   std::condition_variable notEmpty_;
   std::condition_variable notFull_;
diff --git a/paddle/utils/Stat.h b/paddle/utils/Stat.h
index 79fd3b8cf043e62922dfd046754ee8ac261990c5..100e9eba909466fcca57f755405ab63b638a8ebd 100644
--- a/paddle/utils/Stat.h
+++ b/paddle/utils/Stat.h
@@ -33,7 +33,7 @@ namespace paddle {
 class Stat;
 
 class StatInfo {
-public:
+ public:
   explicit StatInfo(Stat* stat = nullptr) : stat_(stat) {
     total_ = 0;
     max_ = 0;
@@ -61,7 +61,7 @@ class Stat;
 typedef std::shared_ptr<Stat> StatPtr;
 
 class StatSet {
-public:
+ public:
   explicit StatSet(const std::string& name) : name_(name) {}
   ~StatSet() {}
 
@@ -102,7 +102,7 @@ public:
   // pserver code logic, -_- ).
   void reset(bool clearRawData = true);
 
-private:
+ private:
   std::unordered_map<std::string, StatPtr> statSet_;
   const std::string name_;
   RWLock lock_;
@@ -112,7 +112,7 @@ extern StatSet globalStat;
 
 /*@brief : a simple stat*/
 class Stat {
-public:
+ public:
   explicit Stat(const std::string& statName)
       : destructStat_(nullptr), name_(statName), openThreadInfo_(false) {}
   ~Stat() {}
@@ -137,7 +137,7 @@ public:
 
   friend class StatInfo;
 
-private:
+ private:
   void mergeThreadStat(StatInfo& allThreadStat);
 
   std::mutex lock_;
@@ -164,7 +164,7 @@ inline uint64_t nowInMicroSec() {
  * A simple help class to measure time interval
  */
 class Timer {
-public:
+ public:
   explicit Timer(bool autoStart = true) : total_(0), startStamp_(0) {
     if (autoStart) {
       start();
@@ -181,13 +181,13 @@ public:
 
   void reset() { total_ = 0; }
 
-protected:
+ protected:
   uint64_t total_;
   uint64_t startStamp_;
 };
 
 class TimerOnce {
-public:
+ public:
   TimerOnce(Stat* stat,
             const char* info = "",
             uint64_t threshold = -1,
@@ -208,7 +208,7 @@ public:
     stat_->addSample(span);
   }
 
-private:
+ private:
   Stat* stat_;
   const char* info_;
   Timer timer_;
@@ -280,11 +280,11 @@ inline StatSet& registerTimerArg2(uint64_t threshold = -1,
 #endif  // DISABLE_TIMER
 
 class GpuProfiler final {
-public:
+ public:
   GpuProfiler(std::string statName, std::string info);
   ~GpuProfiler();
 
-private:
+ private:
   std::lock_guard<std::recursive_mutex> guard_;
 };
 
diff --git a/paddle/utils/Thread.h b/paddle/utils/Thread.h
index ef36a8c5b2b0e95d759da8a781d781b71d067b7a..2ee6eba1a68202282537788160a77f7689a2ffdb 100644
--- a/paddle/utils/Thread.h
+++ b/paddle/utils/Thread.h
@@ -29,7 +29,7 @@ namespace paddle {
  */
 
 class Thread {
-public:
+ public:
   /**
    * @brief Construct Function. Default thread pointer is null.
    */
@@ -62,7 +62,7 @@ public:
    */
   virtual void run() = 0;
 
-protected:
+ protected:
   std::unique_ptr<std::thread> thread_;
 };
 
@@ -73,7 +73,7 @@ protected:
  * Use addJob() to add a new job to the job queue.
  */
 class ThreadWorker : protected Thread {
-public:
+ public:
   typedef std::function<void()> JobFunc;
 
   /**
@@ -116,7 +116,7 @@ public:
     finishCV_.wait([this] { return empty_; });
   }
 
-protected:
+ protected:
   /**
    * @brief Execute jobs in the job queue sequentianlly,
    * @note If finish all the jobs in the job queue,
@@ -150,7 +150,7 @@ protected:
  * JobFunc can use tid to divide input data.
  */
 class SyncThreadPool {
-public:
+ public:
   typedef std::function<void(int tid, size_t numThreads)> JobFunc;
 
   /**
@@ -236,7 +236,7 @@ public:
     }
   }
 
-protected:
+ protected:
   /**
    * @brief Start all the workers in the pool, call their run() function.
    */
@@ -285,7 +285,7 @@ protected:
     }
   }
 
-protected:
+ protected:
   pid_t ownerThreadId_;
   bool stopping_;
   ThreadBarrier jobStartBarrier_;
@@ -323,7 +323,7 @@ protected:
  */
 template <class T>
 class MultiThreadWorker {
-public:
+ public:
   typedef T ResultType;
   typedef std::shared_ptr<ResultType> ResultPtrType;
   typedef std::function<ResultPtrType()> JobFunc;
@@ -424,7 +424,7 @@ public:
    */
   bool testResult() { return results_.empty(); }
 
-protected:
+ protected:
   /**
    * @brief Do the jobs in the job queue sequentianlly
    * and enqueue the result into the result queue.
@@ -476,7 +476,7 @@ protected:
  *    thread pool.
  */
 class AsyncThreadPool {
-public:
+ public:
   typedef std::function<void()> JobFunc;
 
   AsyncThreadPool() { LOG(FATAL) << "Not implemented"; }
@@ -594,7 +594,7 @@ public:
     }
   }
 
-protected:
+ protected:
   /**
    * @brief Execute the jobs in the job queue.
    */
@@ -606,7 +606,7 @@ protected:
     }
   }
 
-private:
+ private:
   std::vector<std::unique_ptr<std::thread>> workers_;
   Queue<JobFunc> jobs_;
   bool stopping_;
diff --git a/paddle/utils/ThreadLocal.h b/paddle/utils/ThreadLocal.h
index 0a27b8b97b83a9066af23039a317c437ea56777a..c5b07506d36875ead65887ea2e221e762be0d621 100644
--- a/paddle/utils/ThreadLocal.h
+++ b/paddle/utils/ThreadLocal.h
@@ -49,7 +49,7 @@ namespace paddle {
  */
 template <class T>
 class ThreadLocal {
-public:
+ public:
   ThreadLocal() {
     CHECK_EQ(pthread_key_create(&threadSpecificKey_, dataDestructor), 0);
   }
@@ -92,7 +92,7 @@ public:
    */
   operator T*() { return get(); }
 
-private:
+ private:
   static void dataDestructor(void* p) { delete (T*)p; }
 
   pthread_key_t threadSpecificKey_;
@@ -111,7 +111,7 @@ private:
  */
 template <class T>
 class ThreadLocalD {
-public:
+ public:
   ThreadLocalD() { CHECK_EQ(pthread_key_create(&threadSpecificKey_, NULL), 0); }
   ~ThreadLocalD() {
     pthread_key_delete(threadSpecificKey_);
@@ -150,7 +150,7 @@ public:
    */
   T& operator*() { return *get(); }
 
-private:
+ private:
   static void dataDestructor(void* p) { delete (T*)p; }
 
   void updateMap(T* p) {
@@ -172,7 +172,7 @@ private:
  * @brief Thread-safe C-style random API.
  */
 class ThreadLocalRand {
-public:
+ public:
   /**
    * initSeed just like srand,
    * called by main thread,
@@ -205,7 +205,7 @@ public:
    */
   static int getDefaultSeed() { return defaultSeed_; }
 
-protected:
+ protected:
   static unsigned int defaultSeed_;
   static ThreadLocal<unsigned int> seed_;
 };
@@ -214,7 +214,7 @@ protected:
  * @brief Thread-safe C++ style random engine.
  */
 class ThreadLocalRandomEngine {
-public:
+ public:
   /**
    * get random_engine for each thread.
    *
@@ -222,7 +222,7 @@ public:
    */
   static std::default_random_engine& get();
 
-protected:
+ protected:
   static ThreadLocal<std::default_random_engine> engine_;
 };
 
diff --git a/paddle/utils/Util.h b/paddle/utils/Util.h
index 9579881ea3b92abab0189631184bab515afb67a3..e6f05e30d308b8b94935897e947350934a5971ee 100644
--- a/paddle/utils/Util.h
+++ b/paddle/utils/Util.h
@@ -179,7 +179,7 @@ void loadFileList(const std::string& fileListFileName,
  */
 void registerInitFunction(std::function<void()> func, int priority = 0);
 class InitFunction {
-public:
+ public:
   explicit InitFunction(std::function<void()> func, int priority = 0) {
     registerInitFunction(func, priority);
   }
@@ -191,7 +191,7 @@ public:
  * When the SetDevice object is destructed, it will restore device environment.
  */
 class SetDevice {
-public:
+ public:
   explicit SetDevice(int deviceId) {
     isSet_ = deviceId >= 0;
     devId_ = 0;
@@ -206,7 +206,7 @@ public:
     }
   }
 
-protected:
+ protected:
   bool isSet_;
   int devId_;
 };
@@ -240,7 +240,7 @@ inline void enablePeerAccess(int d1, int d2) {
  * }
  */
 class AsyncGpuBlock {
-public:
+ public:
   AsyncGpuBlock() : syncFlag_(hl_get_sync_flag()) { hl_set_sync_flag(false); }
   ~AsyncGpuBlock() {
     if (syncFlag_) {
@@ -249,7 +249,7 @@ public:
     }
   }
 
-private:
+ private:
   bool syncFlag_;
 };
 
@@ -378,7 +378,7 @@ std::string join(const std::string& part1,
  * A Checker for each invoke of method in same thread.
  */
 class SameThreadChecker {
-public:
+ public:
   SameThreadChecker() {}
 
   /**
@@ -400,7 +400,7 @@ public:
         << invokeThreadId_ << " current invoked in " << curThreadId;
   }
 
-private:
+ private:
   std::once_flag onceFlag_;
   std::thread::id invokeThreadId_;
 };
@@ -421,7 +421,7 @@ private:
  */
 template <typename KType, typename VType, typename Hash>
 class WeakKVCache {
-public:
+ public:
   WeakKVCache() {}
 
   std::shared_ptr<VType> get(const KType& key,
@@ -442,7 +442,7 @@ public:
     return retVal;
   }
 
-private:
+ private:
   std::mutex lock_;
   std::unordered_map<KType, std::weak_ptr<VType>, Hash> storage_;
 };
@@ -453,7 +453,7 @@ private:
  */
 template <typename CallbackType, typename... Args>
 class ScopedCallbacks {
-public:
+ public:
   ScopedCallbacks(CallbackType enter, CallbackType exit, Args&... args)
       : exit_(std::bind(exit, args...)) {
     enter(args...);
@@ -464,7 +464,7 @@ public:
 
   ~ScopedCallbacks() { exit_(); }
 
-private:
+ private:
   std::function<void()> exit_;
 };
 
@@ -475,7 +475,7 @@ private:
  */
 template <typename T, size_t Alignment>
 class AlignedAllocator {
-public:
+ public:
   /// std campatible typedefs.
   typedef T* pointer;
   typedef const T* const_pointer;
@@ -552,12 +552,12 @@ public:
     return this->allocate(n);
   }
 
-private:
+ private:
   AlignedAllocator& operator=(const AlignedAllocator&);  // disable
 };
 
 class Deprecated {
-public:
+ public:
   explicit Deprecated(const std::string& msg = "") {
     if (msg.empty()) {
       LOG(WARNING) << "This class is deprecated, please do not use this class.";
diff --git a/paddle/utils/arch/linux/Locks.cpp b/paddle/utils/arch/linux/Locks.cpp
index a4e6c8f7b8397adc262588612c250bac5ef5eaa6..409af8bce3621c51bfd7a69c6b4ec1f9cc6be8e4 100644
--- a/paddle/utils/arch/linux/Locks.cpp
+++ b/paddle/utils/arch/linux/Locks.cpp
@@ -19,7 +19,7 @@ limitations under the License. */
 
 namespace paddle {
 class SemaphorePrivate {
-public:
+ public:
   sem_t sem;
 };
 
@@ -45,7 +45,7 @@ void Semaphore::post() { sem_post(&m->sem); }
 #ifdef PADDLE_USE_PTHREAD_SPINLOCK
 
 class SpinLockPrivate {
-public:
+ public:
   inline SpinLockPrivate() { pthread_spin_init(&lock_, 0); }
   inline ~SpinLockPrivate() { pthread_spin_destroy(&lock_); }
 
@@ -63,7 +63,7 @@ public:
 // clang-format on
 
 class SpinLockPrivate {
-public:
+ public:
   inline void lock() {
     while (lock_.test_and_set(std::memory_order_acquire)) {
     }
@@ -86,7 +86,7 @@ void SpinLock::unlock() { m->unlock(); }
 #ifdef PADDLE_USE_PTHREAD_BARRIER
 
 class ThreadBarrierPrivate {
-public:
+ public:
   pthread_barrier_t barrier_;
 
   inline explicit ThreadBarrierPrivate(int count) {
@@ -101,7 +101,7 @@ public:
 #else
 
 class ThreadBarrierPrivate {
-public:
+ public:
   pthread_mutex_t mutex_;
   pthread_cond_t cond_;
   int count_;
diff --git a/paddle/utils/arch/osx/Locks.cpp b/paddle/utils/arch/osx/Locks.cpp
index e03992363fd6051a1970664d63406b2e7a47fce3..f3905091bd024ab02c3f5d39cfed6dbc38fabbbc 100644
--- a/paddle/utils/arch/osx/Locks.cpp
+++ b/paddle/utils/arch/osx/Locks.cpp
@@ -21,7 +21,7 @@ limitations under the License. */
 namespace paddle {
 
 class SemaphorePrivate {
-public:
+ public:
   ~SemaphorePrivate() { dispatch_release(sem); }
 
   dispatch_semaphore_t sem;
@@ -45,7 +45,7 @@ void Semaphore::wait() {
 void Semaphore::post() { dispatch_semaphore_signal(m->sem); }
 
 class SpinLockPrivate {
-public:
+ public:
   std::atomic_flag lock_ = ATOMIC_FLAG_INIT;
   char padding_[64 - sizeof(lock_)];  // Padding to cache line size
 };
@@ -61,7 +61,7 @@ void SpinLock::lock() {
 void SpinLock::unlock() { m->lock_.clear(std::memory_order_release); }
 
 class ThreadBarrierPrivate {
-public:
+ public:
   pthread_mutex_t mutex_;
   pthread_cond_t cond_;
   int count_;
diff --git a/python/paddle/batch.py b/python/paddle/batch.py
index 317cf037c69f8639e3760fbfce20565127794fcb..d48c54fcbb66487617b1946bc69724870c8f879c 100644
--- a/python/paddle/batch.py
+++ b/python/paddle/batch.py
@@ -15,7 +15,7 @@
 __all__ = ['batch']
 
 
-def batch(reader, batch_size):
+def batch(reader, batch_size, drop_last=False):
     """
     Create a batched reader.
 
@@ -23,6 +23,8 @@ def batch(reader, batch_size):
     :type reader: callable
     :param batch_size: size of each mini-batch
     :type batch_size: int
+    :param drop_last: drop the last batch, if the size of last batch is not equal to batch_size.
+    :type drop_last: bool
     :return: the batched reader.
     :rtype: callable
     """
@@ -35,7 +37,7 @@ def batch(reader, batch_size):
             if len(b) == batch_size:
                 yield b
                 b = []
-        if b:
+        if drop_last == False and len(b) != 0:
             yield b
 
     return batch_reader
diff --git a/python/paddle/dataset/wmt16.py b/python/paddle/dataset/wmt16.py
index ad23338a96df6856c7e15cb5e3bb713021a55bf0..540d43b692e0f65460f558dd74a52715ff4db68d 100644
--- a/python/paddle/dataset/wmt16.py
+++ b/python/paddle/dataset/wmt16.py
@@ -96,7 +96,7 @@ def __get_dict_size(src_dict_size, trg_dict_size, src_lang):
     src_dict_size = min(src_dict_size, (TOTAL_EN_WORDS if src_lang == "en" else
                                         TOTAL_DE_WORDS))
     trg_dict_size = min(trg_dict_size, (TOTAL_DE_WORDS if src_lang == "en" else
-                                        TOTAL_ENG_WORDS))
+                                        TOTAL_EN_WORDS))
     return src_dict_size, trg_dict_size
 
 
diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index c8a435748dc5b51bf9e57b5b597e1422f0380e8e..c4fad620f0c49bb6b0ad3be22a564c16619efb0b 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -44,42 +44,45 @@ import transpiler
 from param_attr import ParamAttr, WeightNormParamAttr
 from data_feeder import DataFeeder
 from core import LoDTensor, CPUPlace, CUDAPlace, CUDAPinnedPlace
-from transpiler import DistributeTranspiler, SimpleDistributeTranspiler, InferenceTranspiler, memory_optimize, release_memory
+from transpiler import DistributeTranspiler, InferenceTranspiler, \
+    memory_optimize, release_memory
 from concurrency import (Go, make_channel, channel_send, channel_recv,
                          channel_close, Select)
+from lod_tensor import create_lod_tensor, create_random_int_lodtensor
 import clip
 import profiler
 import unique_name
 import recordio_writer
-from parallel_executor import ParallelExecutor
+import parallel_executor
+from parallel_executor import *
 
 Tensor = LoDTensor
 
-__all__ = framework.__all__ + executor.__all__ + concurrency.__all__ +\
-          trainer.__all__ + inferencer.__all__ + transpiler.__all__ + [
-    'io',
-    'initializer',
-    'layers',
-    'transpiler'
-    'nets',
-    'optimizer',
-    'learning_rate_decay',
-    'backward',
-    'regularizer',
-    'LoDTensor',
-    'CPUPlace',
-    'CUDAPlace',
-    'CUDAPinnedPlace',
-    'Tensor',
-    'ParamAttr',
-    'WeightNormParamAttr',
-    'DataFeeder',
-    'clip',
-    'profiler',
-    'unique_name',
-    'recordio_writer',
-    'ParallelExecutor',
-]
+__all__ = framework.__all__ + executor.__all__ + concurrency.__all__ + \
+          trainer.__all__ + inferencer.__all__ + transpiler.__all__ + \
+          parallel_executor.__all__ + lod_tensor.__all__ + [
+              'io',
+              'initializer',
+              'layers',
+              'transpiler'
+              'nets',
+              'optimizer',
+              'learning_rate_decay',
+              'backward',
+              'regularizer',
+              'LoDTensor',
+              'CPUPlace',
+              'CUDAPlace',
+              'CUDAPinnedPlace',
+              'Tensor',
+              'ParamAttr',
+              'WeightNormParamAttr',
+              'DataFeeder',
+              'clip',
+              'profiler',
+              'unique_name',
+              'recordio_writer',
+          ]
 
 
 def __bootstrap__():
@@ -117,7 +120,7 @@ def __bootstrap__():
     ]
     if core.is_compiled_with_cuda():
         read_env_flags += [
-            'fraction_of_gpu_memory_to_use', 'cudnn_algo_use_autotune'
+            'fraction_of_gpu_memory_to_use', 'cudnn_deterministic'
         ]
     core.init_gflags([sys.argv[0]] +
                      ["--tryfromenv=" + ",".join(read_env_flags)])
diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py
index 7af6ed1463ab737e871da487f2a687301652ef2d..4f9622d04dc98f41b503ceb780802d2a4e4c58a0 100644
--- a/python/paddle/fluid/backward.py
+++ b/python/paddle/fluid/backward.py
@@ -51,6 +51,12 @@ def _create_op_desc_(op_type, inputs, outputs, attrs):
         op_desc.set_input(para, args)
     for para, args in outputs.iteritems():
         op_desc.set_output(para, args)
+
+    op_role_attr_name = core.op_proto_and_checker_maker.kOpRoleAttrName()
+
+    if op_role_attr_name not in attrs:
+        attrs[
+            op_role_attr_name] = core.op_proto_and_checker_maker.OpRole.Backward
     for name, val in attrs.iteritems():
         if isinstance(val, framework.Block):
             op_desc.set_block_attr(name, val.desc)
@@ -141,7 +147,7 @@ def _addup_repetitive_outputs_(op_descs):
             else:
                 if len(renamed_vars[var_name]) == 1:
                     new_name = var_name + "@RENAME@" + \
-                        str(var_rename_count[var_name])
+                               str(var_rename_count[var_name])
                     var_rename_count[var_name] += 1
                     # rename original var_name
                     renamed_vars[var_name][0] = new_name
@@ -149,7 +155,7 @@ def _addup_repetitive_outputs_(op_descs):
                     _rename_arg_(pending_sum_ops, var_name, new_name)
 
                 new_name = var_name + "@RENAME@" + \
-                    str(var_rename_count[var_name])
+                           str(var_rename_count[var_name])
                 var_rename_count[var_name] += 1
                 op_desc.rename_output(var_name, new_name)
                 renamed_vars[var_name].append(new_name)
@@ -335,9 +341,12 @@ def _append_backward_ops_(block,
                                             no_grad_dict[block.idx])
 
     # append op_desc in grad_op_descs to target_block
+    op_role_attr_name = core.op_proto_and_checker_maker.kOpRoleAttrName()
+    backward = core.op_proto_and_checker_maker.OpRole.Backward
     for op_desc in grad_op_descs:
         new_op_desc = target_block.desc.append_op()
         new_op_desc.copy_from(op_desc)
+        new_op_desc.set_attr(op_role_attr_name, backward)
         grad_to_var["__current_op_desc__"] = new_op_desc
         if callbacks is not None:
             assert (isinstance(callbacks, list))
@@ -439,6 +448,22 @@ def append_backward(loss, parameter_list=None, no_grad_set=None,
         (list[(Variable,Variable)]): list of (parameter, gradient) pair.
     """
     assert isinstance(loss, framework.Variable)
+
+    if loss.op is None:
+        # the loss is from a cloned program. Find loss op manually.
+        for op in reversed(loss.block.ops):
+            assert isinstance(op, framework.Operator)
+            if len(op.output_arg_names) == 1 and op.output_arg_names[
+                    0] == loss.name:
+                loss.op = op
+                break
+        if loss.op is None:
+            raise ValueError("loss.op is None. Should not happend")
+
+    loss.op.set_attr(core.op_proto_and_checker_maker.kOpRoleAttrName(),
+                     int(core.op_proto_and_checker_maker.OpRole.Forward) |
+                     int(core.op_proto_and_checker_maker.OpRole.Loss))
+
     if callbacks is not None:
         isinstance(callbacks, list)
 
@@ -456,12 +481,16 @@ def append_backward(loss, parameter_list=None, no_grad_set=None,
     current_block_idx = program.current_block_idx
     grad_to_var = dict()
 
-    op_desc = _create_op_desc_("fill_constant", {}, {
-        "Out": [_append_grad_suffix_(loss.name)]
-    }, {"shape": [1],
-        "value": 1.0,
-        "dtype": loss.dtype,
-        "force_cpu": False})
+    op_desc = _create_op_desc_(
+        "fill_constant", {}, {"Out": [_append_grad_suffix_(loss.name)]}, {
+            "shape": [1],
+            "value": 1.0,
+            "dtype": loss.dtype,
+            "force_cpu": False,
+            core.op_proto_and_checker_maker.kOpRoleAttrName():
+            int(core.op_proto_and_checker_maker.OpRole.Backward) |
+            int(core.op_proto_and_checker_maker.OpRole.Loss),
+        })
     root_block.desc.append_op().copy_from(op_desc)
 
     block_no_grad_set = set(map(_strip_grad_suffix_, no_grad_dict[0]))
@@ -480,6 +509,8 @@ def append_backward(loss, parameter_list=None, no_grad_set=None,
 
     program.current_block_idx = current_block_idx
     program.sync_with_cpp()
+    # FIXME(zcd): prevent loss.grad optimized by mem_opt.
+    loss.block.var(_append_grad_suffix_(loss.name)).persistable = True
 
     if parameter_list is not None:
         parameters = parameter_list
@@ -503,6 +534,24 @@ def append_backward(loss, parameter_list=None, no_grad_set=None,
             params_and_grads.append((param_var, grad_var))
         else:
             params_and_grads.append((param_var, None))
+
+    op_role_var_attr_name = core.op_proto_and_checker_maker.kOpRoleVarAttrName()
+    for p, g in params_and_grads:
+        if g is None:
+            continue
+        for op in reversed(program.global_block().ops):
+            assert isinstance(op, framework.Operator)
+            if g.name in op.output_arg_names:
+                g.op = op
+                break
+
+        if g.op is None:
+            raise ValueError("Unexpected branch")
+        attr_val = [p.name, g.name]
+        if g.op.has_attr(op_role_var_attr_name):
+            attr_val.extend(g.op.attr(op_role_var_attr_name))
+        g.op.set_attr(op_role_var_attr_name, attr_val)
+
     return params_and_grads
 
 
diff --git a/python/paddle/fluid/clip.py b/python/paddle/fluid/clip.py
index 12add9e686910c3936cf17fe87a5d0b78443b270..66c3fc6b66d61bc9578f84594409ad0f24c99910 100644
--- a/python/paddle/fluid/clip.py
+++ b/python/paddle/fluid/clip.py
@@ -214,21 +214,24 @@ def set_gradient_clip(clip, param_list=None, program=None):
 
 def append_gradient_clip_ops(param_grad):
     context = dict()
-    create_op_callbacks = []
     for p, g in param_grad:
-        clip_attr = getattr(p, 'gradient_clip_attr', NullGradientClipAttr())
-        if clip_attr is None:
-            clip_attr = NullGradientClipAttr()
-        if not isinstance(clip_attr, BaseGradientClipAttr):
-            raise TypeError(
-                "clip attribute should be an instance of BaseGradientClipAttr")
+        with p.block.program.optimized_guard(p):
+            clip_attr = getattr(p, 'gradient_clip_attr', NullGradientClipAttr())
+            if clip_attr is None:
+                clip_attr = NullGradientClipAttr()
+            if not isinstance(clip_attr, BaseGradientClipAttr):
+                raise TypeError(
+                    "clip attribute should be an instance of BaseGradientClipAttr"
+                )
 
-        clip_attr.process_context(context=context, param=p, grad=g)
-        create_op_callbacks.append(
-            functools.partial(
-                clip_attr.create_operators, param=p, grad=g))
+            clip_attr.process_context(context=context, param=p, grad=g)
+
+    res = []
+    for p, g in param_grad:
+        with p.block.program.optimized_guard(p):
+            res.append(clip_attr.create_operators(param=p, grad=g))
 
-    return [each_callback() for each_callback in create_op_callbacks]
+    return res
 
 
 ClipByValue = GradientClipByValue
diff --git a/python/paddle/fluid/data_feeder.py b/python/paddle/fluid/data_feeder.py
index ac02401c79b787716b2e5f43e0d1c5686cf2bd13..7940dabcfb03cc9eb46f678365685a6e99bcceec 100644
--- a/python/paddle/fluid/data_feeder.py
+++ b/python/paddle/fluid/data_feeder.py
@@ -16,6 +16,7 @@ from __future__ import print_function
 import core
 import numpy
 import six.moves as six
+import multiprocessing
 
 from framework import Variable, default_main_program
 
@@ -35,9 +36,11 @@ class DataToLoDTensorConverter(object):
             self.dtype = 'float64'
         elif dtype == core.VarDesc.VarType.INT32:
             self.dtype = 'int32'
+        elif dtype == core.VarDesc.VarType.UINT8:
+            self.dtype = 'uint8'
         else:
             raise ValueError("dtype must be any of [int32, float32, int64, "
-                             "float64]")
+                             "float64, uint8]")
 
         self.data = []
         self.lod = []
@@ -53,9 +56,9 @@ class DataToLoDTensorConverter(object):
             self.data.append(data)
         else:
             cur_lod_len = len(data)
-            lod[-1].append(lod[-1][-1] + cur_lod_len)
+            lod[0].append(lod[0][-1] + cur_lod_len)
             for each_data in data:
-                self._feed_impl_(each_data, lod[:-1], lod_level - 1)
+                self._feed_impl_(each_data, lod[1:], lod_level - 1)
 
     def done(self):
         arr = numpy.array(self.data, dtype=self.dtype).reshape(self.shape)
@@ -116,3 +119,60 @@ class DataFeeder(object):
         for each_name, each_converter in six.zip(self.feed_names, converter):
             ret_dict[each_name] = each_converter.done()
         return ret_dict
+
+    def feed_parallel(self, iterable, num_places=None):
+        if isinstance(self.place, core.CUDAPlace):
+            places = [
+                core.CUDAPlace(i)
+                for i in six.xrange(self._get_number_of_places_(num_places))
+            ]
+        else:
+            places = [
+                core.CPUPlace()
+                for _ in six.xrange(self._get_number_of_places_(num_places))
+            ]
+
+        if len(iterable) != len(places):
+            raise ValueError("feed_parallel takes multiple mini-batches. Each "
+                             "mini-batch will be feed on each device. The "
+                             "number of devices and number of mini-batches "
+                             "must be same.")
+
+        place = self.place
+        for p, batch in six.zip(places, iterable):
+            self.place = p
+            yield self.feed(batch)
+        self.place = place
+
+    def _get_number_of_places_(self, num_places):
+        if num_places is not None:
+            return int(num_places)
+        elif isinstance(self.place, core.CUDAPlace):
+            return core.get_cuda_device_count()
+        else:
+            return multiprocessing.cpu_count()
+
+    def decorate_reader(self,
+                        reader,
+                        multi_devices,
+                        num_places=None,
+                        drop_last=True):
+        def __reader_creator__():
+            if not multi_devices:
+                for item in reader():
+                    yield self.feed(item)
+            else:
+                num = self._get_number_of_places_(num_places)
+                item = []
+                for batch in reader():
+                    item.append(batch)
+                    if len(item) == num:
+                        yield list(self.feed_parallel(item, num))
+                        item = []
+                if not drop_last and len(item) != 0:
+                    raise ValueError(
+                        "The data batch which cannot fit for devices will be "
+                        "dropped is not implementation. Other strategies are "
+                        "not implemented")
+
+        return __reader_creator__
diff --git a/python/paddle/fluid/debuger.py b/python/paddle/fluid/debugger.py
similarity index 100%
rename from python/paddle/fluid/debuger.py
rename to python/paddle/fluid/debugger.py
diff --git a/python/paddle/fluid/evaluator.py b/python/paddle/fluid/evaluator.py
index 1ee1d3727174c079d2c217dede27ff1a0316c01c..7c6ad6f27dcfd7040f79c72c01413c8cc84a28ba 100644
--- a/python/paddle/fluid/evaluator.py
+++ b/python/paddle/fluid/evaluator.py
@@ -273,10 +273,11 @@ class DetectionMAP(Evaluator):
             [M, 6]. The layout is [label, confidence, xmin, ymin, xmax, ymax].
         gt_label (Variable): The ground truth label index, which is a LoDTensor
             with shape [N, 1].
-        gt_difficult (Variable): Whether this ground truth is a difficult
-            bounding box (bbox), which is a LoDTensor [N, 1].
         gt_box (Variable): The ground truth bounding box (bbox), which is a
             LoDTensor with shape [N, 6]. The layout is [xmin, ymin, xmax, ymax].
+        gt_difficult (Variable|None): Whether this ground truth is a difficult
+            bounding bbox, which can be a LoDTensor [N, 1] or not set. If None,
+            it means all the ground truth labels are not difficult bbox.
         class_num (int): The class number.
         background_label (int): The index of background label, the background
             label will be ignored. If set to -1, then all categories will be
@@ -284,7 +285,8 @@ class DetectionMAP(Evaluator):
         overlap_threshold (float): The threshold for deciding true/false
             positive, 0.5 by defalut.
         evaluate_difficult (bool): Whether to consider difficult ground truth
-            for evaluation, True by defalut.
+            for evaluation, True by defalut. This argument does not work when
+            gt_difficult is None.
         ap_version (string): The average precision calculation ways, it must be
             'integral' or '11point'. Please check
             https://sanchom.wordpress.com/tag/average-precision/ for details.
@@ -295,7 +297,7 @@ class DetectionMAP(Evaluator):
 
         exe = fluid.executor(place)
         map_evaluator = fluid.Evaluator.DetectionMAP(input,
-            gt_label, gt_difficult, gt_box)
+            gt_label, gt_box, gt_difficult)
         cur_map, accum_map = map_evaluator.get_map_var()
         fetch = [cost, cur_map, accum_map]
         for epoch in PASS_NUM:
@@ -313,8 +315,8 @@ class DetectionMAP(Evaluator):
                  input,
                  gt_label,
                  gt_box,
-                 gt_difficult,
-                 class_num,
+                 gt_difficult=None,
+                 class_num=None,
                  background_label=0,
                  overlap_threshold=0.5,
                  evaluate_difficult=True,
@@ -322,8 +324,11 @@ class DetectionMAP(Evaluator):
         super(DetectionMAP, self).__init__("map_eval")
 
         gt_label = layers.cast(x=gt_label, dtype=gt_box.dtype)
-        gt_difficult = layers.cast(x=gt_difficult, dtype=gt_box.dtype)
-        label = layers.concat([gt_label, gt_difficult, gt_box], axis=1)
+        if gt_difficult:
+            gt_difficult = layers.cast(x=gt_difficult, dtype=gt_box.dtype)
+            label = layers.concat([gt_label, gt_difficult, gt_box], axis=1)
+        else:
+            label = layers.concat([gt_label, gt_box], axis=1)
 
         # calculate mean average precision (mAP) of current mini-batch
         map = layers.detection_map(
diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py
index 7ad028714d3b47d93328dbf7c3297d55a2db1bd0..33d8f709412b25d29c6618272500dd7b953d6645 100644
--- a/python/paddle/fluid/executor.py
+++ b/python/paddle/fluid/executor.py
@@ -170,6 +170,8 @@ def get_program_cache_key(feed, fetch_list):
             return var.desc.name()
         elif isinstance(var, str):
             return var
+        elif isinstance(var, basestring):
+            return str(var)
         else:
             raise TypeError(str(var) + " should be Variable or str")
 
@@ -299,14 +301,18 @@ class Executor(object):
         if feed is None:
             feed = {}
         if not isinstance(feed, dict):
-            raise TypeError("feed should be a map")
+            raise TypeError(
+                "feed requires dict as its Parameter. But you passed in %s" %
+                (type(feed)))
         if fetch_list is None:
             fetch_list = []
         if program is None:
             program = default_main_program()
 
         if not isinstance(program, Program):
-            raise TypeError()
+            raise TypeError(
+                "Executor requires Program as its Parameter. But you passed in %s"
+                % (type(program)))
 
         if scope is None:
             scope = global_scope()
diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index d7eda619c34a20fa09a30afdcf90047d66a05cbf..bbd35aaecba27ea9fd66b9be585a972690980ab8 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -72,6 +72,10 @@ def convert_np_dtype_to_dtype_(np_dtype):
         return core.VarDesc.VarType.INT64
     elif dtype == np.bool:
         return core.VarDesc.VarType.BOOL
+    elif dtype == np.uint16:
+        return core.VarDesc.VarType.INT16
+    elif dtype == np.uint8:
+        return core.VarDesc.VarType.UINT8
     else:
         raise ValueError("Not supported numpy dtype " + str(dtype))
 
@@ -160,6 +164,7 @@ class Variable(object):
                  persistable=None,
                  error_clip=None,
                  stop_gradient=False,
+                 is_data=False,
                  **kwargs):
         self.block = block
         self.error_clip = error_clip
@@ -238,6 +243,7 @@ class Variable(object):
         self.block.vars[name] = self
         self.op = None
         self.stop_gradient = stop_gradient
+        self.is_data = is_data
 
     def __str__(self):
         return self.to_string(True)
@@ -357,6 +363,13 @@ class OpProtoHolder(object):
             raise ValueError("Operator \"%s\" has not been registered." % type)
         return self.op_proto_map[type]
 
+    @staticmethod
+    def generated_op_attr_names():
+        return {
+            core.op_proto_and_checker_maker.kOpRoleAttrName(),
+            core.op_proto_and_checker_maker.kOpRoleVarAttrName()
+        }
+
 
 class Operator(object):
     """
@@ -364,6 +377,13 @@ class Operator(object):
     Block. Users can use the build in instructions to describe their neural
     network.
     """
+    OP_WITHOUT_KERNEL_SET = {
+        'feed', 'fetch', 'save', 'load', 'recurrent', 'go',
+        'rnn_memory_helper_grad', 'conditional_block', 'while', 'send', 'recv',
+        'listen_and_serv', 'parallel_do', 'save_combine', 'load_combine',
+        'ncclInit', 'channel_create', 'channel_close', 'channel_send',
+        'channel_recv', 'select'
+    }
 
     def __init__(self,
                  block,
@@ -400,6 +420,23 @@ class Operator(object):
         self.block = block
         self.desc = desc
         self.attrs = attrs
+        if self.attrs is None:
+            self.attrs = dict()
+        del attrs
+
+        op_maker = core.op_proto_and_checker_maker
+
+        if op_maker.kOpRoleAttrName() not in self.attrs:
+            self.attrs[op_maker.kOpRoleAttrName()] = self.block.program.op_role
+
+        role_var_name = op_maker.kOpRoleVarAttrName()
+        if len(self.block.program.
+               op_role_var) != 0 and role_var_name not in self.attrs:
+            self.attrs[role_var_name] = self.block.program.op_role_var
+
+        if role_var_name in self.attrs and len(self.attrs[role_var_name]) == 0:
+            del self.attrs[role_var_name]
+
         if len(self.desc.type()) != 0:
             return
         if type is None:
@@ -465,34 +502,31 @@ class Operator(object):
                     arg.op = self
                 self.desc.set_output(out_proto.name, out_arg_names)
 
-        if attrs is not None:
-            if not isinstance(attrs, dict):
+        if self.attrs is not None:
+            if not isinstance(self.attrs, dict):
                 raise TypeError("'attrs' should be a dict.")
             for attr in proto.attrs:
                 attr_name = attr.name
-                if (attr_name not in attrs) or (attrs[attr_name] is None):
+                if (attr_name not in self.attrs) or (
+                        self.attrs[attr_name] is None):
                     continue
-                if isinstance(attrs[attr_name], Block):
-                    self.desc.set_block_attr(attr_name, attrs[attr_name].desc)
-                elif isinstance(attrs[attr_name], core.BlockDesc) or \
-                   isinstance(attrs[attr_name], core.ProgramDesc):
+                if isinstance(self.attrs[attr_name], Block):
+                    self.desc.set_block_attr(attr_name,
+                                             self.attrs[attr_name].desc)
+                elif isinstance(self.attrs[attr_name], core.BlockDesc) or \
+                        isinstance(self.attrs[attr_name], core.ProgramDesc):
                     self.desc.set_serialized_attr(
-                        attr_name, attrs[attr_name].serialize_to_string())
+                        attr_name, self.attrs[attr_name].serialize_to_string())
                 else:
-                    self.desc.set_attr(attr_name, attrs[attr_name])
-
+                    self.desc.set_attr(attr_name, self.attrs[attr_name])
         self.desc.check_attrs()
-        no_kernel_op_set = {
-            'feed', 'fetch', 'save', 'load', 'recurrent', 'go',
-            'rnn_memory_helper_grad', 'conditional_block', 'while', 'send',
-            'recv', 'listen_and_serv', 'parallel_do', 'save_combine',
-            'load_combine', 'ncclInit', 'channel_create', 'channel_close',
-            'channel_send', 'channel_recv', 'select'
-        }
-        if type not in no_kernel_op_set:
+        if self.has_kernel(type):
             self.desc.infer_var_type(self.block.desc)
             self.desc.infer_shape(self.block.desc)
 
+    def has_kernel(self, op_type):
+        return op_type not in self.OP_WITHOUT_KERNEL_SET
+
     def to_string(self, throw_on_error):
         """
         To debug string.
@@ -608,6 +642,10 @@ class Operator(object):
         """
         return self.desc.attr_type(name)
 
+    def set_attr(self, name, val):
+        self.attrs[name] = val
+        self.desc.set_attr(name, val)
+
     @property
     def attr_names(self):
         """
@@ -716,7 +754,9 @@ class Block(object):
 
     def var(self, name):
         if not isinstance(name, basestring):
-            raise TypeError()
+            raise TypeError(
+                "var require string as parameter, but get %s instead." %
+                (type(name)))
         v = self.vars.get(name, None)
         if v is None:
             raise ValueError("var %s not in this block" % name)
@@ -771,7 +811,7 @@ class Block(object):
         Rename variable in vars and ops' inputs and outputs
         """
         if not self.has_var(name):
-            raise ValueError("var %s is not in current" % name)
+            raise ValueError("var %s is not in current block" % name)
         v = self.var(name)
         if type(v) == Parameter:
             var_type = "Parameter"
@@ -817,6 +857,7 @@ class Block(object):
         self.vars[new_name] = var
         del self.vars[name]
         self.sync_with_cpp()
+        return var
 
     def remove_var(self, name):
         self.sync_with_cpp()
@@ -978,7 +1019,8 @@ class Block(object):
                 shape=var.shape,
                 dtype=var.dtype,
                 type=var.type,
-                persistable=True)
+                persistable=True,
+                is_data=var.is_data)
         else:
             ret_var = self.create_var(
                 name=var.name,
@@ -986,7 +1028,8 @@ class Block(object):
                 dtype=var.dtype,
                 type=var.type,
                 lod_level=var.lod_level,
-                persistable=True)
+                persistable=True,
+                is_data=var.is_data)
         return ret_var
 
 
@@ -996,6 +1039,33 @@ class Program(object):
         self.blocks = [Block(self, 0)]
         self.current_block_idx = 0
         self._seed = 0
+        self._current_role = core.op_proto_and_checker_maker.OpRole.Forward
+        self._op_role_var = []
+
+    @property
+    def op_role(self):
+        return self._current_role
+
+    @op_role.setter
+    def set_op_role(self, role):
+        self._current_role = role
+
+    @property
+    def op_role_var(self):
+        return self._op_role_var
+
+    @op_role_var.setter
+    def set_op_role_var(self, var_name):
+        self._op_role_var = [var_name]
+
+    @contextlib.contextmanager
+    def optimized_guard(self, var):
+        OpRole = core.op_proto_and_checker_maker.OpRole
+        self._current_role = OpRole.Optimize
+        self._op_role_var = [var.name if isinstance(var, Variable) else var]
+        yield
+        self._op_role_var = []
+        self._current_role = OpRole.Forward
 
     def __str__(self):
         return self.to_string(True)
@@ -1051,6 +1121,7 @@ class Program(object):
             p.sync_with_cpp()
 
         p.copy_param_info_from(self)
+        p.copy_data_info_from(self)
         return p
 
     def prune(self, targets):
@@ -1172,6 +1243,26 @@ class Program(object):
                              "program, with represent the same topology")
         self.global_block().copy_param_info_from(other.global_block())
 
+    def copy_data_info_from(self, other):
+        """
+        Copy the information of data variables from other program.
+        Args:
+            other(Program): Other program
+
+        Returns:
+            None
+        """
+        if not isinstance(other, Program):
+            raise TypeError("copy_param_info_from should be invoked with "
+                            "Program")
+
+        if len(self.blocks) != len(other.blocks):
+            raise ValueError("copy_param_info_from should be invoked with two "
+                             "program, with represent the same topology")
+        for var in other.global_block().vars.itervalues():
+            if var.is_data:
+                self.global_block().var(var.name).is_data = True
+
     def list_vars(self):
         for each_block in self.blocks:
             for each_var in each_block.vars.itervalues():
diff --git a/python/paddle/fluid/inferencer.py b/python/paddle/fluid/inferencer.py
index b38526bc574a103ece86aecdecf06b0bcfd6cad0..9f242cf29a56573349f192307a68e135a409a4be 100644
--- a/python/paddle/fluid/inferencer.py
+++ b/python/paddle/fluid/inferencer.py
@@ -12,35 +12,70 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import contextlib
+
 import core
-import framework
+
 import executor
+import framework
 import io
+import parallel_executor
+import unique_name
+from trainer import check_and_get_place
+
 __all__ = ['Inferencer', ]
 
 
 class Inferencer(object):
-    def __init__(self, network_func, param_path=None, place=None):
-        # 1. we need to generate a framework.Program by calling
-        # network_func. Reference: fluid.program_guard in test_word2vec.py
+    def __init__(self, infer_func, param_path, place=None, parallel=False):
+        """
+        :param infer_func: a function that will return predict Variable
+        :param param_path: the path where the inference model is saved by fluid.io.save_params
+        :param place: place to do the inference
+        :param parallel: use parallel_executor to run the inference, it will use multi CPU/GPU.
+        """
+        self.param_path = param_path
+        self.scope = core.Scope()
+        self.parallel = parallel
+        self.place = check_and_get_place(place)
 
-        # 2. move the default_main_program to self.program.
+        self.inference_program = framework.Program()
+        with framework.program_guard(self.inference_program):
+            with unique_name.guard():
+                self.predict_var = infer_func()
 
-        # 3. run the default_startup program.
+        with self._prog_and_scope_guard():
+            # load params from param_path into scope
+            io.load_params(executor.Executor(self.place), param_path)
 
-        # 4. load params from param_path into scope
-        self.scope = core.Scope()
-        self.place = place
-        self.startup_program = framework.Program()
-        # TODO: generate the startup_program with network_func
+        if parallel:
+            with self._prog_and_scope_guard():
+                self.exe = parallel_executor.ParallelExecutor(
+                    use_cuda=isinstance(self.place, core.CUDAPlace),
+                    loss_name=self.predict_var.name)
+        else:
+            self.exe = executor.Executor(self.place)
 
-        exe = executor.Executor(place)
-        exe.run(self.startup_program, scope=self.scope)
+    def infer(self, inputs, return_numpy=True):
+        """
+        :param inputs: a map of {"input_name": input_var} that will be feed into the inference program
+        to get the predict value
+        :return: the predict value of the inference model
+        """
+        if not isinstance(inputs, dict):
+            raise ValueError(
+                "inputs should be a map of {'input_name': input_var}")
 
-        if param_path:
-            # load params from param_path into scope
-            io.load_persistables(exe, dirname=param_path)
+        with executor.scope_guard(self.scope):
+            results = self.exe.run(self.inference_program,
+                                   feed=inputs,
+                                   fetch_list=[self.predict_var],
+                                   return_numpy=return_numpy)
+
+        return results
 
-    def infer(self, inputs):
-        # run self.program
-        pass
+    @contextlib.contextmanager
+    def _prog_and_scope_guard(self):
+        with framework.program_guard(main_program=self.inference_program):
+            with executor.scope_guard(self.scope):
+                yield
diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py
index 08b8a878b6490bc989620085f3f9c06c7032d882..8e58e5eb794e1bb507ab05394a1f7b57a1d2ed42 100644
--- a/python/paddle/fluid/io.py
+++ b/python/paddle/fluid/io.py
@@ -13,21 +13,18 @@
 # limitations under the License.
 
 import os
+import time
+import shutil
 
 from paddle.fluid.evaluator import Evaluator
 from paddle.fluid.framework import Program, Parameter, default_main_program, Variable
 from . import core
 
 __all__ = [
-    'save_vars',
-    'save_params',
-    'save_persistables',
-    'load_vars',
-    'load_params',
-    'load_persistables',
-    'save_inference_model',
-    'load_inference_model',
-    'get_inference_program',
+    'save_vars', 'save_params', 'save_persistables', 'load_vars', 'load_params',
+    'load_persistables', 'save_inference_model', 'load_inference_model',
+    'get_inference_program', 'save_checkpoint', 'load_checkpoint',
+    'clean_checkpoint'
 ]
 
 
@@ -195,6 +192,8 @@ def load_vars(executor,
         load_var_map = {}
         for each_var in vars:
             assert isinstance(each_var, Variable)
+            if each_var.type == core.VarDesc.VarType.RAW:
+                continue
             new_var = _clone_var_in_block_(load_block, each_var)
             if filename is None:
                 load_block.append_op(
@@ -263,6 +262,9 @@ def get_inference_program(target_vars, main_program=None):
 def prepend_feed_ops(inference_program,
                      feed_target_names,
                      feed_holder_name='feed'):
+    if len(feed_target_names) == 0:
+        return
+
     global_block = inference_program.global_block()
     feed_var = global_block.create_var(
         name=feed_holder_name,
@@ -323,9 +325,10 @@ def save_inference_model(dirname,
     if isinstance(feeded_var_names, basestring):
         feeded_var_names = [feeded_var_names]
     else:
-        if not (bool(feeded_var_names) and all(
-                isinstance(name, basestring) for name in feeded_var_names)):
-            raise ValueError("'feed_var_names' should be a list of str.")
+        if len(feeded_var_names) > 0:
+            if not (bool(feeded_var_names) and all(
+                    isinstance(name, basestring) for name in feeded_var_names)):
+                raise ValueError("'feed_var_names' should be a list of str.")
 
     if isinstance(target_vars, Variable):
         target_vars = [target_vars]
@@ -450,3 +453,192 @@ def get_parameter_value_by_name(name, executor, program=None):
         program = default_main_program()
     var = program.global_block().var(name)
     return get_parameter_value(var, executor)
+
+
+SUCCESS_MARK_FILENAME = "_SUCCESS"
+CHECKPOINT_PREFIX = "checkpoint"
+CHECKPOINT_SEPARATOR = "_"
+
+
+def save_checkpoint(executor,
+                    checkpoint_dir=None,
+                    max_num_checkpoints=3,
+                    save_interval_secs=600,
+                    main_program=None):
+    """
+    Save Checkpoint will save persistable LodTensor variables from main_program in checkpoint directory,
+    the directory named by serial number from 0 to (n -1), save_checkpoint use LRU strategy
+    to keep numbers of checkpoint directory,  the numbers of checkpoint directory are max_num_checkpoints at most,
+    The interval between two saved checkpoints must greater than save_interval_secs.
+
+    :param executor
+    :param checkpoint_dir
+    :param max_num_checkpoints
+    :param save_interval_secs
+    :param main_program
+    """
+    if checkpoint_dir is None:
+        checkpoint_dir = os.getcwd()
+
+    if not os.path.isdir(checkpoint_dir):
+        os.makedirs(checkpoint_dir)
+
+    serial = _get_lastest_checkpoint_dir(checkpoint_dir)
+    if serial >= 0 and not _interval_secs_exceed(
+            _get_serial_dir(serial, checkpoint_dir), save_interval_secs):
+        return
+
+    serial += 1
+    cur_dir = _get_serial_dir(serial, checkpoint_dir)
+
+    save_vars(
+        executor,
+        dirname=cur_dir,
+        main_program=main_program,
+        vars=None,
+        predicate=_is_checkpoint_var,
+        filename=None)
+    _write_success(cur_dir)
+    _lru_delete(checkpoint_dir, max_num_checkpoints)
+
+
+def load_checkpoint(executor, checkpoint_dir=None, main_program=None):
+    """
+    Load checkpoint from a directory by executor,
+    it will find  the most recent saved checkpoint file and load it auto.
+
+    :param executor
+    :param checkpoint_dir
+    :param main_program
+    """
+
+    if checkpoint_dir is None:
+        checkpoint_dir = os.getcwd()
+
+    serial = _get_lastest_checkpoint_dir(checkpoint_dir)
+
+    if serial < 0:
+        return
+
+    cur_dir = _get_serial_dir(serial, checkpoint_dir)
+
+    load_vars(
+        executor,
+        dirname=cur_dir,
+        main_program=main_program,
+        predicate=_is_checkpoint_var,
+        filename=None)
+
+
+def clean_checkpoint(checkpoint_dir, delete_dir=False):
+    """
+    clean the checkpoint dir, when the train exits normally, the trainer will call clean_checkpoint to delete checkpoint directory saved before.
+    delete_dir only works when the directory is empty, otherwise, OSError is raised.  
+    """
+    if checkpoint_dir is None:
+        checkpoint_dir = os.getcwd()
+    _lru_delete(checkpoint_dir, max_num_checkpoints=0)
+
+    if delete_dir and not os.listdir(checkpoint_dir):
+        os.rmdir(checkpoint_dir)
+
+
+def _get_serial_dir(serial, checkpoint_dir):
+    serial_folder = CHECKPOINT_PREFIX + CHECKPOINT_SEPARATOR + str(serial)
+    return os.path.join(checkpoint_dir, serial_folder)
+
+
+def _is_checkpoint_var(var):
+    """
+    the checkpoint will not save or load all the variables.
+    var type is FEED_MINIBATCH/FETCH_LIST/RAW or var name ends with @GRAD are discarded.
+
+    :param var
+    """
+    if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \
+            var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \
+            var.desc.type() == core.VarDesc.VarType.RAW:
+        return False
+
+    if var.name.endswith("@GRAD"):
+        return False
+
+    return var.persistable
+
+
+def _interval_secs_exceed(dirname, save_interval_secs):
+    dir_time = os.path.getmtime(dirname)
+    if save_interval_secs > (time.time() - dir_time):
+        return False
+    return True
+
+
+def _lru_delete(dirname, max_num_checkpoints=3):
+    dirs = os.listdir(dirname)
+    serials = []
+    for serial in dirs:
+        try:
+            serials.append(int(serial))
+        except ValueError:
+            continue
+
+    if len(serials) <= max_num_checkpoints:
+        return
+
+    serials.sort(reverse=True)
+    serials = serials[max_num_checkpoints:]
+    for serial in serials:
+        cur_dir = os.path.join(dirname, str(serial))
+        shutil.rmtree(cur_dir)
+
+
+def _write_success(dirname):
+    """
+    write an empty file named "_SUCCESS" in checkpoint dir, indicate this checkpoint is correct.
+
+    :param dirname
+    """
+    success_file = os.path.join(dirname, SUCCESS_MARK_FILENAME)
+    with open(success_file, 'a') as f:
+        now = time.ctime()
+        f.write(now)
+
+
+def _get_lastest_checkpoint_dir(checkpoint_dir):
+    """
+    get the latest file in checkpoint directory, the _SUCCESS file must exist in the directory
+
+    :param checkpoint_dir
+    """
+    if not checkpoint_dir.strip():
+        return -1
+
+    def has_success(checkpoint_dir, cur_dir):
+        """
+        is _SUCCESS in this dir
+        """
+        _, serial = cur_dir.split(CHECKPOINT_SEPARATOR)
+
+        try:
+            int(serial)
+        except ValueError:
+            return -1
+
+        if not os.path.isdir(os.path.join(checkpoint_dir, cur_dir)):
+            return -1
+
+        success_path = os.path.join(
+            _get_serial_dir(serial, checkpoint_dir), SUCCESS_MARK_FILENAME)
+        if os.path.isfile(success_path):
+            return int(serial)
+
+    if not os.path.isdir(checkpoint_dir):
+        return -1
+
+    current_dir = -1
+    dirs = os.listdir(checkpoint_dir)
+    for cur_dir in dirs:
+        success_num = has_success(checkpoint_dir, cur_dir)
+        if success_num > current_dir:
+            current_dir = success_num
+    return current_dir
diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py
index 4b707973e27391a6bdcba138934f62a255e04bb2..d1ea9f148566d20988a43f4c9d421c4452697ef1 100644
--- a/python/paddle/fluid/layers/control_flow.py
+++ b/python/paddle/fluid/layers/control_flow.py
@@ -49,6 +49,7 @@ __all__ = [
     'reorder_lod_tensor_by_rank',
     'ParallelDo',
     'Print',
+    'is_empty',
 ]
 
 
@@ -1097,7 +1098,7 @@ class ConditionalBlock(object):
         input_set = set([ipt.name for ipt in self.inputs])
 
         param_list = [
-            parent_block.var(each_name) for each_name in params
+            parent_block.var_recursive(each_name) for each_name in params
             if each_name not in input_set
         ]
 
@@ -1562,3 +1563,40 @@ def reorder_lod_tensor_by_rank(x, rank_table):
                 'RankTable': [rank_table]},
         outputs={'Out': [out]})
     return out
+
+
+def is_empty(x, cond=None, **ignored):
+    """
+    **Is Empty**
+
+    This layer returns the truth value of whether the variable is empty.
+
+    Args:
+        x(Variable): Operand of *is_empty*
+        cond(Variable|None): Optional output variable to store the result
+                             of *is_empty*
+
+    Returns:
+        Variable: The tensor variable storing the output of *is_empty*.
+
+    Raises:
+        TypeError: If input cond is not a variable, or cond's dtype is
+                   not bool
+
+    Examples:
+        .. code-block:: python
+
+          less = fluid.layers.is_empty(x=input)
+    """
+    helper = LayerHelper("is_empty", **locals())
+    if cond is None:
+        cond = helper.create_tmp_variable(dtype='bool')
+        cond.stop_gradient = True
+    elif not isinstance(cond, Variable):
+        raise TypeError("cond takes a variable")
+    elif cond.dtype != 'bool':
+        raise TypeError("The data type of cond must be bool")
+
+    helper.append_op(
+        type='is_empty', inputs={'X': [x]}, outputs={'Out': [cond]})
+    return cond
diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py
index a5938fe494265778ef7032c56a8d6d35acd729c5..3a83db12fd13651578deeac6b562bac2f1e4e4b6 100644
--- a/python/paddle/fluid/layers/detection.py
+++ b/python/paddle/fluid/layers/detection.py
@@ -23,6 +23,7 @@ import nn
 import math
 
 __all__ = [
+    'prior_box',
     'multi_box_head',
     'bipartite_match',
     'target_assign',
@@ -564,6 +565,115 @@ def ssd_loss(location,
     return loss
 
 
+def prior_box(input,
+              image,
+              min_sizes,
+              max_sizes=None,
+              aspect_ratios=[1.],
+              variance=[0.1, 0.1, 0.2, 0.2],
+              flip=False,
+              clip=False,
+              steps=[0.0, 0.0],
+              offset=0.5,
+              name=None):
+    """
+    **Prior box operator**
+
+    Generate prior boxes for SSD(Single Shot MultiBox Detector) algorithm.
+    Each position of the input produce N prior boxes, N is determined by
+    the count of min_sizes, max_sizes and aspect_ratios, The size of the
+    box is in range(min_size, max_size) interval, which is generated in
+    sequence according to the aspect_ratios.
+
+    Args:
+       input(Variable): The Input Variables, the format is NCHW.
+       image(Variable): The input image data of PriorBoxOp,
+            the layout is NCHW.
+       min_sizes(list|tuple|float value): min sizes of generated prior boxes.
+       max_sizes(list|tuple|None): max sizes of generated prior boxes.
+            Default: None.
+       aspect_ratios(list|tuple|float value): the aspect ratios of generated
+            prior boxes. Default: [1.].
+       variance(list|tuple): the variances to be encoded in prior boxes.
+            Default:[0.1, 0.1, 0.2, 0.2].
+       flip(bool): Whether to flip aspect ratios. Default:False.
+       clip(bool): Whether to clip out-of-boundary boxes. Default: False.
+       step(list|turple): Prior boxes step across width and height, If
+            step[0] == 0.0/step[1] == 0.0, the prior boxes step across
+            height/weight of the input will be automatically calculated.
+            Default: [0., 0.]
+       offset(float): Prior boxes center offset. Default: 0.5
+       name(str): Name of the prior box op. Default: None.
+
+    Returns:
+        boxes(Variable): the output prior boxes of PriorBox.
+             The layout is [H, W, num_priors, 4].
+             H is the height of input, W is the width of input,
+             num_priors is the total
+             box count of each position of input.
+        Variances(Variable): the expanded variances of PriorBox.
+             The layout is [H, W, num_priors, 4].
+             H is the height of input, W is the width of input
+             num_priors is the total
+             box count of each position of input
+
+
+    Examples:
+        .. code-block:: python
+            box, var = prior_box(
+            input=conv1,
+            image=images,
+            min_sizes=[100.],
+            flip=True,
+            clip=True)
+    """
+    helper = LayerHelper("prior_box", **locals())
+    dtype = helper.input_dtype()
+
+    def _is_list_or_tuple_(data):
+        return (isinstance(data, list) or isinstance(data, tuple))
+
+    if not _is_list_or_tuple_(min_sizes):
+        min_sizes = [min_sizes]
+    if not _is_list_or_tuple_(aspect_ratios):
+        aspect_ratios = [aspect_ratios]
+    if not (_is_list_or_tuple_(steps) and len(steps) == 2):
+        raise ValueError('steps should be a list or tuple ',
+                         'with length 2, (step_width, step_height).')
+
+    min_sizes = list(map(float, min_sizes))
+    aspect_ratios = list(map(float, aspect_ratios))
+    steps = list(map(float, steps))
+
+    attrs = {
+        'min_sizes': min_sizes,
+        'aspect_ratios': aspect_ratios,
+        'variances': variance,
+        'flip': flip,
+        'clip': clip,
+        'step_w': steps[0],
+        'step_h': steps[1],
+        'offset': offset
+    }
+    if max_sizes is not None and len(max_sizes) > 0 and max_sizes[0] > 0:
+        if not _is_list_or_tuple_(max_sizes):
+            max_sizes = [max_sizes]
+        attrs['max_sizes'] = max_sizes
+
+    box = helper.create_tmp_variable(dtype)
+    var = helper.create_tmp_variable(dtype)
+    helper.append_op(
+        type="prior_box",
+        inputs={"Input": input,
+                "Image": image},
+        outputs={"Boxes": box,
+                 "Variances": var},
+        attrs=attrs, )
+    box.stop_gradient = True
+    var.stop_gradient = True
+    return box, var
+
+
 def multi_box_head(inputs,
                    image,
                    base_size,
@@ -660,47 +770,6 @@ def multi_box_head(inputs,
             clip=True)
     """
 
-    def _prior_box_(input,
-                    image,
-                    min_sizes,
-                    max_sizes,
-                    aspect_ratios,
-                    variance,
-                    flip=False,
-                    clip=False,
-                    step_w=0.0,
-                    step_h=0.0,
-                    offset=0.5,
-                    name=None):
-        helper = LayerHelper("prior_box", **locals())
-        dtype = helper.input_dtype()
-
-        attrs = {
-            'min_sizes': min_sizes,
-            'aspect_ratios': aspect_ratios,
-            'variances': variance,
-            'flip': flip,
-            'clip': clip,
-            'step_w': step_w,
-            'step_h': step_h,
-            'offset': offset
-        }
-        if len(max_sizes) > 0 and max_sizes[0] > 0:
-            attrs['max_sizes'] = max_sizes
-
-        box = helper.create_tmp_variable(dtype)
-        var = helper.create_tmp_variable(dtype)
-        helper.append_op(
-            type="prior_box",
-            inputs={"Input": input,
-                    "Image": image},
-            outputs={"Boxes": box,
-                     "Variances": var},
-            attrs=attrs, )
-        box.stop_gradient = True
-        var.stop_gradient = True
-        return box, var
-
     def _reshape_with_axis_(input, axis=1):
         if not (axis > 0 and axis < len(input.shape)):
             raise ValueError("The axis should be smaller than "
@@ -777,11 +846,10 @@ def multi_box_head(inputs,
             aspect_ratio = aspect_ratios[i]
             if not _is_list_or_tuple_(aspect_ratio):
                 aspect_ratio = [aspect_ratio]
+        step = [step_w[i] if step_w else 0.0, step_h[i] if step_w else 0.0]
 
-        box, var = _prior_box_(input, image, min_size, max_size, aspect_ratio,
-                               variance, flip, clip, step_w[i]
-                               if step_w else 0.0, step_h[i]
-                               if step_w else 0.0, offset)
+        box, var = prior_box(input, image, min_size, max_size, aspect_ratio,
+                             variance, flip, clip, step, offset)
 
         box_results.append(box)
         var_results.append(var)
diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py
index 0a6befd1485a1f79d63873c47a9fd74ab4214f57..a56f3ea9db6b9fabf9d78f102d394a0817a44a98 100644
--- a/python/paddle/fluid/layers/io.py
+++ b/python/paddle/fluid/layers/io.py
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import contextlib
 
 from .. import core
 from ..framework import convert_np_dtype_to_dtype_, default_main_program, default_startup_program, Program
@@ -21,7 +22,8 @@ from ..executor import global_scope
 
 __all__ = [
     'data', 'BlockGuardServ', 'ListenAndServ', 'Send', 'open_recordio_file',
-    'open_files', 'read_file', 'shuffle', 'batch', 'double_buffer'
+    'open_files', 'read_file', 'shuffle', 'batch', 'double_buffer',
+    'random_data_generator', 'Preprocessor'
 ]
 
 
@@ -78,8 +80,8 @@ def data(name,
         dtype=dtype,
         type=type,
         stop_gradient=stop_gradient,
-        lod_level=lod_level)
-    data_var.is_data = True
+        lod_level=lod_level,
+        is_data=True)
     return data_var
 
 
@@ -193,21 +195,23 @@ def Send(endpoints, send_vars, get_vars=None):
     endpoints = list(set(epmap))
 
     helper = LayerHelper("Send", **locals())
-    rpc_client_var = default_main_program().global_block().create_var(
-        name="RPC_CLIENT_VAR", persistable=True, type=core.VarDesc.VarType.RAW)
     if not get_vars:
         get_vars = []
         for s in send_vars:
             v = helper.create_tmp_variable(dtype=s.dtype, stop_gradient=True)
             get_vars.append(v)
+    rpc_op_role_name = core.op_proto_and_checker_maker.kOpRoleAttrName()
 
     helper.append_op(
         type="send",
         inputs={"X": send_vars},
-        outputs={"Out": get_vars,
-                 "RPCClient": rpc_client_var},
-        attrs={"endpoints": endpoints,
-               "epmap": epmap})
+        outputs={"Out": get_vars},
+        attrs={
+            "endpoints": endpoints,
+            "epmap": epmap,
+            rpc_op_role_name: core.op_proto_and_checker_maker.OpRole.RPC
+        })
+
     return get_vars
 
 
@@ -321,7 +325,7 @@ def open_recordio_file(filename,
                                           dtypes=['float32', 'int64'])
 
          # Via the reader, we can use 'read_file' layer to get data:
-         image, label = fluid.layers.read_file(reader)
+         image, label = fluid.layers.io.read_file(reader)
     """
     dtypes = [convert_np_dtype_to_dtype_(dt) for dt in dtypes]
     shape_concat = []
@@ -359,11 +363,78 @@ def open_recordio_file(filename,
     return monkey_patch_reader_methods(main_prog_var)
 
 
+def random_data_generator(low, high, shapes, lod_levels, for_parallel=True):
+    """
+    Create a uniform random data generator
+
+    This layer returns a Reader Variable.
+    Instead of opening a file and reading data from it, this 
+    Reader Variable generates float uniform random data by itself. 
+    It can be used as a dummy reader to test a network without 
+    opening a real file.
+
+    Args:
+       low(float): The lower bound of data's uniform distribution.
+       high(float): The upper bound of data's uniform distribution.
+       shapes(list): List of tuples which declaring data shapes.
+       lod_levels(list): List of ints which declaring data lod_level.
+       for_parallel(Bool): Set it as True if you are going to run
+            subsequent operators in parallel.
+
+    Returns:
+       Variable: A Reader Variable from which we can get random data.
+
+    Examples:
+       .. code-block:: python
+
+         reader = fluid.layers.io.random_data_generator(
+                                          low=0.0,
+                                          high=1.0,
+                                          shapes=[(3,224,224), (1)],
+                                          lod_levels=[0, 0])
+
+         # Via the reader, we can use 'read_file' layer to get data:
+         image, label = fluid.layers.io.read_file(reader)
+    """
+    dtypes = [core.VarDesc.VarType.FP32] * len(shapes)
+    shape_concat = []
+    ranks = []
+
+    for shape in shapes:
+        shape_concat.extend(shape)
+        ranks.append(len(shape))
+
+    var_name = unique_name('random_data_generator')
+
+    startup_blk = default_startup_program().current_block()
+    startup_var = startup_blk.create_var(name=var_name)
+    startup_blk.append_op(
+        type='create_random_data_generator',
+        outputs={'Out': [startup_var]},
+        attrs={
+            'low': low,
+            'high': high,
+            'shape_concat': shape_concat,
+            'lod_levels': lod_levels,
+            'ranks': ranks
+        })
+
+    startup_var.desc.set_dtypes(dtypes)
+    startup_var.persistable = True
+    main_prog_var = _copy_reader_var_(default_main_program().current_block(),
+                                      startup_var)
+
+    if for_parallel:
+        main_prog_var = parallel(reader=main_prog_var)
+
+    return monkey_patch_reader_methods(main_prog_var)
+
+
 def open_files(filenames,
                shapes,
                lod_levels,
                dtypes,
-               thread_num,
+               thread_num=1,
                buffer_size=None,
                pass_num=1,
                for_parallel=True):
@@ -468,8 +539,6 @@ def __create_unshared_decorated_reader__(op_type, reader, attrs, name=None):
         inputs={'UnderlyingReader': reader},
         outputs={'Out': [new_reader]},
         attrs=attrs)
-    new_reader.persistable = True
-    new_reader.stop_gradient = True
     return monkey_patch_reader_methods(new_reader)
 
 
@@ -514,3 +583,82 @@ def read_file(file_obj):
         return out[0]
     else:
         return out
+
+
+class Preprocessor(object):
+    BEFORE_SUB_BLOCK = 0
+    IN_SUB_BLOCK = 1
+    AFTER_SUB_BLOCK = 2
+
+    def __init__(self, reader, name=None):
+        self.underlying_reader = reader
+        new_reader_name = name if name is not None else unique_name(
+            "create_custom_reader")
+        self.main_prog = default_main_program()
+        self.reader = self.main_prog.current_block().create_var(
+            name=new_reader_name)
+        self.sub_block = None
+        self.source_var_names = None
+        self.sink_var_names = None
+        self.status = Preprocessor.BEFORE_SUB_BLOCK
+
+    def is_completed(self):
+        return self.sub_block and self.source_var_names and self.sink_var_names
+
+    @contextlib.contextmanager
+    def block(self):
+        self.status = Preprocessor.IN_SUB_BLOCK
+        self.sub_block = self.main_prog.create_block()
+        yield
+        self.main_prog.rollback()
+        self.status = Preprocessor.AFTER_SUB_BLOCK
+        if not self.is_completed():
+            raise RuntimeError(
+                "The definition of preprocessor is incompleted! "
+                "Please make sure that you have set input and output "
+                "variables by invoking 'inputs' and 'outputs' in "
+                "Preprocessor's sub-block.")
+
+    def inputs(self):
+        if self.status != Preprocessor.IN_SUB_BLOCK:
+            raise RuntimeError(
+                "Preprocessor.inputs() can only be invoked inside the sub-block."
+            )
+
+        source_shapes = self.underlying_reader.desc.shapes()
+        source_dtypes = self.underlying_reader.desc.dtypes()
+        source_lod_levels = self.underlying_reader.desc.lod_levels()
+        self.source_var_names = [
+            unique_name("preprocessor_source")
+            for _ in xrange(len(source_shapes))
+        ]
+        source_vars = []
+        for var_name, shape, dtype, lod_level in zip(
+                self.source_var_names, source_shapes, source_dtypes,
+                source_lod_levels):
+            source_vars.append(self.main_prog.current_block().create_var(
+                name=var_name, shape=shape, dtype=dtype, lod_level=lod_level))
+        return source_vars
+
+    def outputs(self, *outs):
+        if self.status != Preprocessor.IN_SUB_BLOCK:
+            raise RuntimeError(
+                "Preprocessor.outputs() can only be invoked inside the sub-block."
+            )
+        self.sink_var_names = [var.name for var in outs]
+
+    def __call__(self, *args, **kwargs):
+        if self.status != Preprocessor.AFTER_SUB_BLOCK:
+            raise RuntimeError(
+                "Preprocessor output can only be retrieved after rnn block.")
+
+        self.main_prog.current_block().append_op(
+            type="create_custom_reader",
+            inputs={'UnderlyingReader': self.underlying_reader},
+            outputs={'Out': [self.reader]},
+            attrs={
+                "sub_block": self.sub_block,
+                "source_var_names": self.source_var_names,
+                "sink_var_names": self.sink_var_names
+            })
+        return monkey_patch_reader_methods(self.reader)
diff --git a/python/paddle/fluid/layers/layer_function_generator.py b/python/paddle/fluid/layers/layer_function_generator.py
index 35b01a79914b3427836d4abd51aa2e2eb471d517..904413cc11b50f80d3c4730bf66ec359f9285ae6 100644
--- a/python/paddle/fluid/layers/layer_function_generator.py
+++ b/python/paddle/fluid/layers/layer_function_generator.py
@@ -15,16 +15,13 @@ import re
 import cStringIO
 import functools
 import warnings
+import string
 
 from ..proto import framework_pb2
 from ..framework import OpProtoHolder, Variable
 from ..layer_helper import LayerHelper
 
-__all__ = [
-    'deprecated',
-    'generate_layer_fn',
-    'autodoc',
-]
+__all__ = ['deprecated', 'generate_layer_fn', 'autodoc', 'templatedoc']
 
 
 def _convert_(name):
@@ -43,6 +40,10 @@ def _convert_(name):
     return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower()
 
 
+def _type_to_str_(tp):
+    return framework_pb2.AttrType.Name(tp)
+
+
 def _generate_doc_string_(op_proto):
     """
     Generate docstring by OpProto
@@ -54,9 +55,6 @@ def _generate_doc_string_(op_proto):
         str: the document string
     """
 
-    def _type_to_str_(tp):
-        return framework_pb2.AttrType.Name(tp)
-
     if not isinstance(op_proto, framework_pb2.OpProto):
         raise TypeError("OpProto should be `framework_pb2.OpProto`")
 
@@ -75,7 +73,11 @@ def _generate_doc_string_(op_proto):
         buf.write(str(each_input.dispensable))
         buf.write('\n')
 
+    skip_attrs = OpProtoHolder.generated_op_attr_names()
+
     for each_attr in op_proto.attrs:
+        if each_attr.name in skip_attrs:
+            continue
         buf.write('    ')
         buf.write(each_attr.name)
         buf.write(' (')
@@ -113,7 +115,7 @@ def generate_layer_fn(op_type):
 
     if len(not_intermediate_outputs) != 1:
         raise ValueError("Only one non intermediate output operator can be",
-                         "automatically generated.")
+                         "automatically generated. {0}".format(op_type))
 
     if not_intermediate_outputs[0].duplicable:
         raise ValueError(
@@ -220,3 +222,49 @@ def autodoc(comment=""):
         return func
 
     return __impl__
+
+
+def templatedoc():
+    """
+    Decorator of layer function. It will use the docstring from the layer
+    function as the template. The template arguments are:
+
+    * ${comment}: The operator comment written in CPP.
+    * ${{name}_comment}: The comment of ${name} written with AddAttr, AddOutput,
+        and AddInput. The ${name} is Python snake style. i.e., xxx_xxx.
+    * ${{name}_type}: The type of ${name}.
+
+    Returns:
+        Decorated function.
+    """
+
+    def __impl__(func):
+        op_proto = OpProtoHolder.instance().get_op_proto(func.__name__)
+        tmpl = string.Template(func.__doc__)
+
+        comment_lines = op_proto.comment.split("\n")
+        comment = ""
+        for line in comment_lines:
+            line = line.lstrip()
+            comment += line
+            comment += "\n"
+
+        args = {"comment": comment}
+        for each_input in op_proto.inputs:
+            input_name = _convert_(each_input.name)
+            args["{0}_comment".format(input_name)] = each_input.comment
+            args["{0}_type".format(input_name)] = "Variable"
+        for each_attr in op_proto.attrs:
+            input_name = _convert_(each_attr.name)
+            args["{0}_comment".format(input_name)] = each_attr.comment
+            args["{0}_type".format(input_name)] = _type_to_str_(each_attr.type)
+
+        for each_opt in op_proto.outputs:
+            output_name = _convert_(each_opt.name)
+            args["{0}_comment".format(output_name)] = each_opt.comment
+            args["{0}_type".format(output_name)] = "Variable"
+
+        func.__doc__ = tmpl.substitute(args)
+        return func
+
+    return __impl__
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 1786be22fdcd0d074b45bc94b3b0c4e8c41b4e8a..ddaeb415af4320c233aa7d01130fe1da2cdcbfa8 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -19,9 +19,10 @@ from ..layer_helper import LayerHelper
 from ..initializer import Normal, Constant
 from ..framework import Variable
 from ..param_attr import ParamAttr
-from layer_function_generator import autodoc
+from layer_function_generator import autodoc, templatedoc
 from tensor import concat
 import utils
+import random
 
 __all__ = [
     'fc',
@@ -80,6 +81,12 @@ __all__ = [
     'pad',
     'label_smooth',
     'roi_pool',
+    'dice_loss',
+    'image_resize',
+    'image_resize_short',
+    'resize_bilinear',
+    'gather',
+    'random_crop',
 ]
 
 
@@ -152,7 +159,8 @@ def fc(input,
     Examples:
         .. code-block:: python
 
-          data = fluid.layers.data(name="data", shape=[32, 32], dtype="float32")
+          data = fluid.layers.data(
+              name="data", shape=[32, 32], dtype="float32")
           fc = fluid.layers.fc(input=data, size=1000, act="tanh")
     """
 
@@ -175,11 +183,8 @@ def fc(input,
             inputs={"X": input_var,
                     "Y": w},
             outputs={"Out": tmp},
-            attrs={
-                "x_num_col_dims": num_flatten_dims,
-                "y_num_col_dims": 1,
-                "use_mkldnn": use_mkldnn
-            })
+            attrs={"x_num_col_dims": num_flatten_dims,
+                   "y_num_col_dims": 1})
         mul_results.append(tmp)
 
     if len(mul_results) == 1:
@@ -347,7 +352,8 @@ def dynamic_lstm(input,
         cell_activation(str): The activation for cell output. Choices = ["sigmoid",
                               "tanh", "relu", "identity"], default "tanh".
         candidate_activation(str): The activation for candidate hidden state.
-                              Choices = ["sigmoid", "tanh", "relu", "identity"],
+                              Choices = ["sigmoid", "tanh",
+                                  "relu", "identity"],
                               default "tanh".
         dtype(str): Data type. Choices = ["float32", "float64"], default "float32".
         name(str|None): A name for this layer(optional). If set None, the layer
@@ -514,10 +520,12 @@ def dynamic_lstmp(input,
         cell_activation(str): The activation for cell output. Choices = ["sigmoid",
                               "tanh", "relu", "identity"], default "tanh".
         candidate_activation(str): The activation for candidate hidden state.
-                              Choices = ["sigmoid", "tanh", "relu", "identity"],
+                              Choices = ["sigmoid", "tanh",
+                                  "relu", "identity"],
                               default "tanh".
         proj_activation(str): The activation for projection output.
-                              Choices = ["sigmoid", "tanh", "relu", "identity"],
+                              Choices = ["sigmoid", "tanh",
+                                  "relu", "identity"],
                               default "tanh".
         dtype(str): Data type. Choices = ["float32", "float64"], default "float32".
         name(str|None): A name for this layer(optional). If set None, the layer
@@ -699,8 +707,8 @@ def dynamic_gru(input,
 def gru_unit(input,
              hidden,
              size,
-             weight=None,
-             bias=None,
+             param_attr=None,
+             bias_attr=None,
              activation='tanh',
              gate_activation='sigmoid'):
     """
@@ -731,8 +739,8 @@ def gru_unit(input,
         input (Variable): The fc transformed input value of current step.
         hidden (Variable): The hidden value of lstm unit from previous step.
         size (integer): The input dimension value.
-        weight (ParamAttr): The weight parameters for gru unit. Default: None
-        bias (ParamAttr): The bias parameters for gru unit. Default: None
+        param_attr (ParamAttr): The weight parameters for gru unit. Default: None
+        bias_attr (ParamAttr): The bias parameters for gru unit. Default: None
         activation (string): The activation type for cell (actNode).
                              Default: 'tanh'
         gate_activation (string): The activation type for gates (actGate).
@@ -764,40 +772,52 @@ def gru_unit(input,
     size = size / 3
 
     # create weight
-    if weight is None:
-        weight = helper.create_parameter(
-            attr=helper.param_attr, shape=[size, 3 * size], dtype=dtype)
+    weight = helper.create_parameter(
+        attr=helper.param_attr, shape=[size, 3 * size], dtype=dtype)
 
+    gate = helper.create_tmp_variable(dtype)
+    reset_hidden_pre = helper.create_tmp_variable(dtype)
+    updated_hidden = helper.create_tmp_variable(dtype)
+    inputs = {'Input': input, 'HiddenPrev': hidden, 'Weight': weight}
     # create bias
-
-    if bias is None:
+    if helper.bias_attr:
         bias_size = [1, 3 * size]
         bias = helper.create_parameter(
             attr=helper.bias_attr, shape=bias_size, dtype=dtype, is_bias=True)
-
-    gate = helper.create_tmp_variable(dtype)
-    reset_hidden_pre = helper.create_tmp_variable(dtype)
-    updated_hidden = helper.create_tmp_variable(dtype)
+        inputs['Bias'] = bias
 
     helper.append_op(
         type='gru_unit',
-        inputs={'Input': input,
-                'HiddenPrev': hidden,
-                'Weight': weight},
+        inputs=inputs,
         outputs={
             'Gate': gate,
             'ResetHiddenPrev': reset_hidden_pre,
             'Hidden': updated_hidden,
         },
         attrs={
-            'activation': 0,
-            'gate_activation': 1,
+            'activation': 2,  # tanh
+            'gate_activation': 1,  # sigmoid
         })
 
     return updated_hidden, reset_hidden_pre, gate
 
 
+@templatedoc()
 def linear_chain_crf(input, label, param_attr=None):
+    """
+    Linear Chain CRF.
+
+    ${comment}
+
+    Args:
+        input(${emission_type}): ${emission_comment}
+        label(${label_type}): ${label_comment}
+        param_attr(ParamAttr): The attribute of the learnable parameter.
+
+    Returns:
+        ${log_likelihood_comment}
+
+    """
     helper = LayerHelper('linear_chain_crf', **locals())
     size = input.shape[1]
     transition = helper.create_parameter(
@@ -823,7 +843,19 @@ def linear_chain_crf(input, label, param_attr=None):
     return log_likelihood
 
 
+@templatedoc()
 def crf_decoding(input, param_attr, label=None):
+    """
+    ${comment}
+
+    Args:
+        input(${emission_type}): ${emission_comment}
+        param_attr(ParamAttr): The parameter attribute for training.
+        label(${label_type}): ${label_comment}
+
+    Returns:
+        ${viterbi_path_comment}
+    """
     helper = LayerHelper('crf_decoding', **locals())
     transition = helper.get_parameter(param_attr.name)
     viterbi_path = helper.create_tmp_variable(dtype=helper.input_dtype())
@@ -856,7 +888,7 @@ def cos_sim(X, Y):
     return out
 
 
-def dropout(x, dropout_prob, is_test=False, seed=None):
+def dropout(x, dropout_prob, is_test=False, seed=None, name=None):
     """
     Computes dropout.
 
@@ -874,6 +906,8 @@ def dropout(x, dropout_prob, is_test=False, seed=None):
                   parameter is set to None, a random seed is used.
                   NOTE: If an integer seed is given, always the same output
                   units will be dropped. DO NOT use a fixed seed in training.
+       name(str|None): A name for this layer(optional). If set None, the layer
+                    will be named automatically.
 
     Returns:
         Variable: A tensor variable.
@@ -1118,7 +1152,7 @@ def sequence_softmax(input, param_attr=None, bias_attr=None, use_cudnn=True):
     return softmax_out
 
 
-def softmax(input, param_attr=None, bias_attr=None, use_cudnn=True):
+def softmax(input, param_attr=None, bias_attr=None, use_cudnn=True, name=None):
     helper = LayerHelper('softmax', **locals())
     dtype = helper.input_dtype()
     softmax_out = helper.create_tmp_variable(dtype)
@@ -1176,19 +1210,19 @@ def conv2d(input,
 
         - Input:
 
-          Input shape: $(N, C_{in}, H_{in}, W_{in})$
+          Input shape: :math:`(N, C_{in}, H_{in}, W_{in})`
 
-          Filter shape: $(C_{out}, C_{in}, H_f, W_f)$
+          Filter shape: :math:`(C_{out}, C_{in}, H_f, W_f)`
 
         - Output:
-          Output shape: $(N, C_{out}, H_{out}, W_{out})$
+          Output shape: :math:`(N, C_{out}, H_{out}, W_{out})`
 
         Where
 
         .. math::
 
-        H_{out}&= \\frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (H_f - 1) + 1))}{strides[0]} + 1 \\\\
-        W_{out}&= \\frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]} + 1
+            H_{out}&= \\frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (H_f - 1) + 1))}{strides[0]} + 1 \\\\
+            W_{out}&= \\frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]} + 1
 
     Args:
        input(Variable): The input image with [N, C, H, W] format.
@@ -1329,6 +1363,8 @@ def sequence_pool(input, pool_type):
          sqrt   : out.data = [2.82, 6.93, 4.24], where 2.82=(1+3)/sqrt(2),
                     6.93=(2+4+6)/sqrt(3), 4.24=(5+1)/sqrt(2)
          max    : out.data = [3, 6, 5], where 3=max(1,3), 6=max(2,4,6), 5=max(5,1)
+         last   : out.data = [3, 6, 1], where 3=last(1,3), 6=last(2,4,6), 1=last(5,1)
+         first  : out.data = [1, 2, 5], where 1=first(1,3), 2=first(2,4,6), 5=first(5,1)
 
     Args:
         input(variable): The input variable which is a LoDTensor.
@@ -1348,6 +1384,8 @@ def sequence_pool(input, pool_type):
              sum_x = fluid.layers.sequence_pool(input=x, pool_type='sum')
              sqrt_x = fluid.layers.sequence_pool(input=x, pool_type='sqrt')
              max_x = fluid.layers.sequence_pool(input=x, pool_type='max')
+             last_x = fluid.layers.sequence_pool(input=x, pool_type='last')
+             first_x = fluid.layers.sequence_pool(input=x, pool_type='first')
     """
     helper = LayerHelper('sequence_pool', **locals())
     dtype = helper.input_dtype()
@@ -1706,6 +1744,7 @@ def conv2d_transpose(input,
                      padding=0,
                      stride=1,
                      dilation=1,
+                     groups=None,
                      param_attr=None,
                      bias_attr=None,
                      use_cudnn=True,
@@ -1776,6 +1815,12 @@ def conv2d_transpose(input,
        dilation(int|tuple): The dilation size. If dilation is a tuple, it must
            contain two integers, (dilation_H, dilation_W). Otherwise, the
            dilation_H = dilation_W = dilation. Default: dilation = 1.
+       groups(int): The groups number of the Conv2d transpose layer. Inspired by
+           grouped convolution in Alex Krizhevsky's Deep CNN paper, in which
+           when group=2, the first half of the filters is only connected to the
+           first half of the input channels, while the second half of the
+           filters is only connected to the second half of the input channels.
+           Default: groups=1
        param_attr(ParamAttr): The parameters to the Conv2d_transpose Layer.
                               Default: None
        bias_attr(ParamAttr): Bias parameter for the Conv2d layer. Default: None
@@ -1830,7 +1875,8 @@ def conv2d_transpose(input,
         filter_size = utils.convert_to_list(filter_size, 2,
                                             'conv2d_transpose.filter_size')
 
-    filter_shape = [input_channel, num_filters] + filter_size
+    groups = 1 if groups is None else groups
+    filter_shape = [input_channel, num_filters / groups] + filter_size
     img_filter = helper.create_parameter(
         dtype=input.dtype, shape=filter_shape, attr=helper.param_attr)
 
@@ -1844,6 +1890,7 @@ def conv2d_transpose(input,
             'strides': stride,
             'paddings': padding,
             'dilations': dilation,
+            'groups': groups,
             'use_cudnn': use_cudnn
         })
 
@@ -2080,11 +2127,11 @@ def reduce_sum(input, dim=None, keep_dim=False, name=None):
 
     Args:
         input (Variable): The input variable which is a Tensor or LoDTensor.
-        dim (int|None): The dimension along which the sum is performed. If
+        dim (list|int|None): The dimensions along which the sum is performed. If
             :attr:`None`, sum all elements of :attr:`input` and return a
             Tensor variable with a single element, otherwise must be in the
-            range :math:`[-rank(input), rank(input))`. If :math:`dim < 0`,
-            the dimension to reduce is :math:`rank + dim`.
+            range :math:`[-rank(input), rank(input))`. If :math:`dim[i] < 0`,
+            the dimension to reduce is :math:`rank + dim[i]`.
         keep_dim (bool|False): Whether to reserve the reduced dimension in the
             output Tensor. The result tensor will have one fewer dimension
             than the :attr:`input` unless :attr:`keep_dim` is true.
@@ -2105,15 +2152,25 @@ def reduce_sum(input, dim=None, keep_dim=False, name=None):
             fluid.layers.reduce_sum(x, dim=0)  # [0.3, 0.5, 1.1, 1.6]
             fluid.layers.reduce_sum(x, dim=-1)  # [1.9, 1.6]
             fluid.layers.reduce_sum(x, dim=1, keep_dim=True)  # [[1.9], [1.6]]
+
+            # x is a Tensor variable with shape [2, 2, 2] and elements as below:
+            #      [[[1, 2], [3, 4]],
+            #      [[5, 6], [7, 8]]]
+            # Each example is followed by the correspending output tensor.
+            fluid.layers.reduce_sum(x, dim=[1, 2]) # [10, 26]
+            fluid.layers.reduce_sum(x, dim=[0, 1]) # [16, 20]
+
     """
     helper = LayerHelper('reduce_sum', **locals())
     out = helper.create_tmp_variable(dtype=helper.input_dtype())
+    if dim is not None and not isinstance(dim, list):
+        dim = [dim]
     helper.append_op(
         type='reduce_sum',
         inputs={'X': input},
         outputs={'Out': out},
         attrs={
-            'dim': dim if dim != None else 0,
+            'dim': dim if dim != None else [0],
             'keep_dim': keep_dim,
             'reduce_all': True if dim == None else False
         })
@@ -2126,11 +2183,11 @@ def reduce_mean(input, dim=None, keep_dim=False, name=None):
 
     Args:
         input (Variable): The input variable which is a Tensor or LoDTensor.
-        dim (int|None): The dimension along which the mean is computed. If
+        dim (list|int|None): The dimensions along which the mean is computed. If
             :attr:`None`, compute the mean over all elements of :attr:`input`
             and return a Tensor variable with a single element, otherwise
             must be in the range :math:`[-rank(input), rank(input))`. If
-            :math:`dim < 0`, the dimension to reduce is :math:`rank + dim`.
+            :math:`dim[i] < 0`, the dimension to reduce is :math:`rank + dim[i]`.
         keep_dim (bool): Whether to reserve the reduced dimension in the
             output Tensor. The result tensor will have one fewer dimension
             than the :attr:`input` unless :attr:`keep_dim` is true.
@@ -2150,16 +2207,26 @@ def reduce_mean(input, dim=None, keep_dim=False, name=None):
             fluid.layers.reduce_mean(x)  # [0.4375]
             fluid.layers.reduce_mean(x, dim=0)  # [0.15, 0.25, 0.55, 0.8]
             fluid.layers.reduce_mean(x, dim=-1)  # [0.475, 0.4]
-            fluid.layers.reduce_mean(x, dim=1, keep_dim=True)  # [[0.475], [0.4]]
+            fluid.layers.reduce_mean(
+                x, dim=1, keep_dim=True)  # [[0.475], [0.4]]
+
+            # x is a Tensor variable with shape [2, 2, 2] and elements as below:
+            #      [[[1.0, 2.0], [3.0, 4.0]],
+            #      [[5.0, 6.0], [7.0, 8.0]]]
+            # Each example is followed by the correspending output tensor.
+            fluid.layers.reduce_mean(x, dim=[1, 2]) # [2.5, 6.5]
+            fluid.layers.reduce_mean(x, dim=[0, 1]) # [4.0, 5.0]
     """
     helper = LayerHelper('reduce_mean', **locals())
     out = helper.create_tmp_variable(dtype=helper.input_dtype())
+    if dim is not None and not isinstance(dim, list):
+        dim = [dim]
     helper.append_op(
         type='reduce_mean',
         inputs={'X': input},
         outputs={'Out': out},
         attrs={
-            'dim': dim if dim != None else 0,
+            'dim': dim if dim != None else [0],
             'keep_dim': keep_dim,
             'reduce_all': True if dim == None else False
         })
@@ -2172,11 +2239,11 @@ def reduce_max(input, dim=None, keep_dim=False, name=None):
 
     Args:
         input (Variable): The input variable which is a Tensor or LoDTensor.
-        dim (int|None): The dimension along which the maximum is computed.
+        dim (list|int|None): The dimension along which the maximum is computed.
             If :attr:`None`, compute the maximum over all elements of
             :attr:`input` and return a Tensor variable with a single element,
             otherwise must be in the range :math:`[-rank(input), rank(input))`.
-            If :math:`dim < 0`, the dimension to reduce is :math:`rank + dim`.
+            If :math:`dim[i] < 0`, the dimension to reduce is :math:`rank + dim[i]`.
         keep_dim (bool): Whether to reserve the reduced dimension in the
             output Tensor. The result tensor will have one fewer dimension
             than the :attr:`input` unless :attr:`keep_dim` is true.
@@ -2197,15 +2264,24 @@ def reduce_max(input, dim=None, keep_dim=False, name=None):
             fluid.layers.reduce_max(x, dim=0)  # [0.2, 0.3, 0.6, 0.9]
             fluid.layers.reduce_max(x, dim=-1)  # [0.9, 0.7]
             fluid.layers.reduce_max(x, dim=1, keep_dim=True)  # [[0.9], [0.7]]
+
+            # x is a Tensor variable with shape [2, 2, 2] and elements as below:
+            #      [[[1.0, 2.0], [3.0, 4.0]],
+            #      [[5.0, 6.0], [7.0, 8.0]]]
+            # Each example is followed by the correspending output tensor.
+            fluid.layers.reduce_max(x, dim=[1, 2]) # [4.0, 8.0]
+            fluid.layers.reduce_max(x, dim=[0, 1]) # [7.0, 8.0]
     """
     helper = LayerHelper('reduce_max', **locals())
     out = helper.create_tmp_variable(dtype=helper.input_dtype())
+    if dim is not None and not isinstance(dim, list):
+        dim = [dim]
     helper.append_op(
         type='reduce_max',
         inputs={'X': input},
         outputs={'Out': out},
         attrs={
-            'dim': dim if dim != None else 0,
+            'dim': dim if dim != None else [0],
             'keep_dim': keep_dim,
             'reduce_all': True if dim == None else False
         })
@@ -2218,11 +2294,11 @@ def reduce_min(input, dim=None, keep_dim=False, name=None):
 
     Args:
         input (Variable): The input variable which is a Tensor or LoDTensor.
-        dim (int|None): The dimension along which the minimum is computed.
+        dim (list|int|None): The dimensions along which the minimum is computed.
             If :attr:`None`, compute the minimum over all elements of
             :attr:`input` and return a Tensor variable with a single element,
             otherwise must be in the range :math:`[-rank(input), rank(input))`.
-            If :math:`dim < 0`, the dimension to reduce is :math:`rank + dim`.
+            If :math:`dim[i] < 0`, the dimension to reduce is :math:`rank + dim[i]`.
         keep_dim (bool): Whether to reserve the reduced dimension in the
             output Tensor. The result tensor will have one fewer dimension
             than the :attr:`input` unless :attr:`keep_dim` is true.
@@ -2243,15 +2319,24 @@ def reduce_min(input, dim=None, keep_dim=False, name=None):
             fluid.layers.reduce_min(x, dim=0)  # [0.1, 0.2, 0.5, 0.7]
             fluid.layers.reduce_min(x, dim=-1)  # [0.2, 0.1]
             fluid.layers.reduce_min(x, dim=1, keep_dim=True)  # [[0.2], [0.1]]
+
+            # x is a Tensor variable with shape [2, 2, 2] and elements as below:
+            #      [[[1.0, 2.0], [3.0, 4.0]],
+            #      [[5.0, 6.0], [7.0, 8.0]]]
+            # Each example is followed by the correspending output tensor.
+            fluid.layers.reduce_min(x, dim=[1, 2]) # [1.0, 5.0]
+            fluid.layers.reduce_min(x, dim=[0, 1]) # [1.0, 2.0]
     """
     helper = LayerHelper('reduce_min', **locals())
     out = helper.create_tmp_variable(dtype=helper.input_dtype())
+    if dim is not None and not isinstance(dim, list):
+        dim = [dim]
     helper.append_op(
         type='reduce_min',
         inputs={'X': input},
         outputs={'Out': out},
         attrs={
-            'dim': dim if dim != None else 0,
+            'dim': dim if dim != None else [0],
             'keep_dim': keep_dim,
             'reduce_all': True if dim == None else False
         })
@@ -2264,11 +2349,11 @@ def reduce_prod(input, dim=None, keep_dim=False, name=None):
 
     Args:
         input (Variable): The input variable which is a Tensor or LoDTensor.
-        dim (int|None): The dimension along which the product is performed. If
+        dim (list|int|None): The dimensions along which the product is performed. If
             :attr:`None`, multipy all elements of :attr:`input` and return a
             Tensor variable with a single element, otherwise must be in the
-            range :math:`[-rank(input), rank(input))`. If :math:`dim < 0`,
-            the dimension to reduce is :math:`rank + dim`.
+            range :math:`[-rank(input), rank(input))`. If :math:`dim[i] < 0`,
+            the dimension to reduce is :math:`rank + dim[i]`.
         keep_dim (bool|False): Whether to reserve the reduced dimension in the
             output Tensor. The result tensor will have one fewer dimension
             than the :attr:`input` unless :attr:`keep_dim` is true.
@@ -2290,15 +2375,24 @@ def reduce_prod(input, dim=None, keep_dim=False, name=None):
             fluid.layers.reduce_prod(x, dim=-1)  # [0.027, 0.0084]
             fluid.layers.reduce_prod(x, dim=1,
                                      keep_dim=True)  # [[0.027], [0.0084]]
+
+            # x is a Tensor variable with shape [2, 2, 2] and elements as below:
+            #      [[[1.0, 2.0], [3.0, 4.0]],
+            #      [[5.0, 6.0], [7.0, 8.0]]]
+            # Each example is followed by the correspending output tensor.
+            fluid.layers.reduce_prod(x, dim=[1, 2]) # [24.0, 1680.0]
+            fluid.layers.reduce_prod(x, dim=[0, 1]) # [105.0, 384.0]
     """
     helper = LayerHelper('reduce_prod', **locals())
     out = helper.create_tmp_variable(dtype=helper.input_dtype())
+    if dim is not None and not isinstance(dim, list):
+        dim = [dim]
     helper.append_op(
         type='reduce_prod',
         inputs={'X': input},
         outputs={'Out': out},
         attrs={
-            'dim': dim if dim != None else 0,
+            'dim': dim if dim != None else [0],
             'keep_dim': keep_dim,
             'reduce_all': True if dim == None else False
         })
@@ -2333,7 +2427,8 @@ def split(input, num_or_sections, dim=-1, name=None):
             x0.shape  # [3, 3, 5]
             x1.shape  # [3, 3, 5]
             x2.shape  # [3, 3, 5]
-            x0, x1, x2 = fluid.layers.split(x, num_or_sections=[2, 3, 4], dim=1)
+            x0, x1, x2 = fluid.layers.split(
+                x, num_or_sections=[2, 3, 4], dim=1)
             x0.shape  # [3, 2, 5]
             x1.shape  # [3, 3, 5]
             x2.shape  # [3, 4, 5]
@@ -2401,7 +2496,6 @@ def l2_normalize(x, axis, epsilon=1e-12, name=None):
 
     if len(x.shape) == 1:
         axis = 0
-
     helper = LayerHelper("l2_normalize", **locals())
 
     square = helper.create_tmp_variable(dtype=x.dtype)
@@ -2413,7 +2507,7 @@ def l2_normalize(x, axis, epsilon=1e-12, name=None):
         inputs={"X": square},
         outputs={"Out": reduced_sum},
         attrs={
-            "dim": 1 if axis is None else axis,
+            "dim": [1] if axis is None else [axis],
             "keep_dim": True,
             "reduce_all": False
         })
@@ -2553,7 +2647,7 @@ def matmul(x, y, transpose_x=False, transpose_y=False, name=None):
     return out
 
 
-def topk(input, k):
+def topk(input, k, name=None):
     """
     This operator is used to find values and indices of the k largest entries
     for the last dimension.
@@ -2569,6 +2663,8 @@ def topk(input, k):
         input(Variable): The input variable which can be a vector or Tensor with
             higher rank.
         k(int): An integer value to specify the top k largest elements.
+        name(str|None): A name for this layer(optional). If set None, the layer
+                       will be named automatically.
 
     Returns:
         values(Variable): The k largest elements along each last dimensional
@@ -3244,7 +3340,8 @@ def softmax_with_cross_entropy(logits, label, soft_label=False):
             data = fluid.layers.data(name='data', shape=[128], dtype='float32')
             label = fluid.layers.data(name='label', shape=[1], dtype='int64')
             fc = fluid.layers.fc(input=data, size=100)
-            out = fluid.layers.softmax_with_cross_entropy(logits=fc, label=label)
+            out = fluid.layers.softmax_with_cross_entropy(
+                logits=fc, label=label)
     """
     helper = LayerHelper('softmax_with_cross_entropy', **locals())
     softmax = helper.create_tmp_variable(dtype=logits.dtype)
@@ -3263,35 +3360,36 @@ def smooth_l1(x, y, inside_weight=None, outside_weight=None, sigma=None):
     """
     **Smooth L1 Loss Operator. **
 
-    This operator computes the smooth l1 loss for X and Y.
+    This operator computes the smooth L1 loss for X and Y.
     The operator takes the first dimension of X and Y as batch size.
-    For each instance, it computes the smooth l1 loss element by element first
+    For each instance, it computes the smooth L1 loss element by element first
     and then sums all the losses. So the shape of Out is [batch_size, 1].
 
     Args:
         x (Variable): A tensor with rank at least 2. The input value of smooth
-            l1 loss op with shape [batch_size, dim1, ..., dimN].
+            L1 loss op with shape [batch_size, dim1, ..., dimN].
         y (Variable): A tensor with rank at least 2. The target value of smooth
-            l1 loss op with same shape as x.
+            L1 loss op with same shape as x.
         inside_weight (Variable|None):  A tensor with rank at least 2. This
             input is optional and should have same shape with x. If provided,
             the result of (x - y) will be multiplied by this tensor element by
             element.
         outside_weight (Variable|None): A tensor with rank at least 2. This
             input is optional and should have same shape with x. If provided,
-            the out smooth l1 loss will be multiplied by this tensor element
+            the out smooth L1 loss will be multiplied by this tensor element
             by element.
-        sigma (float|None): Hyper parameter of smooth l1 loss op. A float scalar
+        sigma (float|None): Hyper parameter of smooth L1 loss op. A float scalar
             with default value 1.0.
     Returns:
-        Variable: A tensor with rank be 2. The output smooth l1 loss with
+        Variable: A tensor with rank be 2. The output smooth L1 loss with
             shape [batch_size, 1].
 
     Examples:
         .. code-block:: python
 
             data = fluid.layers.data(name='data', shape=[128], dtype='float32')
-            label = fluid.layers.data(name='label', shape=[100], dtype='int64')
+            label = fluid.layers.data(
+                name='label', shape=[100], dtype='float32')
             fc = fluid.layers.fc(input=data, size=100)
             out = fluid.layers.smooth_l1(x=fc, y=label)
     """
@@ -3613,7 +3711,8 @@ def lrn(input, n=5, k=1.0, alpha=1e-4, beta=0.75, name=None):
     Examples:
         .. code-block:: python
 
-          data = fluid.layers.data(name="data", shape=[3, 112, 112], dtype="float32")
+          data = fluid.layers.data(
+              name="data", shape=[3, 112, 112], dtype="float32")
           lrn = fluid.layers.lrn(input=data)
     """
     helper = LayerHelper('lrn', **locals())
@@ -3769,13 +3868,13 @@ def label_smooth(label,
 
 def roi_pool(input, rois, pooled_height=1, pooled_width=1, spatial_scale=1.0):
     """
-    Region of interest pooling (also known as RoI pooling) is to perform 
+    Region of interest pooling (also known as RoI pooling) is to perform
         is to perform max pooling on inputs of nonuniform sizes to obtain
         fixed-size feature maps (e.g. 7*7).
-    The operator has three steps: 
-        1. Dividing each region proposal into equal-sized sections with 
-           the pooled_width and pooled_height 
-        2. Finding the largest value in each section 
+    The operator has three steps:
+        1. Dividing each region proposal into equal-sized sections with
+           the pooled_width and pooled_height
+        2. Finding the largest value in each section
         3. Copying these max values to the output buffer
 
     Args:
@@ -3783,8 +3882,8 @@ def roi_pool(input, rois, pooled_height=1, pooled_width=1, spatial_scale=1.0):
         rois (Variable): ROIs (Regions of Interest) to pool over. It should
                          be a 2-D one level LoTensor of shape [num_rois, 4].
                          The layout is [x1, y1, x2, y2], where (x1, y1)
-                         is the top left coordinates, and (x2, y2) is the 
-                         bottom right coordinates. The num_rois is the 
+                         is the top left coordinates, and (x2, y2) is the
+                         bottom right coordinates. The num_rois is the
                          total number of ROIs in this batch data.
         pooled_height (integer): The pooled output height. Default: 1
         pooled_width (integer): The pooled output width. Default: 1
@@ -3793,11 +3892,13 @@ def roi_pool(input, rois, pooled_height=1, pooled_width=1, spatial_scale=1.0):
                                to the scale used when pooling. Default: 1.0
 
     Returns:
-        pool_out (Variable): The output is a 4-D tensor of the shape 
+        pool_out (Variable): The output is a 4-D tensor of the shape
                              (num_rois, channels, pooled_h, pooled_w).
 
     Examples:
-            pool_out = fluid.layers.roi_pool(input=x, rois=rois, 7, 7, 1.0) 
+        .. code-block:: python
+
+            pool_out = fluid.layers.roi_pool(input=x, rois=rois, 7, 7, 1.0)
     """
     helper = LayerHelper('roi_pool', **locals())
     dtype = helper.input_dtype()
@@ -3815,3 +3916,271 @@ def roi_pool(input, rois, pooled_height=1, pooled_width=1, spatial_scale=1.0):
             "spatial_scale": spatial_scale
         })
     return pool_out
+
+
+def dice_loss(input, label, epsilon=0.00001):
+    """
+    Dice loss for comparing the similarity of two batch of data,
+    usually is used for binary image segmentation i.e. labels are binary.
+    The dice loss can be defined as below equation:
+
+    .. math::
+
+        dice\_loss &= 1 - \\frac{2 * intersection\_area}{total\_area} \\\\
+                  &= \\frac{(total\_area - intersection\_area) - intersection\_area}{total\_area} \\\\
+                  &= \\frac{(union\_area - intersection\_area)}{total\_area}
+
+
+    Args:
+        input (Variable): The predictions with rank>=2. The first dimension is batch size,
+                          and the last dimension is class number.
+        label (Variable): The groud truth with the same rank with input. The first dimension
+                          is batch size, and the last dimension is 1.
+        epsilon (float): The epsilon will be added to the numerator and denominator.
+                         If both input and label are empty, it makes sure dice is 1.
+                         Default: 0.00001
+
+    Returns:
+        dice_loss (Variable): The dice loss with shape [1].
+
+    Examples:
+        .. code-block:: python
+
+            predictions = fluid.layers.softmax(x)
+            loss = fluid.layers.dice_loss(input=predictions, label=label, 2)
+    """
+    label = one_hot(label, depth=input.shape[-1])
+    reduce_dim = range(1, len(input.shape))
+    inse = reduce_sum(input * label, dim=reduce_dim)
+    dice_denominator = reduce_sum(
+        input, dim=reduce_dim) + reduce_sum(
+            label, dim=reduce_dim)
+    dice_score = 1 - inse * 2 / (dice_denominator + epsilon)
+    return reduce_mean(dice_score)
+
+
+def image_resize(input,
+                 out_shape=None,
+                 scale=None,
+                 name=None,
+                 resample='BILINEAR'):
+    """
+    Resize a batch of images.
+
+    The input must be a tensor of the shape (num_batches, channels, in_h, in_w), 
+    and the resizing only applies on the last two dimensions(hight and width).
+
+    Supporting resample methods:
+        'BILINEAR' : Bilinear interpolation
+
+    Args:
+        input (Variable): The input tensor of image resize layer,
+                          This is a 4-D tensor of the shape
+                          (num_batches, channels, in_h, in_w).
+        out_shape(list|tuple|Variable|None): Output shape of image resize
+                                    layer, the shape is (out_h, out_w).
+                                    Default: None
+        scale(float|None): The multiplier for the input height or width.
+                         At least one of out_shape or scale must be set.
+                         And out_shape has a higher priority than scale.
+                         Default: None
+        name(str|None): A name for this layer(optional). If set None, the layer
+                        will be named automatically.
+        resample(str): The resample method. It can only be 'BILINEAR' currently.
+                       Default: 'BILINEAR'
+
+    Returns:
+        out (Variable): The output is a 4-D tensor of the shape
+                        (num_batches, channls, out_h, out_w).
+
+    Examples:
+        .. code-block:: python
+
+            out = fluid.layers.image_resize(input, out_shape=[12, 12])
+    """
+    resample_methods = {'BILINEAR': 'bilinear_interp'}
+    if resample not in resample_methods:
+        raise ValueError(
+            "The 'resample' of image_resize can only be 'BILINEAR' currently.")
+    if out_shape is None and scale is None:
+        raise ValueError("One of out_shape and scale must not be None")
+    helper = LayerHelper('bilinear_interp', **locals())
+    dtype = helper.input_dtype()
+
+    def _is_list_or_turple_(data):
+        return (isinstance(data, list) or isinstance(data, tuple))
+
+    out_h = 0
+    out_w = 0
+    inputs = {"X": input}
+    if out_shape is not None:
+        if not (_is_list_or_turple_(out_shape) and
+                len(out_shape) == 2) and not isinstance(out_shape, Variable):
+            raise ValueError('out_shape should be a list or tuple or variable')
+        if _is_list_or_turple_(out_shape):
+            out_shape = list(map(int, out_shape))
+            out_h = out_shape[0]
+            out_w = out_shape[1]
+        else:
+            inputs['OutSize'] = out_shape
+    else:
+        out_h = int(input.shape[2] * scale)
+        out_w = int(input.shape[3] * scale)
+
+    out = helper.create_tmp_variable(dtype)
+    helper.append_op(
+        type=resample_methods[resample],
+        inputs=inputs,
+        outputs={"Out": out},
+        attrs={"out_h": out_h,
+               "out_w": out_w})
+    return out
+
+
+def resize_bilinear(input, out_shape=None, scale=None, name=None):
+    """
+    This is an alias of layer 'image_resize' with bilinear interpolation.
+
+    The mathematical meaning of resize bilinear layer is
+    Bilinear interpolation.
+    Bilinear interpolation is an extension of linear interpolation for
+    interpolating functions of two variables (e.g. H-direction and
+    W-direction in this layer) on a rectilinear 2D grid.
+
+    For details, please refer to Wikipedia:
+    https://en.wikipedia.org/wiki/Bilinear_interpolation
+    """
+
+    return image_resize(input, out_shape, scale, name, 'BILINEAR')
+
+
+def image_resize_short(input, out_short_len, resample='BILINEAR'):
+    """
+    Resize a batch of images. The short edge of input images will be 
+    resized to the given 'out_short_len'. The long edge of input images 
+    will be resized proportionately to make images' length-width ratio 
+    constant.
+
+    Args:
+        input (Variable): The input tensor of image resize layer,
+                          This is a 4-D tensor of the shape
+                          (num_batches, channels, in_h, in_w).
+        out_short_len(int): The length of output images' short edge.
+
+    Returns:
+        out (Variable): The output is a 4-D tensor of the shape
+                        (num_batches, channls, out_h, out_w).
+    """
+    in_shape = input.shape
+    if len(in_shape) != 4:
+        raise ValueError(
+            "The rank of input must be 4 (num_batches, channels, in_h, in_w).")
+    hw = in_shape[2:4]
+    short_idx = hw.index(min(hw))
+    long_idx = 1 - short_idx
+    out_shape = list(hw)
+    out_shape[short_idx] = out_short_len
+    out_shape[long_idx] = int(
+        float(out_shape[long_idx]) * (float(out_short_len) / float(hw[
+            short_idx])) + 0.5)
+    return image_resize(input=input, out_shape=out_shape, resample=resample)
+
+
+def gather(input, index):
+    """
+    Output is obtained by gathering entries of the outer-most dimension 
+    of X indexed by `index` and concatenate them together.
+
+    .. math::
+
+        Out = X[Index]
+
+
+    .. code-block:: text
+
+
+                Given:
+
+                X = [[1, 2],
+                     [3, 4],
+                     [5, 6]]
+
+                Index = [1, 2]
+
+                Then:
+
+                Out = [[3, 4],
+                       [5, 6]]
+
+    Args:
+        input (Variable): The source input with rank>=1. 
+        index (Variable): The index input with rank=1.
+
+    Returns:
+        output (Variable): The output is a tensor with the same rank as input.
+
+    Examples:
+        .. code-block:: python
+
+            output = fluid.layers.gather(x, index)
+    """
+    helper = LayerHelper('gather', **locals())
+    dtype = helper.input_dtype()
+    out = helper.create_tmp_variable(dtype)
+    helper.append_op(
+        type="gather",
+        inputs={"X": input,
+                "Index": index},
+        outputs={"Out": out})
+    return out
+
+
+@templatedoc()
+def random_crop(x, shape, seed=None):
+    """
+    ${comment}
+
+    Examples:
+        >>> img = fluid.layers.data("img", [3, 256, 256])
+        >>> cropped_img = fluid.layers.random_crop(img, shape=[3, 224, 224])
+
+    Args:
+        x(${x_type}): ${x_comment}
+        shape(${shape_type}): ${shape_comment}
+        seed(int|${seed_type}|None): ${seed_comment} By default, the seed will
+            get from `random.randint(-65536, 65535)`.
+
+    Returns:
+        ${out_comment}
+
+    """
+    helper = LayerHelper("random_crop", **locals())
+    dtype = helper.input_dtype()
+    out = helper.create_tmp_variable(dtype)
+    if seed is None:
+        seed = random.randint(-65536, 65535)
+
+    if isinstance(seed, int):
+        seed_value = seed
+        seed = helper.create_tmp_variable(dtype="int64")
+        helper.append_op(
+            type="fill_constant",
+            inputs={},
+            outputs={"Out": seed},
+            attrs={
+                "dtype": seed.dtype,
+                "shape": [1],
+                "value": float(seed_value),
+                "force_cpu": True
+            })
+    elif not isinstance(seed, Variable):
+        raise ValueError("'seed' must be a Variable or an int.")
+    seed_out = helper.create_tmp_variable(dtype="int64")
+    helper.append_op(
+        type="random_crop",
+        inputs={"X": input,
+                "Seed": seed},
+        outputs={"Out": out,
+                 "SeedOut": seed_out},
+        attrs={"shape": shape})
+    return out
diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py
index a9fe25744cc0b385479c9366af1b731ec221dd5a..69cfde852dd087bb9192da1f7582f925582dbce4 100644
--- a/python/paddle/fluid/layers/ops.py
+++ b/python/paddle/fluid/layers/ops.py
@@ -71,6 +71,8 @@ __all__ = [
     'cumsum',
     'scatter',
     'sum',
+    'polygon_box_transform',
+    'shape',
 ] + __activations__
 
 for _OP in set(__all__):
diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py
index 4be0dc6a6bfeed3ac254f5c363d3560973c031b4..75d3bf879703a1db1108eae45d879164e0024156 100644
--- a/python/paddle/fluid/layers/tensor.py
+++ b/python/paddle/fluid/layers/tensor.py
@@ -112,7 +112,7 @@ def cast(x, dtype):
     return out
 
 
-def concat(input, axis=0):
+def concat(input, axis=0, name=None):
     """
     **Concat**
 
@@ -122,6 +122,8 @@ def concat(input, axis=0):
     Args:
         input(list): List of tensors to be concatenated
         axis(int): Integer axis along which the tensors will be concatenated
+        name(str|None): A name for this layer(optional). If set None, the layer
+                       will be named automatically.
 
     Returns:
         Variable: Output variable of the concatenation
@@ -361,6 +363,40 @@ def zeros(shape, dtype, force_cpu=False):
     return fill_constant(value=0.0, **locals())
 
 
+def reverse(x, axis):
+    """
+    **reverse**
+
+    This function reverse the input 'x' along given axises.
+
+    Args:
+        x(Vairbale): the input to be reversed.
+        axis(int|tuple|list): Axis that along which order of elements 
+                    is reversed. If it is a tuple or a list, reversing 
+                    will be apply on each axis in the tuple or list.  
+
+    Returns:
+        Variable: The reversed tensor.
+
+    Examples:
+        .. code-block:: python
+
+          out = fluid.layers.reverse(x=in, axis=0)
+          # or:
+          out = fluid.layers.reverse(x=in, axis=[0,1])
+    """
+    if isinstance(axis, int):
+        axis = [axis]
+    helper = LayerHelper("reverse", **locals())
+    out = helper.create_tmp_variable(dtype=x.dtype)
+    helper.append_op(
+        type='reverse',
+        inputs={'Input': x},
+        outputs={'Out': [out]},
+        attrs={'axis': axis})
+    return out
+
+
 def save(x, file_path, overwrite=True):
     """
     Saves a variable as a file.
diff --git a/python/paddle/fluid/lod_tensor.py b/python/paddle/fluid/lod_tensor.py
new file mode 100644
index 0000000000000000000000000000000000000000..9946d0a4ff33b2f5040f6d2e31aa20fcf9c609a7
--- /dev/null
+++ b/python/paddle/fluid/lod_tensor.py
@@ -0,0 +1,189 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import core
+import numpy as np
+
+__all__ = ['create_lod_tensor', 'create_random_int_lodtensor']
+
+
+def _validate_lod(lod, tensor_height=-1):
+    """Check whether the input length-based lod info is valid.
+
+    There are several things to check:
+    1. lod should be a list of lists. Empty list is fine.
+    2. The length of each sublist (a lod level) should be at least one.
+    3. Each element in each lod level should be an integer greater than 0.
+    4. The sum of one lod level should be equal to the length of the next lod level.
+    5. The sum of the last lod level should be equal to the tensor height. 
+       Bypass this check if user does not provide tensor_height as input.
+
+    Args:
+        lod: the length-based lod info, e.g., [[2, 3], [2, 1, 2, 3, 4]].
+        tensor_height: the outermost dimension of the tensor with which the input 
+            lod is associated with. 
+
+    Returns:
+        A boolean indicating whether the input lod is valid or not.
+    """
+    assert isinstance(lod, list), "lod should be a list"
+    # Empty lod is fine
+    if len(lod) == 0:
+        return True
+
+    lod_sum = []
+    for level in lod:
+        assert isinstance(level, list), "each item in lod should be a list"
+        # Each level of lod should have at least one length info
+        if len(level) < 1:
+            return False
+        level_sum = 0
+        for lod_len in level:
+            # Each length in a level should be > 0
+            if lod_len <= 0:
+                return False
+            level_sum += lod_len
+        lod_sum.append(level_sum)
+
+    for idx, val in enumerate(lod_sum[:-1]):
+        # Each level's sum should be equal to 
+        # the number of items in the next level
+        if val != len(lod[idx + 1]):
+            return False
+
+    if tensor_height == -1:
+        return True
+    else:
+        # Last level's sum should be equal to the tensor height
+        return lod_sum[-1] == tensor_height
+
+
+def _convert_lod(lod):
+    """Convert a length-based lod to a offset-based lod.
+
+    If the length-based lod is [[2, 3], [2, 1, 2, 3, 4]],
+    then the offset-based lod is [[0, 2, 5], [0, 2, 3, 5, 8, 12]].
+
+    Args:
+        lod: a length-based lod info. 
+
+    Returns:
+        A list of lists as the offset-based lod converted to from the input lod.
+    """
+    new_lod = []
+    for level in lod:
+        cur_len = 0
+        new_level = [cur_len]
+        for lod_len in level:
+            cur_len += lod_len
+            new_level.append(cur_len)
+        new_lod.append(new_level)
+    return new_lod
+
+
+def create_lod_tensor(data, lod, place):
+    """Create a lod tensor from a numpy array, a list, or an existing lod tensor.
+
+    Create a lod tensor by doing the following:
+    1. Check that the length-based input lod is valid.
+    2. Convert the length-based lod to a offset-based LoD.
+    3. Copy the data from a numpy array, a list or a existing lod tensor to 
+       CPU or GPU device (based on input place).
+    4. Set the level of detail (LoD) using the offset-based LoD.
+    
+    Use example:
+    Suppose we want LoDTensor to hold data for sequences of word, where each word is
+    represented by an integer. If we want to create a LoDTensor to represent two 
+    sentences, one of 2 words, and one of 3 words. 
+
+    Then 'data' can be a numpy array of integers with shape (5, 1).
+    'lod' will be [[2, 3]], indicating the length(# of words) in each sentence.
+    This length-based input lod [[2, 3]] will be converted to offset-based lod [[0, 2, 5]]
+    inside the function call.
+
+    Please refer to 
+    github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/lod_tensor.md
+    for more details regarding LoD.
+
+    Args:
+        data: a numpy array or a LoDTensor or a list holding the data to be copied.
+        lod: a list of lists indicating the length-based LoD info specified by the user. 
+        place: CPU or GPU place indicating where the data in the new LoDTensor will be stored.
+
+    Returns:
+        A fluid LoDTensor object with tensor data and lod info.
+    """
+    if isinstance(data, core.LoDTensor):
+        return create_lod_tensor(np.array(data), lod, place)
+    elif isinstance(data, list):
+        # When input data is a list, it only deal with the case where the base element 
+        # is an index of shape [1] and dtype int64 (e.g., word id). Hence, the generated 
+        # LoDTensor will be of shape [n, 1] and dtype int64, where `n` is the total number 
+        # of words or other indexes in the sequence. 
+        new_lod = []
+        for seq in data:
+            new_lod.append(len(seq))
+        assert [new_lod] == lod, "data and lod do not match"
+        flattened_data = np.concatenate(data, axis=0).astype("int64")
+        flattened_data = flattened_data.reshape([len(flattened_data), 1])
+        return create_lod_tensor(flattened_data, lod, place)
+    elif isinstance(data, np.ndarray):
+        assert _validate_lod(lod,
+                             data.shape[0]), "the provided lod info is invalid"
+        tensor = core.LoDTensor()
+        tensor.set(data, place)
+        tensor.set_lod(_convert_lod(lod))
+        return tensor
+    else:
+        raise TypeError(
+            "data should be either a LoDTensor, a Numpy array or a list")
+
+
+def create_random_int_lodtensor(lod, base_shape, place, low, high):
+    """Create a LoDTensor containing random integers.
+
+    This function is frequently used in the book examples. So we revised it based on 
+    the new create_lod_tensor API and put it here in the lod_tensor module to simplify 
+    the code. 
+
+    The function does the following:
+    1. Calculate the overall shape of the LoDTensor based on the length-based 'lod' input 
+    and the shape of the basic element in 'base_shape'.
+    2. Create a numpy array of this shape.
+    3. Create the LoDTensor using create_lod_tensor API.
+
+    Suppose we want LoDTensor to hold data for sequences of word, where each word is
+    represented by an integer. If we want to create a LoDTensor to represent two 
+    sentences, one of 2 words, and one of 3 words. Then 'base_shape' is [1], input 
+    length-based 'lod' is [[2, 3]]. Then the overall shape of the LoDTensor would be 
+    [5, 1], holding 5 words for two sentences. 
+
+    Args:
+        data: a numpy array or a LoDTensor holding the data to be copied.
+        lod: a list of lists indicating the length-based LoD info specified by the user.
+        base_shape: the shape of the basic element to be held by the LoDTensor. 
+        place: CPU or GPU place indicating where the data in the new LoDTensor will be stored.
+        low: the lower bound of the random integers.
+        high: the upper bound of the random integers.
+
+    Returns:
+        A fluid LoDTensor object with tensor data and lod info. 
+    """
+    assert isinstance(base_shape, list), "base_shape should be a list"
+    converted_lod = _convert_lod(lod)
+    # append the total number of basic elements to the front of its shape
+    overall_shape = [converted_lod[-1][-1]] + base_shape
+    # the range of integer data elements is [low, high]    
+    data = np.random.random_integers(low, high, overall_shape).astype("int64")
+    return create_lod_tensor(data, lod, place)
diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index 0a314ddfd7c607a3bc7f7c746c4c4990fc4a52e2..115362c6bf33018342699a442c688e7356f3c206 100644
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -28,8 +28,8 @@ from contextlib import contextmanager
 __all__ = [
     'SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'DecayedAdagrad',
     'SGDOptimizer', 'MomentumOptimizer', 'AdagradOptimizer', 'AdamOptimizer',
-    'AdamaxOptimizer', 'DecayedAdagradOptimizer', 'Adadelta', 'ModelAverage',
-    'Optimizer'
+    'AdamaxOptimizer', 'DecayedAdagradOptimizer', 'RMSPropOptimizer',
+    'Adadelta', 'ModelAverage', 'Optimizer'
 ]
 
 
@@ -47,6 +47,8 @@ class Optimizer(object):
             raise TypeError("learning rate should be float or Variable")
         self.regularization = regularization
         self._learning_rate = learning_rate
+        # the learning rate type should be inferenced from loss
+        self._dtype = None
         # each program should have a independent learning rate
         # program -> Variable(learning_rate)
         self._learning_rate_map = dict()
@@ -77,7 +79,7 @@ class Optimizer(object):
             name=unique_name.generate("learning_rate"),
             shape=[1],
             value=float(self._learning_rate),
-            dtype='float32',
+            dtype='float32' if self._dtype == None else self._dtype,
             persistable=True)
 
     def global_learning_rate(self, program=None):
@@ -200,6 +202,7 @@ class Optimizer(object):
 
         # Create any accumulators
         program = loss.block.program
+        self._dtype = loss.dtype
         with program_guard(program, startup_program):
             global_block = framework.default_main_program().global_block()
             start = len(global_block.ops)
@@ -210,11 +213,13 @@ class Optimizer(object):
 
             optimize_ops = []
             for param_and_grad in parameters_and_grads:
-                if param_and_grad[0].trainable is True and param_and_grad[
-                        1] is not None:
-                    optimize_op = self._append_optimize_op(loss.block,
-                                                           param_and_grad)
-                    optimize_ops.append(optimize_op)
+                with param_and_grad[0].block.program.optimized_guard(
+                        param_and_grad[0]):
+                    if param_and_grad[0].trainable is True and param_and_grad[
+                            1] is not None:
+                        optimize_op = self._append_optimize_op(loss.block,
+                                                               param_and_grad)
+                        optimize_ops.append(optimize_op)
 
             # Get custom finish ops for subclasses
             # FIXME: Need to fix this once we figure out how to handle dependencies
@@ -391,7 +396,7 @@ class AdamOptimizer(Optimizer):
         beta_shape = [1]
         self._beta1_pow_acc = self.helper.create_global_variable(
             name=unique_name.generate('beta1_pow_acc'),
-            dtype='float32',
+            dtype='float32' if self._dtype == None else self._dtype,
             shape=beta_shape,
             lod_level=0,
             persistable=True)
@@ -400,7 +405,7 @@ class AdamOptimizer(Optimizer):
 
         self._beta2_pow_acc = self.helper.create_global_variable(
             name=unique_name.generate('beta2_pow_acc'),
-            dtype='float32',
+            dtype='float32' if self._dtype == None else self._dtype,
             shape=beta_shape,
             lod_level=0,
             persistable=True)
@@ -493,7 +498,7 @@ class AdamaxOptimizer(Optimizer):
         beta_shape = [1]
         self._beta1_pow_acc = self.helper.create_global_variable(
             name=unique_name.generate('beta1_pow_acc'),
-            dtype='float32',
+            dtype='float32' if self._dtype == None else self._dtype,
             shape=beta_shape,
             lod_level=0,
             persistable=True)
@@ -900,8 +905,10 @@ class ModelAverage(Optimizer):
         # param = (sum_1 + sum_2 + sum_3) / (num_accumulates + old_num_accumulates)
         tmp = layers.sum(x=[num_accumulates, old_num_accumulates])
         sum = layers.sum(x=[sum_1, sum_2, sum_3])
-        tmp = layers.cast(x=tmp, dtype='float32')
-        sum = layers.cast(x=sum, dtype='float32')
+        tmp = layers.cast(
+            x=tmp, dtype='float32' if self._dtype == None else self._dtype)
+        sum = layers.cast(
+            x=sum, dtype='float32' if self._dtype == None else self._dtype)
         layers.elementwise_div(x=sum, y=tmp, out=param)
 
     def _add_average_restore_op(self, block, param_grad):
diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py
index 6b80b007e9080922241ee6c66e0577a18b357980..3117dfe00c7a3df1035c439dc31b81e67781d0cc 100644
--- a/python/paddle/fluid/parallel_executor.py
+++ b/python/paddle/fluid/parallel_executor.py
@@ -19,7 +19,10 @@ import executor
 import warnings
 import sys
 
-__all__ = ['ParallelExecutor']
+__all__ = ['ParallelExecutor', 'ExecutionStrategy', 'BuildStrategy']
+
+ExecutionStrategy = core.ParallelExecutor.ExecutionStrategy
+BuildStrategy = core.ParallelExecutor.BuildStrategy
 
 
 class ParallelExecutor(object):
@@ -27,10 +30,12 @@ class ParallelExecutor(object):
                  use_cuda,
                  loss_name=None,
                  main_program=None,
-                 num_threads=None,
-                 allow_op_delay=False,
                  share_vars_from=None,
-                 use_default_grad_scale=True):
+                 exec_strategy=None,
+                 build_strategy=None,
+                 num_trainers=1,
+                 trainer_id=0,
+                 **kwargs):
         """
         ParallelExecutor can run program in parallel.
 
@@ -39,18 +44,13 @@ class ParallelExecutor(object):
             loss_name(str, default None): The loss name must set in training.
             main_program(Program, default None): The program that need to run,
                 if not provided, then default_main_program will be used.
-            num_threads(int, default None): How many threads are used for
-                training.
-            allow_op_delay(bool, default False): Whether to delay and buffer
-                some operators together for scheduling or not, which may
-                improve performance in some cases, default False.
             share_vars_from(ParallelExecutor, default None): If provied,
                 it will share variables from the specified ParallelExecutor.
-            use_default_grad_scale(bool, default True): If set True, a default
-                scale value equal to `1./device_count` would be multiplied to
-                gradients of each device and scaled gradients would be
-                aggregated. Otherwise, a customized scale value should be fed
-                to the network.
+            num_trainers(int, default 1): If greater than 1, NCCL will be
+                initialized with multpile rank of nodes, each node should have
+                same number of GPUs. Distributed training will be enabled then.
+            trainer_id(int, default 0): Must use together with num_trainers.
+                trainer_id is the "rank" of current node starts from 0.
 
         Returns:
             A ParallelExecutor object.
@@ -72,6 +72,25 @@ class ParallelExecutor(object):
               train_loss, = train_exe.run([loss.name], feed=feed_dict)
               test_loss, = test_exe.run([loss.name], feed=feed_dict)
         """
+        if len(kwargs) != 0:
+            err_msg = ""
+            for key in kwargs:
+                if key in dir(ExecutionStrategy):
+                    err_msg += \
+                        "Setting {0} by constructor is deprecated. Use " \
+                        "strategy=ExecutionStrategy(); strategy.{0}=xxx; " \
+                        "pe=ParallelExecutor(exec_strategy=strategy) " \
+                        "instead.\n ".format(key)
+                elif key in dir(BuildStrategy):
+                    err_msg += \
+                        "Setting {0} by constructor is deprecated. Use " \
+                        "strategy=BuildStrategy(); See help(" \
+                        "paddle.fluid.ParallelExecutor.BuildStrategy) \n".format(
+                            key)
+                else:
+                    err_msg += "Setting {0} by constructor is deprecated. Use strategy.\n".format(
+                        key)
+            raise ValueError(err_msg)
 
         self._places = []
         self._act_places = []
@@ -89,15 +108,25 @@ class ParallelExecutor(object):
                 self._places.append(p)
         assert self._places, "no place for execution"
 
-        if num_threads is None:
+        if exec_strategy is None:
+            exec_strategy = ExecutionStrategy()
+            if use_cuda:
+                exec_strategy.use_event = True
+            else:
+                exec_strategy.use_event = False
+
+        if exec_strategy.num_threads == 0:
             if use_cuda:
                 # Experiments on se-resnext shows that too many threads hurt
                 # performance. Worth tunning for other models in the future.
-                num_threads = len(self._places) * 2
+                exec_strategy.num_threads = len(self._places) * 2
             else:
-                num_threads = min(
+                exec_strategy.num_threads = min(
                     len(self._places) * 2, multiprocessing.cpu_count())
 
+        if build_strategy is None:
+            build_strategy = BuildStrategy()
+
         main = main_program
         main = main if main else framework.default_main_program()
         scope = executor.global_scope()
@@ -116,21 +145,14 @@ class ParallelExecutor(object):
         ]
 
         self.executor = core.ParallelExecutor(
-            num_threads,
-            True if use_cuda else False,  # use_event
             self._places,
             set([
                 p.name for p in main.global_block().iter_parameters()
                 if not p.stop_gradient
             ]),
-            set(self.persistable_vars),
-            main.desc,
-            loss_name if loss_name else '',
-            scope,
-            local_scopes,
-            allow_op_delay,
-            use_default_grad_scale)
-
+            set(self.persistable_vars), main.desc, loss_name
+            if loss_name else '', scope, local_scopes, exec_strategy,
+            build_strategy, num_trainers, trainer_id)
         self.scope = scope
 
     def run(self, fetch_list, feed=None, feed_dict=None):
diff --git a/python/paddle/fluid/profiler.py b/python/paddle/fluid/profiler.py
index 04fd05cc33cff3d720be75923d4af3767942669f..e2bd1d4c9a1ea5ddc0dfd19c769dcb40bfd6d04c 100644
--- a/python/paddle/fluid/profiler.py
+++ b/python/paddle/fluid/profiler.py
@@ -16,7 +16,10 @@ import core
 from contextlib import contextmanager
 import os
 
-__all__ = ['cuda_profiler', 'reset_profiler', 'profiler']
+__all__ = [
+    'cuda_profiler', 'reset_profiler', 'profiler', 'start_profiler',
+    'stop_profiler'
+]
 
 NVPROF_CONFIG = [
     "gpustarttimestamp",
@@ -72,20 +75,31 @@ def reset_profiler():
     core.reset_profiler()
 
 
-@contextmanager
-def profiler(state, sorted_key=None, profile_path='/tmp/profile'):
-    """The profiler interface.
-    Different from cuda_profiler, this profiler can be used to profile both CPU
-    and GPU program. By defalut, it records the CPU and GPU operator kernels,
-    if you want to profile other program, you can refer the profiling tutorial
-    to add more records.
+def start_profiler(state):
+    """Enable the profiler.
+
+    Args:
+        state (string) : The profiling state, which should be 'CPU', 'GPU'
+            or 'All'. 'CPU' means only profile CPU. 'GPU' means profiling
+            GPU as well. 'All' also generates timeline.
+    """
+    if core.is_profiler_enabled():
+        return
+    if state not in ['CPU', 'GPU', "All"]:
+        raise ValueError("The state must be 'CPU' or 'GPU' or 'All'.")
+    if state == "GPU":
+        prof_state = core.ProfilerState.kCUDA
+    elif state == "CPU":
+        prof_state = core.ProfilerState.kCPU
+    else:
+        prof_state = core.ProfilerState.kAll
+    core.enable_profiler(prof_state)
+
+
+def stop_profiler(sorted_key=None, profile_path='/tmp/profile'):
+    """Stop the profiler.
 
     Args:
-        state (string) : The profiling state, which should be 'CPU' or 'GPU',
-            telling the profiler to use CPU timer or GPU timer for profiling.
-            Although users may have already specified the execution place
-            (CPUPlace/CUDAPlace) in the begining, for flexibility the profiler
-            would not inherit this place.
         sorted_key (string) : If None, the profiling results will be printed
             in the order of first end time of events. Otherwise, the profiling
             results will be sorted by the this flag. This flag should be one
@@ -98,17 +112,8 @@ def profiler(state, sorted_key=None, profile_path='/tmp/profile'):
         profile_path (string) : If state == 'All', it will write a profile
             proto output file.
     """
-    if state not in ['CPU', 'GPU', "All"]:
-        raise ValueError("The state must be 'CPU' or 'GPU' or 'All'.")
-    if state == "GPU":
-        prof_state = core.ProfilerState.kCUDA
-    elif state == "CPU":
-        prof_state = core.ProfilerState.kCPU
-    else:
-        prof_state = core.ProfilerState.kAll
-    core.enable_profiler(prof_state)
-    yield
-
+    if not core.is_profiler_enabled():
+        return
     sorted_key = 'default' if sorted_key is None else sorted_key
     if sorted_key not in ['default', 'calls', 'total', 'max', 'min', 'ave']:
         raise ValueError("The sorted_key must be None or in 'calls', 'total', "
@@ -124,3 +129,34 @@ def profiler(state, sorted_key=None, profile_path='/tmp/profile'):
     # TODO(qingqing) : redirect C++ ostream to Python stream.
     # with core.ostream_redirect(stdout=True, stderr=True):
     core.disable_profiler(key_map[sorted_key], profile_path)
+
+
+@contextmanager
+def profiler(state, sorted_key=None, profile_path='/tmp/profile'):
+    """The profiler interface.
+    Different from cuda_profiler, this profiler can be used to profile both CPU
+    and GPU program. By defalut, it records the CPU and GPU operator kernels,
+    if you want to profile other program, you can refer the profiling tutorial
+    to add more records.
+
+    Args:
+        state (string) : The profiling state, which should be 'CPU' or 'GPU',
+            telling the profiler to use CPU timer or GPU timer for profiling.
+            Although users may have already specified the execution place
+            (CPUPlace/CUDAPlace) in the begining, for flexibility the profiler
+            would not inherit this place.
+        sorted_key (string) : If None, the profiling results will be printed
+            in the order of first end time of events. Otherwise, the profiling
+            results will be sorted by the this flag. This flag should be one
+            of 'calls', 'total', 'max', 'min' or 'ave'.
+            The `calls` means sorting by the number of calls.
+            The `total` means sorting by the total execution time.
+            The `max` means sorting by the maximum execution time.
+            The `min` means sorting by the minimum execution time.
+            The `ave` means sorting by the average execution time.
+        profile_path (string) : If state == 'All', it will write a profile
+            proto output file.
+    """
+    start_profiler(state)
+    yield
+    stop_profiler(sorted_key, profile_path)
diff --git a/python/paddle/fluid/recordio_writer.py b/python/paddle/fluid/recordio_writer.py
index 5accaacd5361165d30b92c71ae4fd62e23e44e07..8d48e9abef0fb9861284c6302b30efb0e3994989 100644
--- a/python/paddle/fluid/recordio_writer.py
+++ b/python/paddle/fluid/recordio_writer.py
@@ -12,10 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import os
 import core
 import contextlib
-
-__all__ = ['convert_reader_to_recordio_file']
+__all__ = [
+    'convert_reader_to_recordio_file', 'convert_reader_to_recordio_files'
+]
 
 
 @contextlib.contextmanager
@@ -46,3 +48,36 @@ def convert_reader_to_recordio_file(
             writer.complete_append_tensor()
             counter += 1
     return counter
+
+
+def convert_reader_to_recordio_files(
+        filename,
+        batch_per_file,
+        reader_creator,
+        feeder,
+        compressor=core.RecordIOWriter.Compressor.Snappy,
+        max_num_records=1000,
+        feed_order=None):
+    if feed_order is None:
+        feed_order = feeder.feed_names
+    f_name, f_ext = os.path.splitext(filename)
+    assert (f_ext == ".recordio")
+
+    lines = []
+    f_idx = 0
+    counter = 0
+    for idx, batch in enumerate(reader_creator()):
+        lines.append(batch)
+        if idx >= batch_per_file and idx % batch_per_file == 0:
+            filename = "%s-%05d%s" % (f_name, f_idx, f_ext)
+            with create_recordio_writer(filename, compressor,
+                                        max_num_records) as writer:
+                for l in lines:
+                    res = feeder.feed(l)
+                    for each in feed_order:
+                        writer.append_tensor(res[each])
+                    writer.complete_append_tensor()
+                    counter += 1
+                lines = []
+                f_idx += 1
+    return counter
diff --git a/python/paddle/fluid/regularizer.py b/python/paddle/fluid/regularizer.py
index c006bd9a66ddb422b7d80d2ca87aa7f56a6485db..c4d6829599616cb3ea7791a189e7070974de6ae3 100644
--- a/python/paddle/fluid/regularizer.py
+++ b/python/paddle/fluid/regularizer.py
@@ -43,31 +43,32 @@ def append_regularization_ops(parameters_and_grads, regularization=None):
     """
     params_and_grads = []
     for param, grad in parameters_and_grads:
-        # If no gradient then we don't need to do anything
-        if grad is None:
+        with param.block.program.optimized_guard(param):
+            # If no gradient then we don't need to do anything
+            if grad is None:
+                params_and_grads.append((param, grad))
+                continue
+
+            regularization_term = None
+            if param.regularizer is not None:
+                # Add variable for regularization term in grad block
+                regularization_term = param.regularizer(param, grad, grad.block)
+            elif regularization is not None:
+                regularization_term = regularization(param, grad, grad.block)
+
+            # If no regularization specified, then we don't need to do anything
+            if regularization_term is None:
+                params_and_grads.append((param, grad))
+                continue
+
+            assert grad.shape == regularization_term.shape
+
+            grad.block.append_op(
+                type='elementwise_add',
+                inputs={"X": grad,
+                        "Y": regularization_term},
+                outputs={"Out": grad})
             params_and_grads.append((param, grad))
-            continue
-
-        regularization_term = None
-        if param.regularizer is not None:
-            # Add variable for regularization term in grad block
-            regularization_term = param.regularizer(param, grad, grad.block)
-        elif regularization is not None:
-            regularization_term = regularization(param, grad, grad.block)
-
-        # If no regularization specified, then we don't need to do anything
-        if regularization_term is None:
-            params_and_grads.append((param, grad))
-            continue
-
-        assert grad.shape == regularization_term.shape
-
-        grad.block.append_op(
-            type='elementwise_add',
-            inputs={"X": grad,
-                    "Y": regularization_term},
-            outputs={"Out": grad})
-        params_and_grads.append((param, grad))
 
     return params_and_grads
 
diff --git a/python/paddle/fluid/tests/book/CMakeLists.txt b/python/paddle/fluid/tests/book/CMakeLists.txt
index 673c965b662a022739f8d489c331f4de9455a926..ee734f3c782adb5196a03aca5718377009a5b4e7 100644
--- a/python/paddle/fluid/tests/book/CMakeLists.txt
+++ b/python/paddle/fluid/tests/book/CMakeLists.txt
@@ -5,3 +5,5 @@ string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
 foreach(src ${TEST_OPS})
     py_test(${src} SRCS ${src}.py)
 endforeach()
+
+add_subdirectory(high-level-api)
diff --git a/python/paddle/fluid/tests/book/high-level-api/CMakeLists.txt b/python/paddle/fluid/tests/book/high-level-api/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..efa5ee2d06af3d31e7d84122dd7eea37d6dcf3a3
--- /dev/null
+++ b/python/paddle/fluid/tests/book/high-level-api/CMakeLists.txt
@@ -0,0 +1,16 @@
+file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
+string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
+
+# default test
+foreach(src ${TEST_OPS})
+    py_test(${src} SRCS ${src}.py)
+endforeach()
+
+add_subdirectory(fit_a_line)
+add_subdirectory(recognize_digits)
+add_subdirectory(image_classification)
+add_subdirectory(understand_sentiment)
+add_subdirectory(label_semantic_roles)
+add_subdirectory(word2vec)
+add_subdirectory(recommender_system)
+add_subdirectory(machine_translation)
diff --git a/python/paddle/fluid/tests/book/high-level-api/fit_a_line/CMakeLists.txt b/python/paddle/fluid/tests/book/high-level-api/fit_a_line/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..673c965b662a022739f8d489c331f4de9455a926
--- /dev/null
+++ b/python/paddle/fluid/tests/book/high-level-api/fit_a_line/CMakeLists.txt
@@ -0,0 +1,7 @@
+file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
+string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
+
+# default test
+foreach(src ${TEST_OPS})
+    py_test(${src} SRCS ${src}.py)
+endforeach()
diff --git a/python/paddle/fluid/tests/book/high-level-api/fit_a_line/test_fit_a_line.py b/python/paddle/fluid/tests/book/high-level-api/fit_a_line/test_fit_a_line.py
new file mode 100644
index 0000000000000000000000000000000000000000..b3117cf2e5e0513089e5e1146d49702fcc8b7ba6
--- /dev/null
+++ b/python/paddle/fluid/tests/book/high-level-api/fit_a_line/test_fit_a_line.py
@@ -0,0 +1,133 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.fluid as fluid
+import contextlib
+import numpy
+import unittest
+
+# train reader
+BATCH_SIZE = 20
+
+train_reader = paddle.batch(
+    paddle.reader.shuffle(
+        paddle.dataset.uci_housing.train(), buf_size=500),
+    batch_size=BATCH_SIZE)
+
+test_reader = paddle.batch(
+    paddle.reader.shuffle(
+        paddle.dataset.uci_housing.test(), buf_size=500),
+    batch_size=BATCH_SIZE)
+
+
+def inference_program():
+    x = fluid.layers.data(name='x', shape=[13], dtype='float32')
+    y_predict = fluid.layers.fc(input=x, size=1, act=None)
+    return y_predict
+
+
+def linear():
+    y = fluid.layers.data(name='y', shape=[1], dtype='float32')
+    y_predict = inference_program()
+
+    loss = fluid.layers.square_error_cost(input=y_predict, label=y)
+    avg_loss = fluid.layers.mean(loss)
+
+    return avg_loss
+
+
+def optimizer_func():
+    return fluid.optimizer.SGD(learning_rate=0.001)
+
+
+def train(use_cuda, train_program, params_dirname):
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+
+    trainer = fluid.Trainer(
+        train_func=train_program, place=place, optimizer_func=optimizer_func)
+
+    def event_handler(event):
+        if isinstance(event, fluid.EndStepEvent):
+            if event.step == 10:
+                test_metrics = trainer.test(
+                    reader=test_reader, feed_order=['x', 'y'])
+                print test_metrics
+                '''
+                ...
+                ['25.768919467926025']
+                ['15.343549569447836']
+                ...
+                '''
+                if params_dirname is not None:
+                    trainer.save_params(params_dirname)
+                trainer.stop()
+
+    trainer.train(
+        reader=train_reader,
+        num_epochs=100,
+        event_handler=event_handler,
+        feed_order=['x', 'y'])
+
+
+# infer
+def infer(use_cuda, inference_program, params_dirname=None):
+    if params_dirname is None:
+        return
+
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    inferencer = fluid.Inferencer(
+        infer_func=inference_program, param_path=params_dirname, place=place)
+
+    batch_size = 10
+    tensor_x = numpy.random.uniform(0, 10, [batch_size, 13]).astype("float32")
+
+    results = inferencer.infer({'x': tensor_x})
+    print("infer results: ", results[0])
+
+
+def main(use_cuda):
+    if use_cuda and not fluid.core.is_compiled_with_cuda():
+        return
+
+    # Directory for saving the trained model
+    params_dirname = "fit_a_line.inference.model"
+
+    train(use_cuda, linear, params_dirname)
+    infer(use_cuda, inference_program, params_dirname)
+
+
+class TestFitALine(unittest.TestCase):
+    def test_cpu(self):
+        with self.program_scope_guard():
+            with fluid.unique_name.guard():
+                main(use_cuda=False)
+
+    def test_cuda(self):
+        with self.program_scope_guard():
+            with fluid.unique_name.guard():
+                main(use_cuda=True)
+
+    @contextlib.contextmanager
+    def program_scope_guard(self):
+        prog = fluid.Program()
+        startup_prog = fluid.Program()
+        scope = fluid.core.Scope()
+        with fluid.scope_guard(scope):
+            with fluid.program_guard(prog, startup_prog):
+                yield
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/book/high-level-api/image_classification/CMakeLists.txt b/python/paddle/fluid/tests/book/high-level-api/image_classification/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..673c965b662a022739f8d489c331f4de9455a926
--- /dev/null
+++ b/python/paddle/fluid/tests/book/high-level-api/image_classification/CMakeLists.txt
@@ -0,0 +1,7 @@
+file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
+string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
+
+# default test
+foreach(src ${TEST_OPS})
+    py_test(${src} SRCS ${src}.py)
+endforeach()
diff --git a/python/paddle/fluid/tests/book/high-level-api/image_classification/cifar10_small_test_set.py b/python/paddle/fluid/tests/book/high-level-api/image_classification/cifar10_small_test_set.py
new file mode 100644
index 0000000000000000000000000000000000000000..7fed6d914f75b690e34411aa154359c93b6ca989
--- /dev/null
+++ b/python/paddle/fluid/tests/book/high-level-api/image_classification/cifar10_small_test_set.py
@@ -0,0 +1,82 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+CIFAR dataset.
+
+This module will download dataset from
+https://www.cs.toronto.edu/~kriz/cifar.html and parse train/test set into
+paddle reader creators.
+
+The CIFAR-10 dataset consists of 60000 32x32 colour images in 10 classes,
+with 6000 images per class. There are 50000 training images and 10000 test
+images.
+
+The CIFAR-100 dataset is just like the CIFAR-10, except it has 100 classes
+containing 600 images each. There are 500 training images and 100 testing
+images per class.
+
+"""
+
+import cPickle
+import itertools
+import numpy
+import paddle.v2.dataset.common
+import tarfile
+
+__all__ = ['train10']
+
+URL_PREFIX = 'https://www.cs.toronto.edu/~kriz/'
+CIFAR10_URL = URL_PREFIX + 'cifar-10-python.tar.gz'
+CIFAR10_MD5 = 'c58f30108f718f92721af3b95e74349a'
+
+
+def reader_creator(filename, sub_name, batch_size=None):
+    def read_batch(batch):
+        data = batch['data']
+        labels = batch.get('labels', batch.get('fine_labels', None))
+        assert labels is not None
+        for sample, label in itertools.izip(data, labels):
+            yield (sample / 255.0).astype(numpy.float32), int(label)
+
+    def reader():
+        with tarfile.open(filename, mode='r') as f:
+            names = (each_item.name for each_item in f
+                     if sub_name in each_item.name)
+
+            batch_count = 0
+            for name in names:
+                batch = cPickle.load(f.extractfile(name))
+                for item in read_batch(batch):
+                    if isinstance(batch_size, int) and batch_count > batch_size:
+                        break
+                    batch_count += 1
+                    yield item
+
+    return reader
+
+
+def train10(batch_size=None):
+    """
+    CIFAR-10 training set creator.
+
+    It returns a reader creator, each sample in the reader is image pixels in
+    [0, 1] and label in [0, 9].
+
+    :return: Training reader creator
+    :rtype: callable
+    """
+    return reader_creator(
+        paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5),
+        'data_batch',
+        batch_size=batch_size)
diff --git a/python/paddle/fluid/tests/book/image_classification/notest_image_classification_resnet.py b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py
similarity index 74%
rename from python/paddle/fluid/tests/book/image_classification/notest_image_classification_resnet.py
rename to python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py
index 17db38797cf19ae387f69f66daa42fc78cfcb7d5..2df3da9cca7042222317de626460909f018cb107 100644
--- a/python/paddle/fluid/tests/book/image_classification/notest_image_classification_resnet.py
+++ b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py
@@ -17,6 +17,7 @@ from __future__ import print_function
 import paddle
 import paddle.fluid as fluid
 import numpy
+import cifar10_small_test_set
 
 
 def resnet_cifar10(input, depth=32):
@@ -81,46 +82,52 @@ def train_network():
     cost = fluid.layers.cross_entropy(input=predict, label=label)
     avg_cost = fluid.layers.mean(cost)
     accuracy = fluid.layers.accuracy(input=predict, label=label)
-    return avg_cost, accuracy
+    return [avg_cost, accuracy]
 
 
-def train(use_cuda, save_path):
+def optimizer_func():
+    return fluid.optimizer.Adam(learning_rate=0.001)
+
+
+def train(use_cuda, train_program, params_dirname):
     BATCH_SIZE = 128
     EPOCH_NUM = 1
 
     train_reader = paddle.batch(
         paddle.reader.shuffle(
-            paddle.dataset.cifar.train10(), buf_size=128 * 10),
+            cifar10_small_test_set.train10(batch_size=10), buf_size=128 * 10),
         batch_size=BATCH_SIZE)
 
     test_reader = paddle.batch(
         paddle.dataset.cifar.test10(), batch_size=BATCH_SIZE)
 
     def event_handler(event):
-        if isinstance(event, fluid.EndIteration):
-            if (event.batch_id % 10) == 0:
-                avg_cost, accuracy = trainer.test(reader=test_reader)
+        if isinstance(event, fluid.EndStepEvent):
+            avg_cost, accuracy = trainer.test(
+                reader=test_reader, feed_order=['pixel', 'label'])
 
-                print('BatchID {1:04}, Loss {2:2.2}, Acc {3:2.2}'.format(
-                    event.batch_id + 1, avg_cost, accuracy))
+            print('Loss {0:2.2}, Acc {1:2.2}'.format(avg_cost, accuracy))
 
-                if accuracy > 0.01:  # Low threshold for speeding up CI
-                    trainer.params.save(save_path)
-                    return
+            if accuracy > 0.01:  # Low threshold for speeding up CI
+                if params_dirname is not None:
+                    trainer.save_params(params_dirname)
+                return
 
     place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
     trainer = fluid.Trainer(
-        train_network,
-        optimizer=fluid.optimizer.Adam(learning_rate=0.001),
-        place=place,
-        event_handler=event_handler)
-    trainer.train(train_reader, EPOCH_NUM, event_handler=event_handler)
+        train_func=train_program, optimizer_func=optimizer_func, place=place)
+
+    trainer.train(
+        reader=train_reader,
+        num_epochs=EPOCH_NUM,
+        event_handler=event_handler,
+        feed_order=['pixel', 'label'])
 
 
-def infer(use_cuda, save_path):
-    params = fluid.Params(save_path)
+def infer(use_cuda, inference_program, params_dirname=None):
     place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-    inferencer = fluid.Inferencer(inference_network, params, place=place)
+    inferencer = fluid.Inferencer(
+        infer_func=inference_program, param_path=params_dirname, place=place)
 
     # The input's dimension of conv should be 4-D or 5-D.
     # Use normilized image pixels as input data, which should be in the range
@@ -135,8 +142,16 @@ def main(use_cuda):
     if use_cuda and not fluid.core.is_compiled_with_cuda():
         return
     save_path = "image_classification_resnet.inference.model"
-    train(use_cuda, save_path)
-    infer(use_cuda, save_path)
+
+    train(
+        use_cuda=use_cuda,
+        train_program=train_network,
+        params_dirname=save_path)
+
+    infer(
+        use_cuda=use_cuda,
+        inference_program=inference_network,
+        params_dirname=save_path)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/book/image_classification/notest_image_classification_vgg.py b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py
similarity index 70%
rename from python/paddle/fluid/tests/book/image_classification/notest_image_classification_vgg.py
rename to python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py
index e83afeed2f72635a40aa2ac21dc0c8611c309de4..224cca417e717bbcc54b58be6ac0219be207dea3 100644
--- a/python/paddle/fluid/tests/book/image_classification/notest_image_classification_vgg.py
+++ b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py
@@ -17,6 +17,7 @@ from __future__ import print_function
 import paddle
 import paddle.fluid as fluid
 import numpy
+import cifar10_small_test_set
 
 
 def vgg16_bn_drop(input):
@@ -60,46 +61,50 @@ def train_network():
     cost = fluid.layers.cross_entropy(input=predict, label=label)
     avg_cost = fluid.layers.mean(cost)
     accuracy = fluid.layers.accuracy(input=predict, label=label)
-    return avg_cost, accuracy
+    return [avg_cost, accuracy]
 
 
-def train(use_cuda, save_path):
-    BATCH_SIZE = 128
-    EPOCH_NUM = 1
+def optimizer_func():
+    return fluid.optimizer.Adam(learning_rate=0.001)
+
 
+def train(use_cuda, train_program, params_dirname):
+    BATCH_SIZE = 128
     train_reader = paddle.batch(
         paddle.reader.shuffle(
-            paddle.dataset.cifar.train10(), buf_size=128 * 10),
+            cifar10_small_test_set.train10(batch_size=10), buf_size=128 * 10),
         batch_size=BATCH_SIZE)
 
     test_reader = paddle.batch(
         paddle.dataset.cifar.test10(), batch_size=BATCH_SIZE)
 
     def event_handler(event):
-        if isinstance(event, fluid.EndIteration):
-            if (event.batch_id % 10) == 0:
-                avg_cost, accuracy = trainer.test(reader=test_reader)
+        if isinstance(event, fluid.EndStepEvent):
+            avg_cost, accuracy = trainer.test(
+                reader=test_reader, feed_order=['pixel', 'label'])
 
-                print('BatchID {1:04}, Loss {2:2.2}, Acc {3:2.2}'.format(
-                    event.batch_id + 1, avg_cost, accuracy))
+            print('Loss {0:2.2}, Acc {1:2.2}'.format(avg_cost, accuracy))
 
-                if accuracy > 0.01:  # Low threshold for speeding up CI
-                    trainer.params.save(save_path)
-                    return
+            if accuracy > 0.01:  # Low threshold for speeding up CI
+                if params_dirname is not None:
+                    trainer.save_params(params_dirname)
+                return
 
     place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
     trainer = fluid.Trainer(
-        train_network,
-        optimizer=fluid.optimizer.Adam(learning_rate=0.001),
-        place=place,
-        event_handler=event_handler)
-    trainer.train(train_reader, EPOCH_NUM, event_handler=event_handler)
+        train_func=train_program, place=place, optimizer_func=optimizer_func)
 
+    trainer.train(
+        reader=train_reader,
+        num_epochs=1,
+        event_handler=event_handler,
+        feed_order=['pixel', 'label'])
 
-def infer(use_cuda, save_path):
-    params = fluid.Params(save_path)
+
+def infer(use_cuda, inference_program, params_dirname=None):
     place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-    inferencer = fluid.Inferencer(inference_network, params, place=place)
+    inferencer = fluid.Inferencer(
+        infer_func=inference_program, param_path=params_dirname, place=place)
 
     # The input's dimension of conv should be 4-D or 5-D.
     # Use normilized image pixels as input data, which should be in the range
@@ -114,8 +119,16 @@ def main(use_cuda):
     if use_cuda and not fluid.core.is_compiled_with_cuda():
         return
     save_path = "image_classification_vgg.inference.model"
-    train(use_cuda, save_path)
-    infer(use_cuda, save_path)
+
+    train(
+        use_cuda=use_cuda,
+        train_program=train_network,
+        params_dirname=save_path)
+
+    infer(
+        use_cuda=use_cuda,
+        inference_program=inference_network,
+        params_dirname=save_path)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/CMakeLists.txt b/python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..673c965b662a022739f8d489c331f4de9455a926
--- /dev/null
+++ b/python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/CMakeLists.txt
@@ -0,0 +1,7 @@
+file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
+string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
+
+# default test
+foreach(src ${TEST_OPS})
+    py_test(${src} SRCS ${src}.py)
+endforeach()
diff --git a/python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/test_label_semantic_roles_newapi.py b/python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/test_label_semantic_roles_newapi.py
new file mode 100755
index 0000000000000000000000000000000000000000..0ccb3a39e02ea0c24bdfe01c5eba73b92da88a04
--- /dev/null
+++ b/python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/test_label_semantic_roles_newapi.py
@@ -0,0 +1,265 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import paddle
+import paddle.fluid as fluid
+import numpy as np
+
+WORD_DICT, VERB_DICT, LABEL_DICT = paddle.dataset.conll05.get_dict()
+WORD_DICT_LEN = len(WORD_DICT)
+LABEL_DICT_LEN = len(LABEL_DICT)
+PRED_DICT_LEN = len(VERB_DICT)
+MARK_DICT_LEN = 2
+IS_SPARSE = True
+BATCH_SIZE = 10
+EMBEDDING_NAME = 'emb'
+
+
+def lstm_net():
+    WORD_DIM = 32
+    MARK_DIM = 5
+    HIDDEN_DIM = 512
+    DEPTH = 8
+
+    # Data definitions
+    word = fluid.layers.data(
+        name='word_data', shape=[1], dtype='int64', lod_level=1)
+    predicate = fluid.layers.data(
+        name='verb_data', shape=[1], dtype='int64', lod_level=1)
+    ctx_n2 = fluid.layers.data(
+        name='ctx_n2_data', shape=[1], dtype='int64', lod_level=1)
+    ctx_n1 = fluid.layers.data(
+        name='ctx_n1_data', shape=[1], dtype='int64', lod_level=1)
+    ctx_0 = fluid.layers.data(
+        name='ctx_0_data', shape=[1], dtype='int64', lod_level=1)
+    ctx_p1 = fluid.layers.data(
+        name='ctx_p1_data', shape=[1], dtype='int64', lod_level=1)
+    ctx_p2 = fluid.layers.data(
+        name='ctx_p2_data', shape=[1], dtype='int64', lod_level=1)
+    mark = fluid.layers.data(
+        name='mark_data', shape=[1], dtype='int64', lod_level=1)
+
+    # 8 features
+    predicate_embedding = fluid.layers.embedding(
+        input=predicate,
+        size=[PRED_DICT_LEN, WORD_DIM],
+        dtype='float32',
+        is_sparse=IS_SPARSE,
+        param_attr='vemb')
+
+    mark_embedding = fluid.layers.embedding(
+        input=mark,
+        size=[MARK_DICT_LEN, MARK_DIM],
+        dtype='float32',
+        is_sparse=IS_SPARSE)
+
+    word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2]
+    emb_layers = [
+        fluid.layers.embedding(
+            size=[WORD_DICT_LEN, WORD_DIM],
+            input=x,
+            param_attr=fluid.ParamAttr(name=EMBEDDING_NAME))
+        for x in word_input
+        #name=EMBEDDING_NAME, trainable=False)) for x in word_input
+    ]
+    emb_layers.append(predicate_embedding)
+    emb_layers.append(mark_embedding)
+
+    hidden_0_layers = [
+        fluid.layers.fc(input=emb, size=HIDDEN_DIM, act='tanh')
+        for emb in emb_layers
+    ]
+
+    hidden_0 = fluid.layers.sums(input=hidden_0_layers)
+
+    lstm_0 = fluid.layers.dynamic_lstm(
+        input=hidden_0,
+        size=HIDDEN_DIM,
+        candidate_activation='relu',
+        gate_activation='sigmoid',
+        cell_activation='sigmoid')
+
+    # stack L-LSTM and R-LSTM with direct edges
+    input_tmp = [hidden_0, lstm_0]
+
+    for i in range(1, DEPTH):
+        mix_hidden = fluid.layers.sums(input=[
+            fluid.layers.fc(input=input_tmp[0], size=HIDDEN_DIM, act='tanh'),
+            fluid.layers.fc(input=input_tmp[1], size=HIDDEN_DIM, act='tanh')
+        ])
+
+        lstm = fluid.layers.dynamic_lstm(
+            input=mix_hidden,
+            size=HIDDEN_DIM,
+            candidate_activation='relu',
+            gate_activation='sigmoid',
+            cell_activation='sigmoid',
+            is_reverse=((i % 2) == 1))
+
+        input_tmp = [mix_hidden, lstm]
+
+    feature_out = fluid.layers.sums(input=[
+        fluid.layers.fc(input=input_tmp[0], size=LABEL_DICT_LEN, act='tanh'),
+        fluid.layers.fc(input=input_tmp[1], size=LABEL_DICT_LEN, act='tanh')
+    ])
+
+    return feature_out
+
+
+def inference_program():
+    predict = lstm_net()
+
+    return predict
+
+
+def train_program():
+    MIX_HIDDEN_LR = 1e-3
+
+    predict = lstm_net()
+    target = fluid.layers.data(
+        name='target', shape=[1], dtype='int64', lod_level=1)
+    crf_cost = fluid.layers.linear_chain_crf(
+        input=predict,
+        label=target,
+        param_attr=fluid.ParamAttr(
+            name='crfw', learning_rate=MIX_HIDDEN_LR))
+    avg_cost = fluid.layers.mean(crf_cost)
+
+    return [avg_cost]
+
+
+def optimize_func():
+    return fluid.optimizer.SGD(learning_rate=fluid.layers.exponential_decay(
+        learning_rate=0.01, decay_steps=100000, decay_rate=0.5, staircase=True))
+
+
+def train(use_cuda, train_program, params_dirname):
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+
+    trainer = fluid.Trainer(
+        train_func=train_program, place=place, optimizer_func=optimize_func)
+
+    feed_order = [
+        'word_data', 'ctx_n2_data', 'ctx_n1_data', 'ctx_0_data', 'ctx_p1_data',
+        'ctx_p2_data', 'verb_data', 'mark_data', 'target'
+    ]
+
+    #embedding_param = fluid.global_scope().find_var(
+    #        EMBEDDING_NAME).get_tensor()
+    #embedding_param.set(
+    #        load_parameter(conll05.get_embedding(), WORD_DICT_LEN, WORD_DIM),
+    #        place)
+
+    def event_handler(event):
+        if isinstance(event, fluid.EndEpochEvent):
+            test_reader = paddle.batch(
+                paddle.dataset.conll05.test(), batch_size=BATCH_SIZE)
+            avg_cost_set = trainer.test(
+                reader=test_reader, feed_order=feed_order)
+
+            # get avg cost
+            avg_cost = np.array(avg_cost_set).mean()
+
+            print("avg_cost: %s" % avg_cost)
+
+            if float(avg_cost) < 100.0:  # Large value to increase CI speed
+                trainer.save_params(params_dirname)
+            else:
+                print('BatchID {0}, Test Loss {1:0.2}'.format(event.epoch + 1,
+                                                              float(avg_cost)))
+                if math.isnan(float(avg_cost)):
+                    sys.exit("got NaN loss, training failed.")
+
+        elif isinstance(event, fluid.EndStepEvent):
+            print("Step {0}, Epoch {1} Metrics {2}".format(
+                event.step, event.epoch, map(np.array, event.metrics)))
+            if event.step == 1:  # Run 2 iterations to speed CI
+                trainer.save_params(params_dirname)
+                trainer.stop()
+
+    train_reader = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.conll05.test(), buf_size=8192),
+        batch_size=BATCH_SIZE)
+    trainer.train(
+        num_epochs=1,
+        event_handler=event_handler,
+        reader=train_reader,
+        feed_order=feed_order)
+
+
+def infer(use_cuda, inference_program, params_dirname):
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    inferencer = fluid.Inferencer(
+        inference_program, param_path=params_dirname, place=place)
+
+    # Setup inputs by creating LoDTensors to represent sequences of words.
+    # Here each word is the basic element of these LoDTensors and the shape of 
+    # each word (base_shape) should be [1] since it is simply an index to 
+    # look up for the corresponding word vector.
+    # Suppose the length_based level of detail (lod) info is set to [[3, 4, 2]],
+    # which has only one lod level. Then the created LoDTensors will have only 
+    # one higher level structure (sequence of words, or sentence) than the basic 
+    # element (word). Hence the LoDTensor will hold data for three sentences of 
+    # length 3, 4 and 2, respectively. 
+    # Note that lod info should be a list of lists.
+    lod = [[3, 4, 2]]
+    base_shape = [1]
+    # The range of random integers is [low, high]
+    word = fluid.create_random_int_lodtensor(
+        lod, base_shape, place, low=0, high=WORD_DICT_LEN - 1)
+    ctx_n2 = fluid.create_random_int_lodtensor(
+        lod, base_shape, place, low=0, high=WORD_DICT_LEN - 1)
+    ctx_n1 = fluid.create_random_int_lodtensor(
+        lod, base_shape, place, low=0, high=WORD_DICT_LEN - 1)
+    ctx_0 = fluid.create_random_int_lodtensor(
+        lod, base_shape, place, low=0, high=WORD_DICT_LEN - 1)
+    ctx_p1 = fluid.create_random_int_lodtensor(
+        lod, base_shape, place, low=0, high=WORD_DICT_LEN - 1)
+    ctx_p2 = fluid.create_random_int_lodtensor(
+        lod, base_shape, place, low=0, high=WORD_DICT_LEN - 1)
+    pred = fluid.create_random_int_lodtensor(
+        lod, base_shape, place, low=0, high=PRED_DICT_LEN - 1)
+    mark = fluid.create_random_int_lodtensor(
+        lod, base_shape, place, low=0, high=MARK_DICT_LEN - 1)
+
+    results = inferencer.infer(
+        {
+            'word_data': word,
+            'ctx_n2_data': ctx_n2,
+            'ctx_n1_data': ctx_n1,
+            'ctx_0_data': ctx_0,
+            'ctx_p1_data': ctx_p1,
+            'ctx_p2_data': ctx_p2,
+            'verb_data': pred,
+            'mark_data': mark
+        },
+        return_numpy=False)
+
+    print("infer results: ", np.array(results[0]).shape)
+
+
+def main(use_cuda):
+    if use_cuda and not fluid.core.is_compiled_with_cuda():
+        return
+    params_dirname = "label_semantic_roles.inference.model"
+    train(use_cuda, train_program, params_dirname)
+    infer(use_cuda, inference_program, params_dirname)
+
+
+if __name__ == '__main__':
+    for use_cuda in (False, True):
+        main(use_cuda=use_cuda)
diff --git a/python/paddle/fluid/tests/book/high-level-api/machine_translation/CMakeLists.txt b/python/paddle/fluid/tests/book/high-level-api/machine_translation/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..673c965b662a022739f8d489c331f4de9455a926
--- /dev/null
+++ b/python/paddle/fluid/tests/book/high-level-api/machine_translation/CMakeLists.txt
@@ -0,0 +1,7 @@
+file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
+string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
+
+# default test
+foreach(src ${TEST_OPS})
+    py_test(${src} SRCS ${src}.py)
+endforeach()
diff --git a/python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py b/python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py
new file mode 100644
index 0000000000000000000000000000000000000000..c4b37df3a09f93fe965ae28ce783f06f5018020d
--- /dev/null
+++ b/python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py
@@ -0,0 +1,306 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import contextlib
+
+import numpy as np
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.framework as framework
+import paddle.fluid.layers as pd
+from paddle.fluid.executor import Executor
+from functools import partial
+import unittest
+import os
+
+dict_size = 30000
+source_dict_dim = target_dict_dim = dict_size
+hidden_dim = 32
+word_dim = 16
+batch_size = 2
+max_length = 8
+topk_size = 50
+trg_dic_size = 10000
+beam_size = 2
+
+decoder_size = hidden_dim
+
+
+def encoder(is_sparse):
+    # encoder
+    src_word_id = pd.data(
+        name="src_word_id", shape=[1], dtype='int64', lod_level=1)
+    src_embedding = pd.embedding(
+        input=src_word_id,
+        size=[dict_size, word_dim],
+        dtype='float32',
+        is_sparse=is_sparse,
+        param_attr=fluid.ParamAttr(name='vemb'))
+
+    fc1 = pd.fc(input=src_embedding, size=hidden_dim * 4, act='tanh')
+    lstm_hidden0, lstm_0 = pd.dynamic_lstm(input=fc1, size=hidden_dim * 4)
+    encoder_out = pd.sequence_last_step(input=lstm_hidden0)
+    return encoder_out
+
+
+def train_decoder(context, is_sparse):
+    # decoder
+    trg_language_word = pd.data(
+        name="target_language_word", shape=[1], dtype='int64', lod_level=1)
+    trg_embedding = pd.embedding(
+        input=trg_language_word,
+        size=[dict_size, word_dim],
+        dtype='float32',
+        is_sparse=is_sparse,
+        param_attr=fluid.ParamAttr(name='vemb'))
+
+    rnn = pd.DynamicRNN()
+    with rnn.block():
+        current_word = rnn.step_input(trg_embedding)
+        pre_state = rnn.memory(init=context)
+        current_state = pd.fc(input=[current_word, pre_state],
+                              size=decoder_size,
+                              act='tanh')
+
+        current_score = pd.fc(input=current_state,
+                              size=target_dict_dim,
+                              act='softmax')
+        rnn.update_memory(pre_state, current_state)
+        rnn.output(current_score)
+
+    return rnn()
+
+
+def decode(context, is_sparse):
+    init_state = context
+    array_len = pd.fill_constant(shape=[1], dtype='int64', value=max_length)
+    counter = pd.zeros(shape=[1], dtype='int64', force_cpu=True)
+
+    # fill the first element with init_state
+    state_array = pd.create_array('float32')
+    pd.array_write(init_state, array=state_array, i=counter)
+
+    # ids, scores as memory
+    ids_array = pd.create_array('int64')
+    scores_array = pd.create_array('float32')
+
+    init_ids = pd.data(name="init_ids", shape=[1], dtype="int64", lod_level=2)
+    init_scores = pd.data(
+        name="init_scores", shape=[1], dtype="float32", lod_level=2)
+
+    pd.array_write(init_ids, array=ids_array, i=counter)
+    pd.array_write(init_scores, array=scores_array, i=counter)
+
+    cond = pd.less_than(x=counter, y=array_len)
+
+    while_op = pd.While(cond=cond)
+    with while_op.block():
+        pre_ids = pd.array_read(array=ids_array, i=counter)
+        pre_state = pd.array_read(array=state_array, i=counter)
+        pre_score = pd.array_read(array=scores_array, i=counter)
+
+        # expand the lod of pre_state to be the same with pre_score
+        pre_state_expanded = pd.sequence_expand(pre_state, pre_score)
+
+        pre_ids_emb = pd.embedding(
+            input=pre_ids,
+            size=[dict_size, word_dim],
+            dtype='float32',
+            is_sparse=is_sparse)
+
+        # use rnn unit to update rnn
+        current_state = pd.fc(input=[pre_state_expanded, pre_ids_emb],
+                              size=decoder_size,
+                              act='tanh')
+        current_state_with_lod = pd.lod_reset(x=current_state, y=pre_score)
+        # use score to do beam search
+        current_score = pd.fc(input=current_state_with_lod,
+                              size=target_dict_dim,
+                              act='softmax')
+        topk_scores, topk_indices = pd.topk(current_score, k=topk_size)
+        selected_ids, selected_scores = pd.beam_search(
+            pre_ids, topk_indices, topk_scores, beam_size, end_id=10, level=0)
+
+        pd.increment(x=counter, value=1, in_place=True)
+
+        # update the memories
+        pd.array_write(current_state, array=state_array, i=counter)
+        pd.array_write(selected_ids, array=ids_array, i=counter)
+        pd.array_write(selected_scores, array=scores_array, i=counter)
+
+        pd.less_than(x=counter, y=array_len, cond=cond)
+
+    translation_ids, translation_scores = pd.beam_search_decode(
+        ids=ids_array, scores=scores_array)
+
+    # return init_ids, init_scores
+
+    return translation_ids, translation_scores
+
+
+def train_program(is_sparse):
+    context = encoder(is_sparse)
+    rnn_out = train_decoder(context, is_sparse)
+    label = pd.data(
+        name="target_language_next_word", shape=[1], dtype='int64', lod_level=1)
+    cost = pd.cross_entropy(input=rnn_out, label=label)
+    avg_cost = pd.mean(cost)
+    return avg_cost
+
+
+def optimizer_func():
+    return fluid.optimizer.Adagrad(
+        learning_rate=1e-4,
+        regularization=fluid.regularizer.L2DecayRegularizer(
+            regularization_coeff=0.1))
+
+
+def train(use_cuda, is_sparse, is_local=True):
+    EPOCH_NUM = 1
+
+    if use_cuda and not fluid.core.is_compiled_with_cuda():
+        return
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+
+    train_reader = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.wmt14.train(dict_size), buf_size=1000),
+        batch_size=batch_size)
+
+    feed_order = [
+        'src_word_id', 'target_language_word', 'target_language_next_word'
+    ]
+
+    def event_handler(event):
+        if isinstance(event, fluid.EndStepEvent):
+            print('pass_id=' + str(event.epoch) + ' batch=' + str(event.step))
+            if event.step == 10:
+                trainer.stop()
+
+    trainer = fluid.Trainer(
+        train_func=partial(train_program, is_sparse),
+        place=place,
+        optimizer_func=optimizer_func)
+
+    trainer.train(
+        reader=train_reader,
+        num_epochs=EPOCH_NUM,
+        event_handler=event_handler,
+        feed_order=feed_order)
+
+
+def decode_main(use_cuda, is_sparse):
+    if use_cuda and not fluid.core.is_compiled_with_cuda():
+        return
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+
+    context = encoder(is_sparse)
+    translation_ids, translation_scores = decode(context, is_sparse)
+
+    exe = Executor(place)
+    exe.run(framework.default_startup_program())
+
+    init_ids_data = np.array([1 for _ in range(batch_size)], dtype='int64')
+    init_scores_data = np.array(
+        [1. for _ in range(batch_size)], dtype='float32')
+    init_ids_data = init_ids_data.reshape((batch_size, 1))
+    init_scores_data = init_scores_data.reshape((batch_size, 1))
+    init_lod = [1] * batch_size
+    init_lod = [init_lod, init_lod]
+
+    init_ids = fluid.create_lod_tensor(init_ids_data, init_lod, place)
+    init_scores = fluid.create_lod_tensor(init_scores_data, init_lod, place)
+
+    train_data = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.wmt14.train(dict_size), buf_size=1000),
+        batch_size=batch_size)
+
+    feed_order = ['src_word_id']
+    feed_list = [
+        framework.default_main_program().global_block().var(var_name)
+        for var_name in feed_order
+    ]
+    feeder = fluid.DataFeeder(feed_list, place)
+
+    for data in train_data():
+        feed_dict = feeder.feed(map(lambda x: [x[0]], data))
+        feed_dict['init_ids'] = init_ids
+        feed_dict['init_scores'] = init_scores
+
+        result_ids, result_scores = exe.run(
+            framework.default_main_program(),
+            feed=feed_dict,
+            fetch_list=[translation_ids, translation_scores],
+            return_numpy=False)
+        print result_ids.lod()
+        break
+
+
+class TestMachineTranslation(unittest.TestCase):
+    pass
+
+
+@contextlib.contextmanager
+def scope_prog_guard():
+    prog = fluid.Program()
+    startup_prog = fluid.Program()
+    scope = fluid.core.Scope()
+    with fluid.scope_guard(scope):
+        with fluid.program_guard(prog, startup_prog):
+            yield
+
+
+def inject_test_train(use_cuda, is_sparse):
+    f_name = 'test_{0}_{1}_train'.format('cuda' if use_cuda else 'cpu', 'sparse'
+                                         if is_sparse else 'dense')
+
+    def f(*args):
+        with scope_prog_guard():
+            train(use_cuda, is_sparse)
+
+    setattr(TestMachineTranslation, f_name, f)
+
+
+def inject_test_decode(use_cuda, is_sparse, decorator=None):
+    f_name = 'test_{0}_{1}_decode'.format('cuda'
+                                          if use_cuda else 'cpu', 'sparse'
+                                          if is_sparse else 'dense')
+
+    def f(*args):
+        with scope_prog_guard():
+            decode_main(use_cuda, is_sparse)
+
+    if decorator is not None:
+        f = decorator(f)
+
+    setattr(TestMachineTranslation, f_name, f)
+
+
+for _use_cuda_ in (False, True):
+    for _is_sparse_ in (False, True):
+        inject_test_train(_use_cuda_, _is_sparse_)
+
+for _use_cuda_ in (False, True):
+    for _is_sparse_ in (False, True):
+
+        _decorator_ = None
+        if _use_cuda_:
+            _decorator_ = unittest.skip(
+                reason='Beam Search does not support CUDA!')
+
+        inject_test_decode(
+            is_sparse=_is_sparse_, use_cuda=_use_cuda_, decorator=_decorator_)
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/CMakeLists.txt b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..673c965b662a022739f8d489c331f4de9455a926
--- /dev/null
+++ b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/CMakeLists.txt
@@ -0,0 +1,7 @@
+file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
+string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
+
+# default test
+foreach(src ${TEST_OPS})
+    py_test(${src} SRCS ${src}.py)
+endforeach()
diff --git a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py
new file mode 100644
index 0000000000000000000000000000000000000000..9a09db25dc0e2c71772aa06e6d0cf993321612e4
--- /dev/null
+++ b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py
@@ -0,0 +1,137 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import print_function
+import argparse
+import paddle.fluid as fluid
+import paddle
+import sys
+import numpy
+import unittest
+import math
+import sys
+import os
+
+BATCH_SIZE = 64
+
+
+def inference_program():
+    img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32')
+
+    conv_pool_1 = fluid.nets.simple_img_conv_pool(
+        input=img,
+        filter_size=5,
+        num_filters=20,
+        pool_size=2,
+        pool_stride=2,
+        act="relu")
+    conv_pool_1 = fluid.layers.batch_norm(conv_pool_1)
+    conv_pool_2 = fluid.nets.simple_img_conv_pool(
+        input=conv_pool_1,
+        filter_size=5,
+        num_filters=50,
+        pool_size=2,
+        pool_stride=2,
+        act="relu")
+    prediction = fluid.layers.fc(input=conv_pool_2, size=10, act='softmax')
+    return prediction
+
+
+def train_program():
+    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+
+    predict = inference_program()
+    cost = fluid.layers.cross_entropy(input=predict, label=label)
+    avg_cost = fluid.layers.mean(cost)
+    acc = fluid.layers.accuracy(input=predict, label=label)
+    return [avg_cost, acc]
+
+
+def optimizer_func():
+    return fluid.optimizer.Adam(learning_rate=0.001)
+
+
+def train(use_cuda, train_program, params_dirname):
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+
+    trainer = fluid.Trainer(
+        train_func=train_program,
+        place=place,
+        optimizer_func=optimizer_func,
+        parallel=True)
+
+    def event_handler(event):
+        if isinstance(event, fluid.EndEpochEvent):
+            test_reader = paddle.batch(
+                paddle.dataset.mnist.test(), batch_size=BATCH_SIZE)
+            avg_cost, acc = trainer.test(
+                reader=test_reader, feed_order=['img', 'label'])
+
+            print("avg_cost: %s" % avg_cost)
+            print("acc     : %s" % acc)
+
+            if acc > 0.2:  # Smaller value to increase CI speed
+                trainer.save_params(params_dirname)
+            else:
+                print('BatchID {0}, Test Loss {1:0.2}, Acc {2:0.2}'.format(
+                    event.epoch + 1, avg_cost, acc))
+                if math.isnan(avg_cost):
+                    sys.exit("got NaN loss, training failed.")
+        elif isinstance(event, fluid.EndStepEvent):
+            print("Step {0}, Epoch {1} Metrics {2}".format(
+                event.step, event.epoch, map(numpy.array, event.metrics)))
+
+    train_reader = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.mnist.train(), buf_size=500),
+        batch_size=BATCH_SIZE)
+
+    trainer.train(
+        num_epochs=1,
+        event_handler=event_handler,
+        reader=train_reader,
+        feed_order=['img', 'label'])
+
+
+def infer(use_cuda, inference_program, params_dirname=None):
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+
+    inferencer = fluid.Inferencer(
+        infer_func=inference_program, param_path=params_dirname, place=place)
+
+    batch_size = 1
+    tensor_img = numpy.random.uniform(-1.0, 1.0,
+                                      [batch_size, 1, 28, 28]).astype("float32")
+
+    results = inferencer.infer({'img': tensor_img})
+
+    print("infer results: ", results[0])
+
+
+def main(use_cuda):
+    params_dirname = "recognize_digits_conv.inference.model"
+
+    # call train() with is_local argument to run distributed train
+    train(
+        use_cuda=use_cuda,
+        train_program=train_program,
+        params_dirname=params_dirname)
+    infer(
+        use_cuda=use_cuda,
+        inference_program=inference_program,
+        params_dirname=params_dirname)
+
+
+if __name__ == '__main__':
+    # for use_cuda in (False, True):
+    main(use_cuda=True)
diff --git a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py
new file mode 100644
index 0000000000000000000000000000000000000000..b2b544e791d7ea35ff7d2c9a2dce7ce7f5680f38
--- /dev/null
+++ b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py
@@ -0,0 +1,118 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import print_function
+import argparse
+import paddle.fluid as fluid
+import paddle
+import sys
+import numpy
+import unittest
+import math
+import sys
+import os
+
+BATCH_SIZE = 64
+
+
+def inference_program():
+    img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32')
+
+    hidden = fluid.layers.fc(input=img, size=200, act='tanh')
+    hidden = fluid.layers.fc(input=hidden, size=200, act='tanh')
+    prediction = fluid.layers.fc(input=hidden, size=10, act='softmax')
+    return prediction
+
+
+def train_program():
+    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+
+    predict = inference_program()
+    cost = fluid.layers.cross_entropy(input=predict, label=label)
+    avg_cost = fluid.layers.mean(cost)
+    acc = fluid.layers.accuracy(input=predict, label=label)
+    return [avg_cost, acc]
+
+
+def optimizer_func():
+    return fluid.optimizer.Adam(learning_rate=0.001)
+
+
+def train(use_cuda, train_program, params_dirname):
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+
+    trainer = fluid.Trainer(
+        train_func=train_program, place=place, optimizer_func=optimizer_func)
+
+    def event_handler(event):
+        if isinstance(event, fluid.EndEpochEvent):
+            test_reader = paddle.batch(
+                paddle.dataset.mnist.test(), batch_size=BATCH_SIZE)
+            avg_cost, acc = trainer.test(
+                reader=test_reader, feed_order=['img', 'label'])
+
+            print("avg_cost: %s" % avg_cost)
+            print("acc     : %s" % acc)
+
+            if acc > 0.2:  # Smaller value to increase CI speed
+                trainer.save_params(params_dirname)
+            else:
+                print('BatchID {0}, Test Loss {1:0.2}, Acc {2:0.2}'.format(
+                    event.epoch + 1, avg_cost, acc))
+                if math.isnan(avg_cost):
+                    sys.exit("got NaN loss, training failed.")
+
+    train_reader = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.mnist.train(), buf_size=500),
+        batch_size=BATCH_SIZE)
+
+    trainer.train(
+        num_epochs=1,
+        event_handler=event_handler,
+        reader=train_reader,
+        feed_order=['img', 'label'])
+
+
+def infer(use_cuda, inference_program, params_dirname=None):
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+
+    inferencer = fluid.Inferencer(
+        infer_func=inference_program, param_path=params_dirname, place=place)
+
+    batch_size = 1
+    tensor_img = numpy.random.uniform(-1.0, 1.0,
+                                      [batch_size, 1, 28, 28]).astype("float32")
+
+    results = inferencer.infer({'img': tensor_img})
+
+    print("infer results: ", results[0])
+
+
+def main(use_cuda):
+    params_dirname = "recognize_digits_mlp.inference.model"
+
+    # call train() with is_local argument to run distributed train
+    train(
+        use_cuda=use_cuda,
+        train_program=train_program,
+        params_dirname=params_dirname)
+    infer(
+        use_cuda=use_cuda,
+        inference_program=inference_program,
+        params_dirname=params_dirname)
+
+
+if __name__ == '__main__':
+    # for use_cuda in (False, True):
+    main(use_cuda=False)
diff --git a/python/paddle/fluid/tests/book/high-level-api/recommender_system/CMakeLists.txt b/python/paddle/fluid/tests/book/high-level-api/recommender_system/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..673c965b662a022739f8d489c331f4de9455a926
--- /dev/null
+++ b/python/paddle/fluid/tests/book/high-level-api/recommender_system/CMakeLists.txt
@@ -0,0 +1,7 @@
+file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
+string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
+
+# default test
+foreach(src ${TEST_OPS})
+    py_test(${src} SRCS ${src}.py)
+endforeach()
diff --git a/python/paddle/fluid/tests/book/high-level-api/recommender_system/test_recommender_system_newapi.py b/python/paddle/fluid/tests/book/high-level-api/recommender_system/test_recommender_system_newapi.py
new file mode 100644
index 0000000000000000000000000000000000000000..090c11ce1e79201f0d65d3540527791ab2191d4a
--- /dev/null
+++ b/python/paddle/fluid/tests/book/high-level-api/recommender_system/test_recommender_system_newapi.py
@@ -0,0 +1,258 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+import sys
+import numpy as np
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.layers as layers
+import paddle.fluid.nets as nets
+
+IS_SPARSE = True
+USE_GPU = False
+BATCH_SIZE = 256
+
+
+def get_usr_combined_features():
+    # FIXME(dzh) : old API integer_value(10) may have range check.
+    # currently we don't have user configurated check.
+
+    USR_DICT_SIZE = paddle.dataset.movielens.max_user_id() + 1
+
+    uid = layers.data(name='user_id', shape=[1], dtype='int64')
+
+    usr_emb = layers.embedding(
+        input=uid,
+        dtype='float32',
+        size=[USR_DICT_SIZE, 32],
+        param_attr='user_table',
+        is_sparse=IS_SPARSE)
+
+    usr_fc = layers.fc(input=usr_emb, size=32)
+
+    USR_GENDER_DICT_SIZE = 2
+
+    usr_gender_id = layers.data(name='gender_id', shape=[1], dtype='int64')
+
+    usr_gender_emb = layers.embedding(
+        input=usr_gender_id,
+        size=[USR_GENDER_DICT_SIZE, 16],
+        param_attr='gender_table',
+        is_sparse=IS_SPARSE)
+
+    usr_gender_fc = layers.fc(input=usr_gender_emb, size=16)
+
+    USR_AGE_DICT_SIZE = len(paddle.dataset.movielens.age_table)
+    usr_age_id = layers.data(name='age_id', shape=[1], dtype="int64")
+
+    usr_age_emb = layers.embedding(
+        input=usr_age_id,
+        size=[USR_AGE_DICT_SIZE, 16],
+        is_sparse=IS_SPARSE,
+        param_attr='age_table')
+
+    usr_age_fc = layers.fc(input=usr_age_emb, size=16)
+
+    USR_JOB_DICT_SIZE = paddle.dataset.movielens.max_job_id() + 1
+    usr_job_id = layers.data(name='job_id', shape=[1], dtype="int64")
+
+    usr_job_emb = layers.embedding(
+        input=usr_job_id,
+        size=[USR_JOB_DICT_SIZE, 16],
+        param_attr='job_table',
+        is_sparse=IS_SPARSE)
+
+    usr_job_fc = layers.fc(input=usr_job_emb, size=16)
+
+    concat_embed = layers.concat(
+        input=[usr_fc, usr_gender_fc, usr_age_fc, usr_job_fc], axis=1)
+
+    usr_combined_features = layers.fc(input=concat_embed, size=200, act="tanh")
+
+    return usr_combined_features
+
+
+def get_mov_combined_features():
+
+    MOV_DICT_SIZE = paddle.dataset.movielens.max_movie_id() + 1
+
+    mov_id = layers.data(name='movie_id', shape=[1], dtype='int64')
+
+    mov_emb = layers.embedding(
+        input=mov_id,
+        dtype='float32',
+        size=[MOV_DICT_SIZE, 32],
+        param_attr='movie_table',
+        is_sparse=IS_SPARSE)
+
+    mov_fc = layers.fc(input=mov_emb, size=32)
+
+    CATEGORY_DICT_SIZE = len(paddle.dataset.movielens.movie_categories())
+
+    category_id = layers.data(
+        name='category_id', shape=[1], dtype='int64', lod_level=1)
+
+    mov_categories_emb = layers.embedding(
+        input=category_id, size=[CATEGORY_DICT_SIZE, 32], is_sparse=IS_SPARSE)
+
+    mov_categories_hidden = layers.sequence_pool(
+        input=mov_categories_emb, pool_type="sum")
+
+    MOV_TITLE_DICT_SIZE = len(paddle.dataset.movielens.get_movie_title_dict())
+
+    mov_title_id = layers.data(
+        name='movie_title', shape=[1], dtype='int64', lod_level=1)
+
+    mov_title_emb = layers.embedding(
+        input=mov_title_id, size=[MOV_TITLE_DICT_SIZE, 32], is_sparse=IS_SPARSE)
+
+    mov_title_conv = nets.sequence_conv_pool(
+        input=mov_title_emb,
+        num_filters=32,
+        filter_size=3,
+        act="tanh",
+        pool_type="sum")
+
+    concat_embed = layers.concat(
+        input=[mov_fc, mov_categories_hidden, mov_title_conv], axis=1)
+
+    # FIXME(dzh) : need tanh operator
+    mov_combined_features = layers.fc(input=concat_embed, size=200, act="tanh")
+
+    return mov_combined_features
+
+
+def inference_program():
+    usr_combined_features = get_usr_combined_features()
+    mov_combined_features = get_mov_combined_features()
+
+    inference = layers.cos_sim(X=usr_combined_features, Y=mov_combined_features)
+    scale_infer = layers.scale(x=inference, scale=5.0)
+
+    return scale_infer
+
+
+def train_program():
+
+    scale_infer = inference_program()
+
+    label = layers.data(name='score', shape=[1], dtype='float32')
+    square_cost = layers.square_error_cost(input=scale_infer, label=label)
+    avg_cost = layers.mean(square_cost)
+
+    return [avg_cost, scale_infer]
+
+
+def optimizer_func():
+    return fluid.optimizer.SGD(learning_rate=0.2)
+
+
+def train(use_cuda, train_program, params_dirname):
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+
+    trainer = fluid.Trainer(
+        train_func=train_program, place=place, optimizer_func=optimizer_func)
+
+    feed_order = [
+        'user_id', 'gender_id', 'age_id', 'job_id', 'movie_id', 'category_id',
+        'movie_title', 'score'
+    ]
+
+    def event_handler(event):
+        if isinstance(event, fluid.EndStepEvent):
+            test_reader = paddle.batch(
+                paddle.dataset.movielens.test(), batch_size=BATCH_SIZE)
+            avg_cost_set = trainer.test(
+                reader=test_reader, feed_order=feed_order)
+
+            # get avg cost
+            avg_cost = np.array(avg_cost_set).mean()
+
+            print("avg_cost: %s" % avg_cost)
+
+            if float(avg_cost) < 4:  # Smaller value to increase CI speed
+                trainer.save_params(params_dirname)
+                trainer.stop()
+            else:
+                print('BatchID {0}, Test Loss {1:0.2}'.format(event.epoch + 1,
+                                                              float(avg_cost)))
+                if math.isnan(float(avg_cost)):
+                    sys.exit("got NaN loss, training failed.")
+
+    train_reader = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.movielens.train(), buf_size=8192),
+        batch_size=BATCH_SIZE)
+
+    trainer.train(
+        num_epochs=1,
+        event_handler=event_handler,
+        reader=train_reader,
+        feed_order=feed_order)
+
+
+def infer(use_cuda, inference_program, params_dirname):
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    inferencer = fluid.Inferencer(
+        inference_program, param_path=params_dirname, place=place)
+
+    # Use the first data from paddle.dataset.movielens.test() as input.
+    # Use create_lod_tensor(data, lod, place) API to generate LoD Tensor,
+    # where `data` is a list of sequences of index numbers, `lod` is 
+    # the level of detail (lod) info associated with `data`.
+    # For example, data = [[10, 2, 3], [2, 3]] means that it contains
+    # two sequences of indexes, of length 3 and 2, respectively.
+    # Correspondingly, lod = [[3, 2]] contains one level of detail info,
+    # indicating that `data` consists of two sequences of length 3 and 2. 
+    user_id = fluid.create_lod_tensor([[1]], [[1]], place)
+    gender_id = fluid.create_lod_tensor([[1]], [[1]], place)
+    age_id = fluid.create_lod_tensor([[0]], [[1]], place)
+    job_id = fluid.create_lod_tensor([[10]], [[1]], place)
+    movie_id = fluid.create_lod_tensor([[783]], [[1]], place)
+    category_id = fluid.create_lod_tensor([[10, 8, 9]], [[3]], place)
+    movie_title = fluid.create_lod_tensor([[1069, 4140, 2923, 710, 988]], [[5]],
+                                          place)
+
+    results = inferencer.infer(
+        {
+            'user_id': user_id,
+            'gender_id': gender_id,
+            'age_id': age_id,
+            'job_id': job_id,
+            'movie_id': movie_id,
+            'category_id': category_id,
+            'movie_title': movie_title
+        },
+        return_numpy=False)
+
+    print("infer results: ", np.array(results[0]))
+
+
+def main(use_cuda):
+    if use_cuda and not fluid.core.is_compiled_with_cuda():
+        return
+    params_dirname = "recommender_system.inference.model"
+    train(
+        use_cuda=use_cuda,
+        train_program=train_program,
+        params_dirname=params_dirname)
+    infer(
+        use_cuda=use_cuda,
+        inference_program=inference_program,
+        params_dirname=params_dirname)
+
+
+if __name__ == '__main__':
+    main(USE_GPU)
diff --git a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/CMakeLists.txt b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..d71147a85e77ea6dc5b6391aa169abd9b02a0aa1
--- /dev/null
+++ b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/CMakeLists.txt
@@ -0,0 +1,12 @@
+file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
+string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
+
+# This test is buggy
+# py_test(test_understand_sentiment_dynamic_rnn SRCS
+# 	test_understand_sentiment_dynamic_rnn.py SERIAL)
+LIST(REMOVE_ITEM TEST_OPS test_understand_sentiment_dynamic_rnn)
+
+# default test
+foreach(src ${TEST_OPS})
+    py_test(${src} SRCS ${src}.py)
+endforeach()
diff --git a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_conv.py b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_conv.py
new file mode 100644
index 0000000000000000000000000000000000000000..9b61f7a00ce5e2a08c2105fb7f50e6868ef25df3
--- /dev/null
+++ b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_conv.py
@@ -0,0 +1,156 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import paddle
+import paddle.fluid as fluid
+from functools import partial
+import numpy as np
+
+CLASS_DIM = 2
+EMB_DIM = 128
+HID_DIM = 512
+BATCH_SIZE = 128
+
+
+def convolution_net(data, input_dim, class_dim, emb_dim, hid_dim):
+    emb = fluid.layers.embedding(
+        input=data, size=[input_dim, emb_dim], is_sparse=True)
+    conv_3 = fluid.nets.sequence_conv_pool(
+        input=emb,
+        num_filters=hid_dim,
+        filter_size=3,
+        act="tanh",
+        pool_type="sqrt")
+    conv_4 = fluid.nets.sequence_conv_pool(
+        input=emb,
+        num_filters=hid_dim,
+        filter_size=4,
+        act="tanh",
+        pool_type="sqrt")
+    prediction = fluid.layers.fc(input=[conv_3, conv_4],
+                                 size=class_dim,
+                                 act="softmax")
+    return prediction
+
+
+def inference_program(word_dict):
+    data = fluid.layers.data(
+        name="words", shape=[1], dtype="int64", lod_level=1)
+
+    dict_dim = len(word_dict)
+    net = convolution_net(data, dict_dim, CLASS_DIM, EMB_DIM, HID_DIM)
+    return net
+
+
+def train_program(word_dict):
+    prediction = inference_program(word_dict)
+    label = fluid.layers.data(name="label", shape=[1], dtype="int64")
+    cost = fluid.layers.cross_entropy(input=prediction, label=label)
+    avg_cost = fluid.layers.mean(cost)
+    accuracy = fluid.layers.accuracy(input=prediction, label=label)
+    return [avg_cost, accuracy]
+
+
+def optimizer_func():
+    return fluid.optimizer.Adagrad(learning_rate=0.002)
+
+
+def train(use_cuda, train_program, params_dirname):
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+
+    word_dict = paddle.dataset.imdb.word_dict()
+    trainer = fluid.Trainer(
+        train_func=partial(train_program, word_dict),
+        place=place,
+        optimizer_func=optimizer_func)
+
+    def event_handler(event):
+        if isinstance(event, fluid.EndEpochEvent):
+            test_reader = paddle.batch(
+                paddle.dataset.imdb.test(word_dict), batch_size=BATCH_SIZE)
+            avg_cost, acc = trainer.test(
+                reader=test_reader, feed_order=['words', 'label'])
+
+            print("avg_cost: %s" % avg_cost)
+            print("acc     : %s" % acc)
+
+            if acc > 0.2:  # Smaller value to increase CI speed
+                trainer.save_params(params_dirname)
+                trainer.stop()
+
+            else:
+                print('BatchID {0}, Test Loss {1:0.2}, Acc {2:0.2}'.format(
+                    event.epoch + 1, avg_cost, acc))
+                if math.isnan(avg_cost):
+                    sys.exit("got NaN loss, training failed.")
+        elif isinstance(event, fluid.EndStepEvent):
+            print("Step {0}, Epoch {1} Metrics {2}".format(
+                event.step, event.epoch, map(np.array, event.metrics)))
+            if event.step == 1:  # Run 2 iterations to speed CI
+                trainer.save_params(params_dirname)
+                trainer.stop()
+
+    train_reader = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.imdb.train(word_dict), buf_size=25000),
+        batch_size=BATCH_SIZE)
+
+    trainer.train(
+        num_epochs=1,
+        event_handler=event_handler,
+        reader=train_reader,
+        feed_order=['words', 'label'])
+
+
+def infer(use_cuda, inference_program, params_dirname=None):
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    word_dict = paddle.dataset.imdb.word_dict()
+
+    inferencer = fluid.Inferencer(
+        infer_func=partial(inference_program, word_dict),
+        param_path=params_dirname,
+        place=place)
+
+    # Setup input by creating LoDTensor to represent sequence of words.
+    # Here each word is the basic element of the LoDTensor and the shape of 
+    # each word (base_shape) should be [1] since it is simply an index to 
+    # look up for the corresponding word vector.
+    # Suppose the length_based level of detail (lod) info is set to [[3, 4, 2]],
+    # which has only one lod level. Then the created LoDTensor will have only 
+    # one higher level structure (sequence of words, or sentence) than the basic 
+    # element (word). Hence the LoDTensor will hold data for three sentences of 
+    # length 3, 4 and 2, respectively. 
+    # Note that lod info should be a list of lists.
+    lod = [[3, 4, 2]]
+    base_shape = [1]
+    # The range of random integers is [low, high]
+    tensor_words = fluid.create_random_int_lodtensor(
+        lod, base_shape, place, low=0, high=len(word_dict) - 1)
+    results = inferencer.infer({'words': tensor_words})
+    print("infer results: ", results)
+
+
+def main(use_cuda):
+    if use_cuda and not fluid.core.is_compiled_with_cuda():
+        return
+    params_dirname = "understand_sentiment_conv.inference.model"
+    train(use_cuda, train_program, params_dirname)
+    infer(use_cuda, inference_program, params_dirname)
+
+
+if __name__ == '__main__':
+    for use_cuda in (False, True):
+        main(use_cuda=use_cuda)
diff --git a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_dynamic_rnn.py b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_dynamic_rnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..aa7c567b4d66ba07c26d54436fb305011cfeccf2
--- /dev/null
+++ b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_dynamic_rnn.py
@@ -0,0 +1,171 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import paddle
+import paddle.fluid as fluid
+from functools import partial
+import numpy as np
+
+CLASS_DIM = 2
+EMB_DIM = 128
+BATCH_SIZE = 128
+LSTM_SIZE = 128
+
+
+def dynamic_rnn_lstm(data, input_dim, class_dim, emb_dim, lstm_size):
+    emb = fluid.layers.embedding(
+        input=data, size=[input_dim, emb_dim], is_sparse=True)
+    sentence = fluid.layers.fc(input=emb, size=lstm_size, act='tanh')
+
+    rnn = fluid.layers.DynamicRNN()
+    with rnn.block():
+        word = rnn.step_input(sentence)
+        prev_hidden = rnn.memory(value=0.0, shape=[lstm_size])
+        prev_cell = rnn.memory(value=0.0, shape=[lstm_size])
+
+        def gate_common(ipt, hidden, size):
+            gate0 = fluid.layers.fc(input=ipt, size=size, bias_attr=True)
+            gate1 = fluid.layers.fc(input=hidden, size=size, bias_attr=False)
+            return gate0 + gate1
+
+        forget_gate = fluid.layers.sigmoid(x=gate_common(word, prev_hidden,
+                                                         lstm_size))
+        input_gate = fluid.layers.sigmoid(x=gate_common(word, prev_hidden,
+                                                        lstm_size))
+        output_gate = fluid.layers.sigmoid(x=gate_common(word, prev_hidden,
+                                                         lstm_size))
+        cell_gate = fluid.layers.sigmoid(x=gate_common(word, prev_hidden,
+                                                       lstm_size))
+
+        cell = forget_gate * prev_cell + input_gate * cell_gate
+        hidden = output_gate * fluid.layers.tanh(x=cell)
+        rnn.update_memory(prev_cell, cell)
+        rnn.update_memory(prev_hidden, hidden)
+        rnn.output(hidden)
+
+    last = fluid.layers.sequence_last_step(rnn())
+    prediction = fluid.layers.fc(input=last, size=class_dim, act="softmax")
+    return prediction
+
+
+def inference_program(word_dict):
+    data = fluid.layers.data(
+        name="words", shape=[1], dtype="int64", lod_level=1)
+
+    dict_dim = len(word_dict)
+    pred = dynamic_rnn_lstm(data, dict_dim, CLASS_DIM, EMB_DIM, LSTM_SIZE)
+    return pred
+
+
+def train_program(word_dict):
+    prediction = inference_program(word_dict)
+    label = fluid.layers.data(name="label", shape=[1], dtype="int64")
+    cost = fluid.layers.cross_entropy(input=prediction, label=label)
+    avg_cost = fluid.layers.mean(cost)
+    accuracy = fluid.layers.accuracy(input=prediction, label=label)
+    return [avg_cost, accuracy]
+
+
+def optimizer_func():
+    return fluid.optimizer.Adagrad(learning_rate=0.002)
+
+
+def train(use_cuda, train_program, params_dirname):
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+
+    word_dict = paddle.dataset.imdb.word_dict()
+    trainer = fluid.Trainer(
+        train_func=partial(train_program, word_dict),
+        place=place,
+        optimizer_func=optimizer_func)
+
+    def event_handler(event):
+        if isinstance(event, fluid.EndEpochEvent):
+            test_reader = paddle.batch(
+                paddle.dataset.imdb.test(word_dict), batch_size=BATCH_SIZE)
+            avg_cost, acc = trainer.test(
+                reader=test_reader, feed_order=['words', 'label'])
+
+            print("avg_cost: %s" % avg_cost)
+            print("acc     : %s" % acc)
+
+            if acc > 0.2:  # Smaller value to increase CI speed
+                trainer.save_params(params_dirname)
+                trainer.stop()
+
+            else:
+                print('BatchID {0}, Test Loss {1:0.2}, Acc {2:0.2}'.format(
+                    event.epoch + 1, avg_cost, acc))
+                if math.isnan(avg_cost):
+                    sys.exit("got NaN loss, training failed.")
+        elif isinstance(event, fluid.EndStepEvent):
+            print("Step {0}, Epoch {1} Metrics {2}".format(
+                event.step, event.epoch, map(np.array, event.metrics)))
+            if event.step == 1:  # Run 2 iterations to speed CI
+                trainer.save_params(params_dirname)
+                trainer.stop()
+
+    train_reader = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.imdb.train(word_dict), buf_size=25000),
+        batch_size=BATCH_SIZE)
+
+    trainer.train(
+        num_epochs=1,
+        event_handler=event_handler,
+        reader=train_reader,
+        feed_order=['words', 'label'])
+
+
+def infer(use_cuda, inference_program, params_dirname=None):
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    word_dict = paddle.dataset.imdb.word_dict()
+
+    inferencer = fluid.Inferencer(
+        infer_func=partial(inference_program, word_dict),
+        param_path=params_dirname,
+        place=place)
+
+    # Setup input by creating LoDTensor to represent sequence of words.
+    # Here each word is the basic element of the LoDTensor and the shape of 
+    # each word (base_shape) should be [1] since it is simply an index to 
+    # look up for the corresponding word vector.
+    # Suppose the length_based level of detail (lod) info is set to [[3, 4, 2]],
+    # which has only one lod level. Then the created LoDTensor will have only 
+    # one higher level structure (sequence of words, or sentence) than the basic 
+    # element (word). Hence the LoDTensor will hold data for three sentences of 
+    # length 3, 4 and 2, respectively. 
+    # Note that lod info should be a list of lists.
+    lod = [[3, 4, 2]]
+    base_shape = [1]
+    # The range of random integers is [low, high]
+    tensor_words = fluid.create_random_int_lodtensor(
+        lod, base_shape, place, low=0, high=len(word_dict) - 1)
+    results = inferencer.infer({'words': tensor_words})
+    print("infer results: ", results)
+
+
+def main(use_cuda):
+    if use_cuda and not fluid.core.is_compiled_with_cuda():
+        return
+    params_dirname = "understand_sentiment_conv.inference.model"
+    train(use_cuda, train_program, params_dirname)
+    infer(use_cuda, inference_program, params_dirname)
+
+
+if __name__ == '__main__':
+    for use_cuda in (False, True):
+        main(use_cuda=use_cuda)
diff --git a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py
new file mode 100644
index 0000000000000000000000000000000000000000..113dda88ca974c9e6241f127091bd96fb2af4a70
--- /dev/null
+++ b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py
@@ -0,0 +1,163 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import paddle
+import paddle.fluid as fluid
+from functools import partial
+import numpy as np
+
+CLASS_DIM = 2
+EMB_DIM = 128
+HID_DIM = 512
+STACKED_NUM = 3
+BATCH_SIZE = 128
+
+
+def stacked_lstm_net(data, input_dim, class_dim, emb_dim, hid_dim, stacked_num):
+    assert stacked_num % 2 == 1
+
+    emb = fluid.layers.embedding(
+        input=data, size=[input_dim, emb_dim], is_sparse=True)
+
+    fc1 = fluid.layers.fc(input=emb, size=hid_dim)
+    lstm1, cell1 = fluid.layers.dynamic_lstm(input=fc1, size=hid_dim)
+
+    inputs = [fc1, lstm1]
+
+    for i in range(2, stacked_num + 1):
+        fc = fluid.layers.fc(input=inputs, size=hid_dim)
+        lstm, cell = fluid.layers.dynamic_lstm(
+            input=fc, size=hid_dim, is_reverse=(i % 2) == 0)
+        inputs = [fc, lstm]
+
+    fc_last = fluid.layers.sequence_pool(input=inputs[0], pool_type='max')
+    lstm_last = fluid.layers.sequence_pool(input=inputs[1], pool_type='max')
+
+    prediction = fluid.layers.fc(input=[fc_last, lstm_last],
+                                 size=class_dim,
+                                 act='softmax')
+    return prediction
+
+
+def inference_program(word_dict):
+    data = fluid.layers.data(
+        name="words", shape=[1], dtype="int64", lod_level=1)
+
+    dict_dim = len(word_dict)
+    net = stacked_lstm_net(data, dict_dim, CLASS_DIM, EMB_DIM, HID_DIM,
+                           STACKED_NUM)
+    return net
+
+
+def train_program(word_dict):
+    prediction = inference_program(word_dict)
+    label = fluid.layers.data(name="label", shape=[1], dtype="int64")
+    cost = fluid.layers.cross_entropy(input=prediction, label=label)
+    avg_cost = fluid.layers.mean(cost)
+    accuracy = fluid.layers.accuracy(input=prediction, label=label)
+    return [avg_cost, accuracy]
+
+
+def optimizer_func():
+    return fluid.optimizer.Adagrad(learning_rate=0.002)
+
+
+def train(use_cuda, train_program, params_dirname):
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+
+    word_dict = paddle.dataset.imdb.word_dict()
+    trainer = fluid.Trainer(
+        train_func=partial(train_program, word_dict),
+        place=place,
+        optimizer_func=optimizer_func)
+
+    def event_handler(event):
+        if isinstance(event, fluid.EndEpochEvent):
+            test_reader = paddle.batch(
+                paddle.dataset.imdb.test(word_dict), batch_size=BATCH_SIZE)
+            avg_cost, acc = trainer.test(
+                reader=test_reader, feed_order=['words', 'label'])
+
+            print("avg_cost: %s" % avg_cost)
+            print("acc     : %s" % acc)
+
+            if acc > 0.2:  # Smaller value to increase CI speed
+                trainer.save_params(params_dirname)
+                trainer.stop()
+
+            else:
+                print('BatchID {0}, Test Loss {1:0.2}, Acc {2:0.2}'.format(
+                    event.epoch + 1, avg_cost, acc))
+                if math.isnan(avg_cost):
+                    sys.exit("got NaN loss, training failed.")
+        elif isinstance(event, fluid.EndStepEvent):
+            print("Step {0}, Epoch {1} Metrics {2}".format(
+                event.step, event.epoch, map(np.array, event.metrics)))
+            if event.step == 1:  # Run 2 iterations to speed CI
+                trainer.save_params(params_dirname)
+                trainer.stop()
+
+    train_reader = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.imdb.train(word_dict), buf_size=25000),
+        batch_size=BATCH_SIZE)
+
+    trainer.train(
+        num_epochs=1,
+        event_handler=event_handler,
+        reader=train_reader,
+        feed_order=['words', 'label'])
+
+
+def infer(use_cuda, inference_program, params_dirname=None):
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    word_dict = paddle.dataset.imdb.word_dict()
+
+    inferencer = fluid.Inferencer(
+        infer_func=partial(inference_program, word_dict),
+        param_path=params_dirname,
+        place=place)
+
+    # Setup input by creating LoDTensor to represent sequence of words.
+    # Here each word is the basic element of the LoDTensor and the shape of 
+    # each word (base_shape) should be [1] since it is simply an index to 
+    # look up for the corresponding word vector.
+    # Suppose the length_based level of detail (lod) info is set to [[3, 4, 2]],
+    # which has only one lod level. Then the created LoDTensor will have only 
+    # one higher level structure (sequence of words, or sentence) than the basic 
+    # element (word). Hence the LoDTensor will hold data for three sentences of 
+    # length 3, 4 and 2, respectively. 
+    # Note that lod info should be a list of lists.
+    lod = [[3, 4, 2]]
+    base_shape = [1]
+    # The range of random integers is [low, high]
+    tensor_words = fluid.create_random_int_lodtensor(
+        lod, base_shape, place, low=0, high=len(word_dict) - 1)
+    results = inferencer.infer({'words': tensor_words})
+    print("infer results: ", results)
+
+
+def main(use_cuda):
+    if use_cuda and not fluid.core.is_compiled_with_cuda():
+        return
+    params_dirname = "understand_sentiment_stacked_lstm.inference.model"
+    train(use_cuda, train_program, params_dirname)
+    infer(use_cuda, inference_program, params_dirname)
+
+
+if __name__ == '__main__':
+    for use_cuda in (False, True):
+        main(use_cuda=use_cuda)
diff --git a/python/paddle/fluid/tests/book/high-level-api/word2vec/CMakeLists.txt b/python/paddle/fluid/tests/book/high-level-api/word2vec/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..673c965b662a022739f8d489c331f4de9455a926
--- /dev/null
+++ b/python/paddle/fluid/tests/book/high-level-api/word2vec/CMakeLists.txt
@@ -0,0 +1,7 @@
+file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
+string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
+
+# default test
+foreach(src ${TEST_OPS})
+    py_test(${src} SRCS ${src}.py)
+endforeach()
diff --git a/python/paddle/fluid/tests/book/word2vec/no_test_word2vec_new_api.py b/python/paddle/fluid/tests/book/high-level-api/word2vec/test_word2vec_new_api.py
similarity index 54%
rename from python/paddle/fluid/tests/book/word2vec/no_test_word2vec_new_api.py
rename to python/paddle/fluid/tests/book/high-level-api/word2vec/test_word2vec_new_api.py
index 35e163dc9df5a35ee5774b6b157366c4eabcb0f7..ba44f72d9b03c3a44560a8a30cba2253256314ef 100644
--- a/python/paddle/fluid/tests/book/word2vec/no_test_word2vec_new_api.py
+++ b/python/paddle/fluid/tests/book/high-level-api/word2vec/test_word2vec_new_api.py
@@ -25,16 +25,6 @@ HIDDEN_SIZE = 256
 N = 5
 BATCH_SIZE = 32
 
-
-def create_random_lodtensor(lod, place, low, high):
-    # The range of data elements is [low, high]
-    data = np.random.random_integers(low, high, [lod[-1], 1]).astype("int64")
-    res = fluid.LoDTensor()
-    res.set(data, place)
-    res.set_lod([lod])
-    return res
-
-
 word_dict = paddle.dataset.imikolov.build_dict()
 dict_size = len(word_dict)
 
@@ -80,67 +70,102 @@ def inference_program(is_sparse):
 
 
 def train_program(is_sparse):
-    next_word = fluid.layers.data(name='nextw', shape=[1], dtype='int64')
+    # The declaration of 'next_word' must be after the invoking of inference_program,
+    # or the data input order of train program would be [next_word, firstw, secondw,
+    # thirdw, forthw], which is not correct.
     predict_word = inference_program(is_sparse)
+    next_word = fluid.layers.data(name='nextw', shape=[1], dtype='int64')
     cost = fluid.layers.cross_entropy(input=predict_word, label=next_word)
     avg_cost = fluid.layers.mean(cost)
     return avg_cost
 
 
-def train(use_cuda, is_sparse, save_path):
+def optimizer_func():
+    return fluid.optimizer.SGD(learning_rate=0.001)
+
+
+def train(use_cuda, train_program, params_dirname):
     train_reader = paddle.batch(
         paddle.dataset.imikolov.train(word_dict, N), BATCH_SIZE)
+    test_reader = paddle.batch(
+        paddle.dataset.imikolov.test(word_dict, N), BATCH_SIZE)
 
     place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
 
     def event_handler(event):
-        print type(event)
-        if isinstance(event, fluid.EndEpochEvent):
-            avg_cost = trainer.test(reader=paddle.dataset.imikolov.test(
-                word_dict, N))
-
-            if avg_cost < 5.0:
-                trainer.save_params(save_path)
-                return
+        if isinstance(event, fluid.EndStepEvent):
+            outs = trainer.test(
+                reader=test_reader,
+                feed_order=['firstw', 'secondw', 'thirdw', 'forthw', 'nextw'])
+            avg_cost = outs[0]
+            print("loss= ", avg_cost)
+
+            if avg_cost < 10.0:
+                trainer.save_params(params_dirname)
+                trainer.stop()
+
             if math.isnan(avg_cost):
                 sys.exit("got NaN loss, training failed.")
 
     trainer = fluid.Trainer(
-        partial(train_program, is_sparse),
-        fluid.optimizer.SGD(learning_rate=0.001),
-        place=place)
+        train_func=train_program, optimizer_func=optimizer_func, place=place)
+
     trainer.train(
-        reader=train_reader, num_epochs=100, event_handler=event_handler)
+        reader=train_reader,
+        num_epochs=1,
+        event_handler=event_handler,
+        feed_order=['firstw', 'secondw', 'thirdw', 'forthw', 'nextw'])
 
 
-def infer(use_cuda, is_sparse, save_path):
+def infer(use_cuda, inference_program, params_dirname=None):
     place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
     inferencer = fluid.Inferencer(
-        partial(inference_program, is_sparse),
-        param_path=save_path,
-        place=place)
-
-    lod = [0, 1]
-    first_word = create_random_lodtensor(lod, place, low=0, high=dict_size - 1)
-    second_word = create_random_lodtensor(lod, place, low=0, high=dict_size - 1)
-    third_word = create_random_lodtensor(lod, place, low=0, high=dict_size - 1)
-    fourth_word = create_random_lodtensor(lod, place, low=0, high=dict_size - 1)
-    result = inferencer.infer({
-        'firstw': first_word,
-        'secondw': second_word,
-        'thirdw': third_word,
-        'forthw': fourth_word
-    })
-    print(result)
+        infer_func=inference_program, param_path=params_dirname, place=place)
+
+    # Setup inputs by creating 4 LoDTensors representing 4 words. Here each word 
+    # is simply an index to look up for the corresponding word vector and hence 
+    # the shape of word (base_shape) should be [1]. The length-based level of 
+    # detail (lod) info of each LoDtensor should be [[1]] meaning there is only 
+    # one lod_level and there is only one sequence of one word on this level.
+    # Note that lod info should be a list of lists.
+    lod = [[1]]
+    base_shape = [1]
+    # The range of random integers is [low, high]
+    first_word = fluid.create_random_int_lodtensor(
+        lod, base_shape, place, low=0, high=dict_size - 1)
+    second_word = fluid.create_random_int_lodtensor(
+        lod, base_shape, place, low=0, high=dict_size - 1)
+    third_word = fluid.create_random_int_lodtensor(
+        lod, base_shape, place, low=0, high=dict_size - 1)
+    fourth_word = fluid.create_random_int_lodtensor(
+        lod, base_shape, place, low=0, high=dict_size - 1)
+
+    result = inferencer.infer(
+        {
+            'firstw': first_word,
+            'secondw': second_word,
+            'thirdw': third_word,
+            'forthw': fourth_word
+        },
+        return_numpy=False)
+    print(np.array(result[0]))
 
 
 def main(use_cuda, is_sparse):
     if use_cuda and not fluid.core.is_compiled_with_cuda():
         return
 
-    save_path = "word2vec.inference.model"
-    train(use_cuda, is_sparse, save_path)
-    infer(use_cuda, is_sparse, save_path)
+    params_dirname = "word2vec.inference.model"
+
+    train(
+        use_cuda=use_cuda,
+        train_program=partial(train_program, is_sparse),
+        params_dirname=params_dirname)
+
+    infer(
+        use_cuda=use_cuda,
+        inference_program=partial(inference_program, is_sparse),
+        params_dirname=params_dirname)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/book/test_understand_sentiment.py b/python/paddle/fluid/tests/book/notest_understand_sentiment.py
similarity index 93%
rename from python/paddle/fluid/tests/book/test_understand_sentiment.py
rename to python/paddle/fluid/tests/book/notest_understand_sentiment.py
index 241778e303036d068dc0a40e4574a02eb97ad134..c6687e8ad7fcc45c82d6dcb2256e9055a81cc61c 100644
--- a/python/paddle/fluid/tests/book/test_understand_sentiment.py
+++ b/python/paddle/fluid/tests/book/notest_understand_sentiment.py
@@ -125,14 +125,6 @@ def stacked_lstm_net(data,
     return avg_cost, accuracy, prediction
 
 
-def create_random_lodtensor(lod, place, low, high):
-    data = np.random.random_integers(low, high, [lod[-1], 1]).astype("int64")
-    res = fluid.LoDTensor()
-    res.set(data, place)
-    res.set_lod([lod])
-    return res
-
-
 def train(word_dict,
           net_method,
           use_cuda,
@@ -170,7 +162,7 @@ def train(word_dict,
         assert save_dirname is None
 
     adagrad = fluid.optimizer.Adagrad(learning_rate=0.002)
-    optimize_ops, params_grads = adagrad.minimize(cost)
+    adagrad.minimize(cost)
 
     train_data = paddle.batch(
         paddle.reader.shuffle(
@@ -242,9 +234,21 @@ def infer(word_dict, use_cuda, save_dirname=None):
 
         word_dict_len = len(word_dict)
 
-        lod = [0, 4, 10]
-        tensor_words = create_random_lodtensor(
-            lod, place, low=0, high=word_dict_len - 1)
+        # Setup input by creating LoDTensor to represent sequence of words.
+        # Here each word is the basic element of the LoDTensor and the shape of 
+        # each word (base_shape) should be [1] since it is simply an index to 
+        # look up for the corresponding word vector.
+        # Suppose the length_based level of detail (lod) info is set to [[3, 4, 2]],
+        # which has only one lod level. Then the created LoDTensor will have only 
+        # one higher level structure (sequence of words, or sentence) than the basic 
+        # element (word). Hence the LoDTensor will hold data for three sentences of 
+        # length 3, 4 and 2, respectively. 
+        # Note that lod info should be a list of lists.
+        lod = [[3, 4, 2]]
+        base_shape = [1]
+        # The range of random integers is [low, high]
+        tensor_words = fluid.create_random_int_lodtensor(
+            lod, base_shape, place, low=0, high=word_dict_len - 1)
 
         # Construct feed as a dictionary of {feed_target_name: feed_target_data}
         # and results will contain a list of data corresponding to fetch_targets.
diff --git a/python/paddle/fluid/tests/book/test_fit_a_line.py b/python/paddle/fluid/tests/book/test_fit_a_line.py
index ecb34699af0dc14782601702ab8afedbca7e1bfd..b1a6b524d33cae97c8982ffb8f780b1b07761a09 100644
--- a/python/paddle/fluid/tests/book/test_fit_a_line.py
+++ b/python/paddle/fluid/tests/book/test_fit_a_line.py
@@ -33,7 +33,7 @@ def train(use_cuda, save_dirname, is_local):
     avg_cost = fluid.layers.mean(cost)
 
     sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
-    optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost)
+    sgd_optimizer.minimize(avg_cost)
 
     BATCH_SIZE = 20
 
diff --git a/python/paddle/fluid/tests/book/test_image_classification.py b/python/paddle/fluid/tests/book/test_image_classification.py
index dbcdb5766e7d20efdb12da0ea4c6f005d903849b..0f3a4c9242a81a3c1fb90268245715a8e59a207a 100644
--- a/python/paddle/fluid/tests/book/test_image_classification.py
+++ b/python/paddle/fluid/tests/book/test_image_classification.py
@@ -125,7 +125,7 @@ def train(net_type, use_cuda, save_dirname, is_local):
     test_program = fluid.default_main_program().clone(for_test=True)
 
     optimizer = fluid.optimizer.Adam(learning_rate=0.001)
-    optimize_ops, params_grads = optimizer.minimize(avg_cost)
+    optimizer.minimize(avg_cost)
 
     BATCH_SIZE = 128
     PASS_NUM = 1
diff --git a/python/paddle/fluid/tests/book/test_label_semantic_roles.py b/python/paddle/fluid/tests/book/test_label_semantic_roles.py
index 50ef29c4572f1b12fe9793bbf037cd7fe71a9e53..bc8a1aafc82d62501cecfa71be0cc3851c75eae2 100644
--- a/python/paddle/fluid/tests/book/test_label_semantic_roles.py
+++ b/python/paddle/fluid/tests/book/test_label_semantic_roles.py
@@ -36,7 +36,7 @@ depth = 8
 mix_hidden_lr = 1e-3
 
 IS_SPARSE = True
-PASS_NUM = 100
+PASS_NUM = 10
 BATCH_SIZE = 10
 
 embedding_name = 'emb'
@@ -116,29 +116,6 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark,
     return feature_out
 
 
-def to_lodtensor(data, place):
-    seq_lens = [len(seq) for seq in data]
-    cur_len = 0
-    lod = [cur_len]
-    for l in seq_lens:
-        cur_len += l
-        lod.append(cur_len)
-    flattened_data = np.concatenate(data, axis=0).astype("int64")
-    flattened_data = flattened_data.reshape([len(flattened_data), 1])
-    res = fluid.LoDTensor()
-    res.set(flattened_data, place)
-    res.set_lod([lod])
-    return res
-
-
-def create_random_lodtensor(lod, place, low, high):
-    data = np.random.random_integers(low, high, [lod[-1], 1]).astype("int64")
-    res = fluid.LoDTensor()
-    res.set(data, place)
-    res.set_lod([lod])
-    return res
-
-
 def train(use_cuda, save_dirname=None, is_local=True):
     # define network topology
     word = fluid.layers.data(
@@ -175,19 +152,13 @@ def train(use_cuda, save_dirname=None, is_local=True):
             decay_steps=100000,
             decay_rate=0.5,
             staircase=True))
-    optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost)
+    sgd_optimizer.minimize(avg_cost)
 
     # TODO(qiao)
     # add dependency track and move this config before optimizer
     crf_decode = fluid.layers.crf_decoding(
         input=feature_out, param_attr=fluid.ParamAttr(name='crfw'))
 
-    chunk_evaluator = fluid.evaluator.ChunkEvaluator(
-        input=crf_decode,
-        label=target,
-        chunk_scheme="IOB",
-        num_chunk_types=int(math.ceil((label_dict_len - 1) / 2.0)))
-
     train_data = paddle.batch(
         paddle.reader.shuffle(
             paddle.dataset.conll05.test(), buf_size=8192),
@@ -203,7 +174,6 @@ def train(use_cuda, save_dirname=None, is_local=True):
 
     def train_loop(main_program):
         exe.run(fluid.default_startup_program())
-
         embedding_param = fluid.global_scope().find_var(
             embedding_name).get_tensor()
         embedding_param.set(
@@ -213,27 +183,19 @@ def train(use_cuda, save_dirname=None, is_local=True):
         start_time = time.time()
         batch_id = 0
         for pass_id in xrange(PASS_NUM):
-            chunk_evaluator.reset(exe)
             for data in train_data():
-                cost, precision, recall, f1_score = exe.run(
-                    main_program,
-                    feed=feeder.feed(data),
-                    fetch_list=[avg_cost] + chunk_evaluator.metrics)
-                pass_precision, pass_recall, pass_f1_score = chunk_evaluator.eval(
-                    exe)
+                cost = exe.run(main_program,
+                               feed=feeder.feed(data),
+                               fetch_list=[avg_cost])
+                cost = cost[0]
 
                 if batch_id % 10 == 0:
-                    print("avg_cost:" + str(cost) + " precision:" + str(
-                        precision) + " recall:" + str(recall) + " f1_score:" +
-                          str(f1_score) + " pass_precision:" + str(
-                              pass_precision) + " pass_recall:" + str(
-                                  pass_recall) + " pass_f1_score:" + str(
-                                      pass_f1_score))
+                    print("avg_cost:" + str(cost))
                     if batch_id != 0:
                         print("second per batch: " + str((time.time(
                         ) - start_time) / batch_id))
                     # Set the threshold low to speed up the CI test
-                    if float(pass_precision) > 0.01:
+                    if float(cost) < 60.0:
                         if save_dirname is not None:
                             # TODO(liuyiqun): Change the target to crf_decode
                             fluid.io.save_inference_model(save_dirname, [
@@ -286,23 +248,35 @@ def infer(use_cuda, save_dirname=None):
         [inference_program, feed_target_names,
          fetch_targets] = fluid.io.load_inference_model(save_dirname, exe)
 
-        lod = [0, 4, 10]
-        word = create_random_lodtensor(
-            lod, place, low=0, high=word_dict_len - 1)
-        pred = create_random_lodtensor(
-            lod, place, low=0, high=pred_dict_len - 1)
-        ctx_n2 = create_random_lodtensor(
-            lod, place, low=0, high=word_dict_len - 1)
-        ctx_n1 = create_random_lodtensor(
-            lod, place, low=0, high=word_dict_len - 1)
-        ctx_0 = create_random_lodtensor(
-            lod, place, low=0, high=word_dict_len - 1)
-        ctx_p1 = create_random_lodtensor(
-            lod, place, low=0, high=word_dict_len - 1)
-        ctx_p2 = create_random_lodtensor(
-            lod, place, low=0, high=word_dict_len - 1)
-        mark = create_random_lodtensor(
-            lod, place, low=0, high=mark_dict_len - 1)
+        # Setup inputs by creating LoDTensors to represent sequences of words.
+        # Here each word is the basic element of these LoDTensors and the shape of 
+        # each word (base_shape) should be [1] since it is simply an index to 
+        # look up for the corresponding word vector.
+        # Suppose the length_based level of detail (lod) info is set to [[3, 4, 2]],
+        # which has only one lod level. Then the created LoDTensors will have only 
+        # one higher level structure (sequence of words, or sentence) than the basic 
+        # element (word). Hence the LoDTensor will hold data for three sentences of 
+        # length 3, 4 and 2, respectively. 
+        # Note that lod info should be a list of lists.
+        lod = [[3, 4, 2]]
+        base_shape = [1]
+        # The range of random integers is [low, high]
+        word = fluid.create_random_int_lodtensor(
+            lod, base_shape, place, low=0, high=word_dict_len - 1)
+        pred = fluid.create_random_int_lodtensor(
+            lod, base_shape, place, low=0, high=pred_dict_len - 1)
+        ctx_n2 = fluid.create_random_int_lodtensor(
+            lod, base_shape, place, low=0, high=word_dict_len - 1)
+        ctx_n1 = fluid.create_random_int_lodtensor(
+            lod, base_shape, place, low=0, high=word_dict_len - 1)
+        ctx_0 = fluid.create_random_int_lodtensor(
+            lod, base_shape, place, low=0, high=word_dict_len - 1)
+        ctx_p1 = fluid.create_random_int_lodtensor(
+            lod, base_shape, place, low=0, high=word_dict_len - 1)
+        ctx_p2 = fluid.create_random_int_lodtensor(
+            lod, base_shape, place, low=0, high=word_dict_len - 1)
+        mark = fluid.create_random_int_lodtensor(
+            lod, base_shape, place, low=0, high=mark_dict_len - 1)
 
         # Construct feed as a dictionary of {feed_target_name: feed_target_data}
         # and results will contain a list of data corresponding to fetch_targets.
diff --git a/python/paddle/fluid/tests/book/test_machine_translation.py b/python/paddle/fluid/tests/book/test_machine_translation.py
index 46c6b9c29a265741a99655d5ac29244798f6fec2..23e5900f127a7a3253c551f8f7fbceba08382209 100644
--- a/python/paddle/fluid/tests/book/test_machine_translation.py
+++ b/python/paddle/fluid/tests/book/test_machine_translation.py
@@ -147,28 +147,6 @@ def decoder_decode(context, is_sparse):
     return translation_ids, translation_scores
 
 
-def set_init_lod(data, lod, place):
-    res = fluid.LoDTensor()
-    res.set(data, place)
-    res.set_lod(lod)
-    return res
-
-
-def to_lodtensor(data, place):
-    seq_lens = [len(seq) for seq in data]
-    cur_len = 0
-    lod = [cur_len]
-    for l in seq_lens:
-        cur_len += l
-        lod.append(cur_len)
-    flattened_data = np.concatenate(data, axis=0).astype("int64")
-    flattened_data = flattened_data.reshape([len(flattened_data), 1])
-    res = fluid.LoDTensor()
-    res.set(flattened_data, place)
-    res.set_lod([lod])
-    return res
-
-
 def train_main(use_cuda, is_sparse, is_local=True):
     if use_cuda and not fluid.core.is_compiled_with_cuda():
         return
@@ -185,30 +163,32 @@ def train_main(use_cuda, is_sparse, is_local=True):
         learning_rate=1e-4,
         regularization=fluid.regularizer.L2DecayRegularizer(
             regularization_coeff=0.1))
-    optimize_ops, params_grads = optimizer.minimize(avg_cost)
+    optimizer.minimize(avg_cost)
 
     train_data = paddle.batch(
         paddle.reader.shuffle(
             paddle.dataset.wmt14.train(dict_size), buf_size=1000),
         batch_size=batch_size)
 
+    feed_order = [
+        'src_word_id', 'target_language_word', 'target_language_next_word'
+    ]
+
     exe = Executor(place)
 
     def train_loop(main_program):
         exe.run(framework.default_startup_program())
 
+        feed_list = [
+            main_program.global_block().var(var_name) for var_name in feed_order
+        ]
+        feeder = fluid.DataFeeder(feed_list, place)
+
         batch_id = 0
         for pass_id in xrange(1):
             for data in train_data():
-                word_data = to_lodtensor(map(lambda x: x[0], data), place)
-                trg_word = to_lodtensor(map(lambda x: x[1], data), place)
-                trg_word_next = to_lodtensor(map(lambda x: x[2], data), place)
                 outs = exe.run(main_program,
-                               feed={
-                                   'src_word_id': word_data,
-                                   'target_language_word': trg_word,
-                                   'target_language_next_word': trg_word_next
-                               },
+                               feed=feeder.feed(data),
                                fetch_list=[avg_cost])
                 avg_cost_val = np.array(outs[0])
                 print('pass_id=' + str(pass_id) + ' batch=' + str(batch_id) +
@@ -258,26 +238,32 @@ def decode_main(use_cuda, is_sparse):
         [1. for _ in range(batch_size)], dtype='float32')
     init_ids_data = init_ids_data.reshape((batch_size, 1))
     init_scores_data = init_scores_data.reshape((batch_size, 1))
-    init_lod = [i for i in range(batch_size)] + [batch_size]
+    init_lod = [1] * batch_size
     init_lod = [init_lod, init_lod]
 
+    init_ids = fluid.create_lod_tensor(init_ids_data, init_lod, place)
+    init_scores = fluid.create_lod_tensor(init_scores_data, init_lod, place)
+
     train_data = paddle.batch(
         paddle.reader.shuffle(
             paddle.dataset.wmt14.train(dict_size), buf_size=1000),
         batch_size=batch_size)
-    for _, data in enumerate(train_data()):
-        init_ids = set_init_lod(init_ids_data, init_lod, place)
-        init_scores = set_init_lod(init_scores_data, init_lod, place)
 
-        src_word_data = to_lodtensor(map(lambda x: x[0], data), place)
+    feed_order = ['src_word_id']
+    feed_list = [
+        framework.default_main_program().global_block().var(var_name)
+        for var_name in feed_order
+    ]
+    feeder = fluid.DataFeeder(feed_list, place)
+
+    for data in train_data():
+        feed_dict = feeder.feed(map(lambda x: [x[0]], data))
+        feed_dict['init_ids'] = init_ids
+        feed_dict['init_scores'] = init_scores
 
         result_ids, result_scores = exe.run(
             framework.default_main_program(),
-            feed={
-                'src_word_id': src_word_data,
-                'init_ids': init_ids,
-                'init_scores': init_scores
-            },
+            feed=feed_dict,
             fetch_list=[translation_ids, translation_scores],
             return_numpy=False)
         print result_ids.lod()
diff --git a/python/paddle/fluid/tests/book/test_recognize_digits.py b/python/paddle/fluid/tests/book/test_recognize_digits.py
index c115aa4d7d6b514f9207543730e5e76cb0d2040c..578b1162fbd7e3a1b1c0cc934406818f2e07e019 100644
--- a/python/paddle/fluid/tests/book/test_recognize_digits.py
+++ b/python/paddle/fluid/tests/book/test_recognize_digits.py
@@ -95,7 +95,7 @@ def train(nn_type,
     test_program = fluid.default_main_program().clone(for_test=True)
 
     optimizer = fluid.optimizer.Adam(learning_rate=0.001)
-    optimize_ops, params_grads = optimizer.minimize(avg_loss)
+    optimizer.minimize(avg_loss)
 
     place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
 
diff --git a/python/paddle/fluid/tests/book/test_recommender_system.py b/python/paddle/fluid/tests/book/test_recommender_system.py
index d022dedbff805d597b68b5a47f7931f2dd946615..65d6552acc9b3d31a97a45290e4613a633fffa3c 100644
--- a/python/paddle/fluid/tests/book/test_recommender_system.py
+++ b/python/paddle/fluid/tests/book/test_recommender_system.py
@@ -160,7 +160,7 @@ def train(use_cuda, save_dirname, is_local=True):
     test_program = fluid.default_main_program().clone(for_test=True)
 
     sgd_optimizer = SGDOptimizer(learning_rate=0.2)
-    optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost)
+    sgd_optimizer.minimize(avg_cost)
 
     place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
 
@@ -173,63 +173,33 @@ def train(use_cuda, save_dirname, is_local=True):
     test_reader = paddle.batch(
         paddle.dataset.movielens.test(), batch_size=BATCH_SIZE)
 
-    feeding = {
-        'user_id': 0,
-        'gender_id': 1,
-        'age_id': 2,
-        'job_id': 3,
-        'movie_id': 4,
-        'category_id': 5,
-        'movie_title': 6,
-        'score': 7
-    }
-
-    def func_feed(feeding, data):
-        feed_tensors = {}
-        for (key, idx) in feeding.iteritems():
-            tensor = fluid.LoDTensor()
-            if key != "category_id" and key != "movie_title":
-                if key == "score":
-                    numpy_data = np.array(map(lambda x: x[idx], data)).astype(
-                        "float32")
-                else:
-                    numpy_data = np.array(map(lambda x: x[idx], data)).astype(
-                        "int64")
-            else:
-                numpy_data = map(lambda x: np.array(x[idx]).astype("int64"),
-                                 data)
-                lod_info = [len(item) for item in numpy_data]
-                offset = 0
-                lod = [offset]
-                for item in lod_info:
-                    offset += item
-                    lod.append(offset)
-                numpy_data = np.concatenate(numpy_data, axis=0)
-                tensor.set_lod([lod])
-
-            numpy_data = numpy_data.reshape([numpy_data.shape[0], 1])
-            tensor.set(numpy_data, place)
-            feed_tensors[key] = tensor
-        return feed_tensors
+    feed_order = [
+        'user_id', 'gender_id', 'age_id', 'job_id', 'movie_id', 'category_id',
+        'movie_title', 'score'
+    ]
 
     def train_loop(main_program):
         exe.run(framework.default_startup_program())
 
+        feed_list = [
+            main_program.global_block().var(var_name) for var_name in feed_order
+        ]
+        feeder = fluid.DataFeeder(feed_list, place)
+
         PASS_NUM = 100
         for pass_id in range(PASS_NUM):
             for batch_id, data in enumerate(train_reader()):
                 # train a mini-batch
                 outs = exe.run(program=main_program,
-                               feed=func_feed(feeding, data),
+                               feed=feeder.feed(data),
                                fetch_list=[avg_cost])
                 out = np.array(outs[0])
                 if (batch_id + 1) % 10 == 0:
                     avg_cost_set = []
                     for test_data in test_reader():
-                        avg_cost_np = exe.run(
-                            program=test_program,
-                            feed=func_feed(feeding, test_data),
-                            fetch_list=[avg_cost])
+                        avg_cost_np = exe.run(program=test_program,
+                                              feed=feeder.feed(test_data),
+                                              fetch_list=[avg_cost])
                         avg_cost_set.append(avg_cost_np[0])
                         break  # test only 1 segment for speeding up CI
 
@@ -279,23 +249,6 @@ def infer(use_cuda, save_dirname=None):
     place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
     exe = fluid.Executor(place)
 
-    def create_lod_tensor(data, lod=None):
-        tensor = fluid.LoDTensor()
-        if lod is None:
-            # Tensor, the shape is [batch_size, 1]
-            index = 0
-            lod_0 = [index]
-            for l in range(len(data)):
-                index += 1
-                lod_0.append(index)
-            lod = [lod_0]
-        tensor.set_lod(lod)
-
-        flattened_data = np.concatenate(data, axis=0).astype("int64")
-        flattened_data = flattened_data.reshape([len(flattened_data), 1])
-        tensor.set(flattened_data, place)
-        return tensor
-
     inference_scope = fluid.core.Scope()
     with fluid.scope_guard(inference_scope):
         # Use fluid.io.load_inference_model to obtain the inference program desc,
@@ -307,26 +260,33 @@ def infer(use_cuda, save_dirname=None):
 
         # Use the first data from paddle.dataset.movielens.test() as input
         assert feed_target_names[0] == "user_id"
-        user_id = create_lod_tensor([[1]])
+        # Use create_lod_tensor(data, lod, place) API to generate LoD Tensor
+        # where `data` is a list of sequences of index numbers, `lod` is 
+        # the level of detail (lod) info associated with `data`.
+        # For example, data = [[10, 2, 3], [2, 3]] means that it contains
+        # two sequences of indexes, of length 3 and 2, respectively.
+        # Correspondingly, lod = [[3, 2]] contains one level of detail info,
+        # indicating that `data` consists of two sequences of length 3 and 2. 
+        user_id = fluid.create_lod_tensor([[1]], [[1]], place)
 
         assert feed_target_names[1] == "gender_id"
-        gender_id = create_lod_tensor([[1]])
+        gender_id = fluid.create_lod_tensor([[1]], [[1]], place)
 
         assert feed_target_names[2] == "age_id"
-        age_id = create_lod_tensor([[0]])
+        age_id = fluid.create_lod_tensor([[0]], [[1]], place)
 
         assert feed_target_names[3] == "job_id"
-        job_id = create_lod_tensor([[10]])
+        job_id = fluid.create_lod_tensor([[10]], [[1]], place)
 
         assert feed_target_names[4] == "movie_id"
-        movie_id = create_lod_tensor([[783]])
+        movie_id = fluid.create_lod_tensor([[783]], [[1]], place)
 
         assert feed_target_names[5] == "category_id"
-        category_id = create_lod_tensor([[10], [8], [9]], [[0, 3]])
+        category_id = fluid.create_lod_tensor([[10, 8, 9]], [[3]], place)
 
         assert feed_target_names[6] == "movie_title"
-        movie_title = create_lod_tensor([[1069], [4140], [2923], [710], [988]],
-                                        [[0, 5]])
+        movie_title = fluid.create_lod_tensor([[1069, 4140, 2923, 710, 988]],
+                                              [[5]], place)
 
         # Construct feed as a dictionary of {feed_target_name: feed_target_data}
         # and results will contain a list of data corresponding to fetch_targets.
diff --git a/python/paddle/fluid/tests/book/notest_rnn_encoder_decoder.py b/python/paddle/fluid/tests/book/test_rnn_encoder_decoder.py
similarity index 87%
rename from python/paddle/fluid/tests/book/notest_rnn_encoder_decoder.py
rename to python/paddle/fluid/tests/book/test_rnn_encoder_decoder.py
index ce640dece8a5067bd10f410a2bb58874b7cc0908..7ada57def6bfedb113ea1a56f9677116b80488ea 100644
--- a/python/paddle/fluid/tests/book/notest_rnn_encoder_decoder.py
+++ b/python/paddle/fluid/tests/book/test_rnn_encoder_decoder.py
@@ -152,29 +152,6 @@ def seq_to_seq_net():
     return avg_cost, prediction
 
 
-def to_lodtensor(data, place):
-    seq_lens = [len(seq) for seq in data]
-    cur_len = 0
-    lod = [cur_len]
-    for l in seq_lens:
-        cur_len += l
-        lod.append(cur_len)
-    flattened_data = np.concatenate(data, axis=0).astype("int64")
-    flattened_data = flattened_data.reshape([len(flattened_data), 1])
-    res = core.LoDTensor()
-    res.set(flattened_data, place)
-    res.set_lod([lod])
-    return res
-
-
-def create_random_lodtensor(lod, place, low, high):
-    data = np.random.random_integers(low, high, [lod[-1], 1]).astype("int64")
-    res = fluid.LoDTensor()
-    res.set(data, place)
-    res.set_lod([lod])
-    return res
-
-
 def train(use_cuda, save_dirname=None):
     [avg_cost, prediction] = seq_to_seq_net()
 
@@ -188,22 +165,20 @@ def train(use_cuda, save_dirname=None):
 
     place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
     exe = Executor(place)
-
     exe.run(framework.default_startup_program())
 
+    feed_order = ['source_sequence', 'target_sequence', 'label_sequence']
+    feed_list = [
+        framework.default_main_program().global_block().var(var_name)
+        for var_name in feed_order
+    ]
+    feeder = fluid.DataFeeder(feed_list, place)
+
     batch_id = 0
     for pass_id in xrange(2):
         for data in train_data():
-            word_data = to_lodtensor(map(lambda x: x[0], data), place)
-            trg_word = to_lodtensor(map(lambda x: x[1], data), place)
-            trg_word_next = to_lodtensor(map(lambda x: x[2], data), place)
-
             outs = exe.run(framework.default_main_program(),
-                           feed={
-                               'source_sequence': word_data,
-                               'target_sequence': trg_word,
-                               'label_sequence': trg_word_next
-                           },
+                           feed=feeder.feed(data),
                            fetch_list=[avg_cost])
 
             avg_cost_val = np.array(outs[0])
@@ -237,9 +212,23 @@ def infer(use_cuda, save_dirname=None):
         [inference_program, feed_target_names,
          fetch_targets] = fluid.io.load_inference_model(save_dirname, exe)
 
-        lod = [0, 4, 10]
-        word_data = create_random_lodtensor(lod, place, low=0, high=1)
-        trg_word = create_random_lodtensor(lod, place, low=0, high=1)
+        # Setup input by creating LoDTensor to represent sequence of words.
+        # Here each word is the basic element of the LoDTensor and the shape of 
+        # each word (base_shape) should be [1] since it is simply an index to 
+        # look up for the corresponding word vector.
+        # Suppose the length_based level of detail (lod) info is set to [[4, 6]],
+        # which has only one lod level. Then the created LoDTensor will have only 
+        # one higher level structure (sequence of words, or sentence) than the basic 
+        # element (word). Hence the LoDTensor will hold data for two sentences of 
+        # length 4 and 6, respectively. 
+        # Note that lod info should be a list of lists.
+        lod = [[4, 6]]
+        base_shape = [1]
+        # The range of random integers is [low, high]
+        word_data = fluid.create_random_int_lodtensor(
+            lod, base_shape, place, low=0, high=1)
+        trg_word = fluid.create_random_int_lodtensor(
+            lod, base_shape, place, low=0, high=1)
 
         # Construct feed as a dictionary of {feed_target_name: feed_target_data}
         # and results will contain a list of data corresponding to fetch_targets.
diff --git a/python/paddle/fluid/tests/book/test_word2vec.py b/python/paddle/fluid/tests/book/test_word2vec.py
index 6dec0f6857e86b4b9c1c67af934aa9bfdb1c3df7..3118d88701e5f64ae50f7ee774ea8174aa7758eb 100644
--- a/python/paddle/fluid/tests/book/test_word2vec.py
+++ b/python/paddle/fluid/tests/book/test_word2vec.py
@@ -21,15 +21,6 @@ import math
 import sys
 
 
-def create_random_lodtensor(lod, place, low, high):
-    # The range of data elements is [low, high]
-    data = np.random.random_integers(low, high, [lod[-1], 1]).astype("int64")
-    res = fluid.LoDTensor()
-    res.set(data, place)
-    res.set_lod([lod])
-    return res
-
-
 def train(use_cuda, is_sparse, is_parallel, save_dirname, is_local=True):
     PASS_NUM = 100
     EMBED_SIZE = 32
@@ -101,7 +92,7 @@ def train(use_cuda, is_sparse, is_parallel, save_dirname, is_local=True):
         avg_cost = fluid.layers.mean(pd())
 
     sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
-    optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost)
+    sgd_optimizer.minimize(avg_cost)
 
     train_reader = paddle.batch(
         paddle.dataset.imikolov.train(word_dict, N), BATCH_SIZE)
@@ -175,16 +166,23 @@ def infer(use_cuda, save_dirname=None):
         word_dict = paddle.dataset.imikolov.build_dict()
         dict_size = len(word_dict)
 
-        # Setup inputs, by creating 4 words, the lod of which should be [0, 1]
-        lod = [0, 1]
-        first_word = create_random_lodtensor(
-            lod, place, low=0, high=dict_size - 1)
-        second_word = create_random_lodtensor(
-            lod, place, low=0, high=dict_size - 1)
-        third_word = create_random_lodtensor(
-            lod, place, low=0, high=dict_size - 1)
-        fourth_word = create_random_lodtensor(
-            lod, place, low=0, high=dict_size - 1)
+        # Setup inputs by creating 4 LoDTensors representing 4 words. Here each word 
+        # is simply an index to look up for the corresponding word vector and hence 
+        # the shape of word (base_shape) should be [1]. The length-based level of 
+        # detail (lod) info of each LoDtensor should be [[1]] meaning there is only 
+        # one lod_level and there is only one sequence of one word on this level.
+        # Note that lod info should be a list of lists.
+        lod = [[1]]
+        base_shape = [1]
+        # The range of random integers is [low, high]
+        first_word = fluid.create_random_int_lodtensor(
+            lod, base_shape, place, low=0, high=dict_size - 1)
+        second_word = fluid.create_random_int_lodtensor(
+            lod, base_shape, place, low=0, high=dict_size - 1)
+        third_word = fluid.create_random_int_lodtensor(
+            lod, base_shape, place, low=0, high=dict_size - 1)
+        fourth_word = fluid.create_random_int_lodtensor(
+            lod, base_shape, place, low=0, high=dict_size - 1)
 
         assert feed_target_names[0] == 'firstw'
         assert feed_target_names[1] == 'secondw'
diff --git a/python/paddle/fluid/tests/book/understand_sentiment/notest_understand_sentiment_stacked_lstm.py b/python/paddle/fluid/tests/book/understand_sentiment/notest_understand_sentiment_stacked_lstm.py
deleted file mode 100644
index 9948e5c0234ed78237c94f9a25d6401619267d0d..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/book/understand_sentiment/notest_understand_sentiment_stacked_lstm.py
+++ /dev/null
@@ -1,140 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import paddle
-import paddle.fluid as fluid
-from functools import partial
-
-CLASS_DIM = 2
-EMB_DIM = 128
-HID_DIM = 512
-STACKED_NUM = 3
-
-
-def stacked_lstm_net(data, input_dim, class_dim, emb_dim, hid_dim, stacked_num):
-    assert stacked_num % 2 == 1
-
-    emb = fluid.layers.embedding(
-        input=data, size=[input_dim, emb_dim], is_sparse=True)
-
-    fc1 = fluid.layers.fc(input=emb, size=hid_dim)
-    lstm1, cell1 = fluid.layers.dynamic_lstm(input=fc1, size=hid_dim)
-
-    inputs = [fc1, lstm1]
-
-    for i in range(2, stacked_num + 1):
-        fc = fluid.layers.fc(input=inputs, size=hid_dim)
-        lstm, cell = fluid.layers.dynamic_lstm(
-            input=fc, size=hid_dim, is_reverse=(i % 2) == 0)
-        inputs = [fc, lstm]
-
-    fc_last = fluid.layers.sequence_pool(input=inputs[0], pool_type='max')
-    lstm_last = fluid.layers.sequence_pool(input=inputs[1], pool_type='max')
-
-    prediction = fluid.layers.fc(input=[fc_last, lstm_last],
-                                 size=class_dim,
-                                 act='softmax')
-    return prediction
-
-
-def inference_network(word_dict):
-    data = fluid.layers.data(
-        name="words", shape=[1], dtype="int64", lod_level=1)
-
-    dict_dim = len(word_dict)
-    net = stacked_lstm_net(data, dict_dim, CLASS_DIM, EMB_DIM, HID_DIM,
-                           STACKED_NUM)
-    return net
-
-
-def train_network(word_dict):
-    prediction = inference_network(word_dict)
-    label = fluid.layers.data(name="label", shape=[1], dtype="int64")
-    cost = fluid.layers.cross_entropy(input=prediction, label=label)
-    avg_cost = fluid.layers.mean(cost)
-    accuracy = fluid.layers.accuracy(input=prediction, label=label)
-    return avg_cost, accuracy
-
-
-def train(use_cuda, save_path):
-    BATCH_SIZE = 128
-    EPOCH_NUM = 5
-
-    word_dict = paddle.dataset.imdb.word_dict()
-
-    train_data = paddle.batch(
-        paddle.reader.shuffle(
-            paddle.dataset.imdb.train(word_dict), buf_size=1000),
-        batch_size=BATCH_SIZE)
-
-    test_data = paddle.batch(
-        paddle.dataset.imdb.test(word_dict), batch_size=BATCH_SIZE)
-
-    def event_handler(event):
-        if isinstance(event, fluid.EndIteration):
-            if (event.batch_id % 10) == 0:
-                avg_cost, accuracy = trainer.test(reader=test_data)
-
-                print('BatchID {1:04}, Loss {2:2.2}, Acc {3:2.2}'.format(
-                    event.batch_id + 1, avg_cost, accuracy))
-
-                if accuracy > 0.01:  # Low threshold for speeding up CI
-                    trainer.params.save(save_path)
-                    return
-
-    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-    trainer = fluid.Trainer(
-        partial(train_network, word_dict),
-        optimizer=fluid.optimizer.Adagrad(learning_rate=0.002),
-        place=place,
-        event_handler=event_handler)
-
-    trainer.train(train_data, EPOCH_NUM, event_handler=event_handler)
-
-
-def infer(use_cuda, save_path):
-    params = fluid.Params(save_path)
-    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-    word_dict = paddle.dataset.imdb.word_dict()
-    inferencer = fluid.Inferencer(
-        partial(inference_network, word_dict), params, place=place)
-
-    def create_random_lodtensor(lod, place, low, high):
-        data = np.random.random_integers(low, high,
-                                         [lod[-1], 1]).astype("int64")
-        res = fluid.LoDTensor()
-        res.set(data, place)
-        res.set_lod([lod])
-        return res
-
-    lod = [0, 4, 10]
-    tensor_words = create_random_lodtensor(
-        lod, place, low=0, high=len(word_dict) - 1)
-    results = inferencer.infer({'words': tensor_words})
-    print("infer results: ", results)
-
-
-def main(use_cuda):
-    if use_cuda and not fluid.core.is_compiled_with_cuda():
-        return
-    save_path = "understand_sentiment_stacked_lstm.inference.model"
-    train(use_cuda, save_path)
-    infer(use_cuda, save_path)
-
-
-if __name__ == '__main__':
-    for use_cuda in (False, True):
-        main(use_cuda=use_cuda)
diff --git a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py
index a1ca6d981fafb401985d03e9f2d63d1cb41b21b5..fa696acdfa9058af14f0bd34ce1a2980db5aeafc 100644
--- a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py
+++ b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py
@@ -80,21 +80,6 @@ def encoder_decoder():
     return rnn()
 
 
-def to_lodtensor(data, place):
-    seq_lens = [len(seq) for seq in data]
-    cur_len = 0
-    lod = [cur_len]
-    for l in seq_lens:
-        cur_len += l
-        lod.append(cur_len)
-    flattened_data = np.concatenate(data, axis=0).astype("int64")
-    flattened_data = flattened_data.reshape([len(flattened_data), 1])
-    res = core.LoDTensor()
-    res.set(flattened_data, place)
-    res.set_lod([lod])
-    return res
-
-
 def main():
     rnn_out = encoder_decoder()
     label = layers.data(
@@ -122,18 +107,21 @@ def main():
 
     exe.run(framework.default_startup_program())
 
+    feed_order = [
+        'src_word_id', 'target_language_word', 'target_language_next_word'
+    ]
+
+    feed_list = [
+        fluid.default_main_program().global_block().var(var_name)
+        for var_name in feed_order
+    ]
+    feeder = fluid.DataFeeder(feed_list, place)
+
     batch_id = 0
     for pass_id in xrange(10):
         for data in train_data():
-            word_data = to_lodtensor(map(lambda x: x[0], data), place)
-            trg_word = to_lodtensor(map(lambda x: x[1], data), place)
-            trg_word_next = to_lodtensor(map(lambda x: x[2], data), place)
             outs = exe.run(fluid.default_main_program(),
-                           feed={
-                               'src_word_id': word_data,
-                               'target_language_word': trg_word,
-                               'target_language_next_word': trg_word_next
-                           },
+                           feed=feeder.feed(data),
                            fetch_list=[avg_cost])
             avg_cost_val = np.array(outs[0])
             print('pass_id=' + str(pass_id) + ' batch=' + str(batch_id) +
diff --git a/python/paddle/fluid/tests/test_concurrency.py b/python/paddle/fluid/tests/no_test_concurrency.py
similarity index 100%
rename from python/paddle/fluid/tests/test_concurrency.py
rename to python/paddle/fluid/tests/no_test_concurrency.py
diff --git a/python/paddle/fluid/tests/test_cpp_reader.py b/python/paddle/fluid/tests/test_cpp_reader.py
index e54c73b2956dd99ee57804318130c261e133d21a..6cc291dfcffdd7083f498389834e37bd06ca4572 100644
--- a/python/paddle/fluid/tests/test_cpp_reader.py
+++ b/python/paddle/fluid/tests/test_cpp_reader.py
@@ -44,8 +44,8 @@ create_random_data_generator_op = startup_block.append_op(
     attrs={
         "shape_concat": [1, 2, 1, 1],
         "ranks": [2, 2],
-        "min": 0.0,
-        "max": 1.0,
+        "low": 0.0,
+        "high": 1.0,
         'lod_levels': [0, 0]
     })
 
diff --git a/python/paddle/fluid/tests/test_data_feeder.py b/python/paddle/fluid/tests/test_data_feeder.py
index 861dd3174a21d59fe12e0b794ecb2a934946ac71..ce3ba3ebc50d7b015f379b5e80b179463a7b231a 100644
--- a/python/paddle/fluid/tests/test_data_feeder.py
+++ b/python/paddle/fluid/tests/test_data_feeder.py
@@ -13,15 +13,62 @@
 # limitations under the License.
 
 import paddle.fluid as fluid
+import unittest
 
 
-def test_converter():
-    img = fluid.layers.data(name='image', shape=[1, 28, 28])
-    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-    feeder = fluid.DataFeeder([img, label], fluid.CPUPlace())
-    result = feeder.feed([[[0] * 784, [9]], [[1] * 784, [1]]])
-    print(result)
+class TestDataFeeder(unittest.TestCase):
+    def test_lod_level_0_converter(self):
+        img = fluid.layers.data(name='image', shape=[1, 28, 28])
+        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+        feeder = fluid.DataFeeder([img, label], fluid.CPUPlace())
+        result = feeder.feed([([0] * 784, [9]), ([1] * 784, [1])])
+        print(result)
+
+        self.assertEqual(result['image'].shape(), [2, 1, 28, 28])
+        self.assertEqual(result['label'].shape(), [2, 1])
+        self.assertEqual(result['image'].lod(), [])
+        self.assertEqual(result['label'].lod(), [])
+
+    def test_lod_level_1_converter(self):
+        # lod_level = 1
+        # each sentence has a different number of words
+        sentences = fluid.layers.data(
+            name='sentences', shape=[1], dtype='int64', lod_level=1)
+        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+        feeder = fluid.DataFeeder([sentences, label], fluid.CPUPlace())
+
+        # lod = [[0, 3, 5, 9]]
+        # data = [[1, 2, 3], [4, 5], [6, 7, 8, 9]]
+        # label = [1] * len(data)
+        result = feeder.feed(
+            [([1, 2, 3], [1]), ([4, 5], [1]), ([6, 7, 8, 9], [1])])
+        print(result)
+
+        self.assertEqual(result['sentences'].shape(), [9, 1])
+        self.assertEqual(result['label'].shape(), [3, 1])
+        self.assertEqual(result['sentences'].lod(), [[0, 3, 5, 9]])
+        self.assertEqual(result['label'].lod(), [])
+
+    def test_lod_level_2_converter(self):
+        # lod_level = 2
+        # paragraphs -> sentences -> words
+        paragraphs = fluid.layers.data(
+            name='paragraphs', shape=[1], dtype='int64', lod_level=2)
+        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+        feeder = fluid.DataFeeder([paragraphs, label], fluid.CPUPlace())
+
+        # lod = [[0, 2, 3], [0, 3, 5, 9]]
+        # data = [[[1, 2, 3], [4, 5]], [[6, 7, 8, 9]]]
+        # label = [1] * len(data)
+        result = feeder.feed(
+            [([[1, 2, 3], [4, 5]], [1]), ([[6, 7, 8, 9]], [1])])
+        print(result)
+
+        self.assertEqual(result['paragraphs'].shape(), [9, 1])
+        self.assertEqual(result['label'].shape(), [2, 1])
+        self.assertEqual(result['paragraphs'].lod(), [[0, 2, 3], [0, 3, 5, 9]])
+        self.assertEqual(result['label'].lod(), [])
 
 
 if __name__ == '__main__':
-    test_converter()
+    unittest.main()
diff --git a/python/paddle/fluid/tests/test_detection.py b/python/paddle/fluid/tests/test_detection.py
index 921260ef3f4b1f9e4c65b3ffb440dc34cb0a9376..8569d838bdd414eb84c6c87674990a25a2fdcdf9 100644
--- a/python/paddle/fluid/tests/test_detection.py
+++ b/python/paddle/fluid/tests/test_detection.py
@@ -109,6 +109,24 @@ class TestDetection(unittest.TestCase):
         print(str(program))
 
 
+class TestPriorBox(unittest.TestCase):
+    def test_prior_box(self):
+        data_shape = [3, 224, 224]
+        images = fluid.layers.data(
+            name='pixel', shape=data_shape, dtype='float32')
+        conv1 = fluid.layers.conv2d(images, 3, 3, 2)
+        box, var = layers.prior_box(
+            input=conv1,
+            image=images,
+            min_sizes=[100.0],
+            aspect_ratios=[1.],
+            flip=True,
+            clip=True)
+        assert len(box.shape) == 4
+        assert box.shape == var.shape
+        assert box.shape[3] == 4
+
+
 class TestMultiBoxHead(unittest.TestCase):
     def test_multi_box_head(self):
         data_shape = [3, 224, 224]
diff --git a/python/paddle/fluid/tests/test_lod_tensor.py b/python/paddle/fluid/tests/test_lod_tensor.py
new file mode 100644
index 0000000000000000000000000000000000000000..013d72f418cf7ac11eb31fd221052039e896e203
--- /dev/null
+++ b/python/paddle/fluid/tests/test_lod_tensor.py
@@ -0,0 +1,91 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.fluid as fluid
+from paddle.fluid.lod_tensor import create_lod_tensor, create_random_int_lodtensor, _validate_lod, _convert_lod
+import numpy
+import unittest
+
+
+class TestLoDTensor(unittest.TestCase):
+    def test_validate_lod(self):
+        lod = (1, 2, 1)
+        self.assertRaises(AssertionError, _validate_lod, lod, -1)
+        lod = [[1, 2], (2, 3)]
+        self.assertRaises(AssertionError, _validate_lod, lod, -1)
+        lod = [1, 2, 3]
+        self.assertRaises(AssertionError, _validate_lod, lod, -1)
+
+        lod = []
+        self.assertTrue(_validate_lod(lod, -1))
+        lod = [[], [1], [3]]
+        self.assertFalse(_validate_lod(lod, -1))
+        lod = [[0], [-1], [3]]
+        self.assertFalse(_validate_lod(lod, -1))
+
+        # Each level's sum should be equal to the number of items in the next level
+        # Moreover, last level's sum should be equal to the tensor height
+        lod = [[2, 3], [1, 3, 1, 2, 1]]
+        self.assertTrue(_validate_lod(lod, tensor_height=8))
+        lod = [[1, 3], [2, 1, 3]]
+        self.assertFalse(_validate_lod(lod, tensor_height=6))
+        lod = [[1, 3], [2, 1, 3, 4]]
+        self.assertFalse(_validate_lod(lod, tensor_height=5))
+
+    def test_convert_lod(self):
+        lod = [[1, 2, 3]]
+        converted_lod = [[0, 1, 3, 6]]
+        self.assertEqual(_convert_lod(lod), converted_lod)
+
+        lod = [[2, 3], [1, 3, 1, 2, 1]]
+        converted_lod = [[0, 2, 5], [0, 1, 4, 5, 7, 8]]
+        self.assertEqual(_convert_lod(lod), converted_lod)
+
+    def test_create_lod_tensor(self):
+        # Create LoDTensor from a list
+        data = [[1, 2, 3], [3, 4]]
+        wrong_lod = [[2, 2]]
+        correct_lod = [[3, 2]]
+        self.assertRaises(AssertionError, create_lod_tensor, data, wrong_lod,
+                          fluid.CPUPlace())
+        tensor = create_lod_tensor(data, correct_lod, fluid.CPUPlace())
+        self.assertEqual(tensor.lod(), [[0, 3, 5]])
+
+        # Create LoDTensor from numpy array
+        data = numpy.random.random([10, 1])
+        lod = [[2, 1], [3, 3, 4]]
+        tensor = create_lod_tensor(data, lod, fluid.CPUPlace())
+        self.assertEqual(tensor.lod(), [[0, 2, 3], [0, 3, 6, 10]])
+
+        # Create LoDTensor from another LoDTensor, they are differnt instances
+        new_lod = [[2, 2, 1], [1, 2, 2, 3, 2]]
+        new_tensor = create_lod_tensor(tensor, new_lod, fluid.CPUPlace())
+        self.assertEqual(tensor.lod(), [[0, 2, 3], [0, 3, 6, 10]])
+        self.assertEqual(new_tensor.lod(), [[0, 2, 4, 5], [0, 1, 3, 5, 8, 10]])
+
+    def test_create_random_int_lodtensor(self):
+        # The shape of a word, commonly used in speech and NLP problem, is [1]
+        shape = [1]
+        lod = [[2, 3, 5]]
+        dict_size = 10000
+        low = 0
+        high = dict_size - 1
+        tensor = create_random_int_lodtensor(lod, shape,
+                                             fluid.CPUPlace(), low, high)
+        self.assertEqual(tensor.lod(), [[0, 2, 5, 10]])
+        self.assertEqual(tensor.shape(), [10, 1])
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index d9190408e151283ece8460286dd67818dd39da3e..673bd728718ca233b426fe2aaae307413d875174 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -17,7 +17,7 @@ endif(NOT WITH_DISTRIBUTE)
 list(REMOVE_ITEM TEST_OPS test_seq_concat_op) # FIXME(helin): https://github.com/PaddlePaddle/Paddle/issues/8290
 list(REMOVE_ITEM TEST_OPS test_modified_huber_loss_op) # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/5184 
 list(REMOVE_ITEM TEST_OPS test_lstm_unit_op) # # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/5185
-list(REMOVE_ITEM TEST_OPS test_nce) # IXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/7778
+list(REMOVE_ITEM TEST_OPS test_nce) # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/7778
 list(REMOVE_ITEM TEST_OPS test_recurrent_op) # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/6152
 list(REMOVE_ITEM TEST_OPS test_cond_op) # FIXME(qijun): https://github.com/PaddlePaddle/Paddle/issues/5101#issuecomment-339814957
 
@@ -26,82 +26,27 @@ list(REMOVE_ITEM TEST_OPS decorators) # decorators is a helper python file, not
 
 function(py_test_modules TARGET_NAME)
   if(WITH_TESTING)
-    set(options "")
+    set(options SERIAL)
     set(oneValueArgs "")
-    set(multiValueArgs MODULES DEPS ARGS ENVS)
+    set(multiValueArgs MODULES DEPS ENVS)
     cmake_parse_arguments(py_test_modules "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
     add_test(NAME ${TARGET_NAME}
              COMMAND env PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_modules_ENVS}
-             ${PYTHON_EXECUTABLE} -u -m unittest --verbose ${py_test_modules_MODULES} ${py_test_modules_ARGS}
+             ${PYTHON_EXECUTABLE} ${PADDLE_SOURCE_DIR}/tools/test_runner.py ${py_test_modules_MODULES}
              WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+    if (py_test_modules_SERIAL)
+        set_property(TEST ${TARGET_NAME} PROPERTY SERIAL 1)
+    endif()
   endif()
 endfunction()
-
-list(REMOVE_ITEM TEST_OPS test_sequence_expand)
-
-# test time consuming OPs in a separate process for expliot parallism
-list(REMOVE_ITEM TEST_OPS test_parallel_executor)
 list(REMOVE_ITEM TEST_OPS test_warpctc_op)
-list(REMOVE_ITEM TEST_OPS test_dyn_rnn)
-list(REMOVE_ITEM TEST_OPS test_mul_op)
-
-# tests that need to be run in separate process.
-list(REMOVE_ITEM TEST_OPS test_multihead_attention)
-list(REMOVE_ITEM TEST_OPS test_calc_gradient)
-list(REMOVE_ITEM TEST_OPS test_while_op)
-list(REMOVE_ITEM TEST_OPS test_lod_array_length_op)
-list(REMOVE_ITEM TEST_OPS test_reorder_lod_tensor)
-list(REMOVE_ITEM TEST_OPS test_profiler)
-list(REMOVE_ITEM TEST_OPS test_nvprof)
-list(REMOVE_ITEM TEST_OPS test_normalization_wrapper)
-list(REMOVE_ITEM TEST_OPS test_executor_and_mul)
-list(REMOVE_ITEM TEST_OPS test_assign_value_op)
-list(REMOVE_ITEM TEST_OPS test_array_read_write_op)
-list(REMOVE_ITEM TEST_OPS test_lod_rank_table)
-list(REMOVE_ITEM TEST_OPS test_weight_normalization)
-list(REMOVE_ITEM TEST_OPS test_conditional_block)
-list(REMOVE_ITEM TEST_OPS test_parameter)
-list(REMOVE_ITEM TEST_OPS test_registry)
-list(REMOVE_ITEM TEST_OPS test_fetch_var)
-list(REMOVE_ITEM TEST_OPS test_parallel_op)
-list(REMOVE_ITEM TEST_OPS test_dynrnn_static_input)
 list(REMOVE_ITEM TEST_OPS test_dist_train)
-
-# tests that can be bundled together in one python process for speed.
-if(WITH_FAST_BUNDLE_TEST)
-    py_test_modules("test_all_ops" MODULES ${TEST_OPS})
-else()
-    foreach(TEST_OP ${TEST_OPS})
-        py_test_modules(${TEST_OP} MODULES ${TEST_OP})
-    endforeach(TEST_OP)
-endif(WITH_FAST_BUNDLE_TEST)
-
-#
-py_test_modules(test_sequence_expand MODULES test_sequence_expand)
-# tests with high overhead
-py_test_modules(test_parallel_executor MODULES test_parallel_executor)
-py_test_modules(test_warpctc_op MODULES test_warpctc_op ENVS FLAGS_warpctc_dir=${WARPCTC_LIB_DIR})
-py_test_modules(test_train_dyn_rnn MODULES test_dyn_rnn)
-py_test_modules(test_mul_op MODULES test_mul_op)
-
-# tests that need to be run in separate process.
-py_test_modules(test_multihead_attention MODULES test_multihead_attention)
-py_test_modules(test_calc_gradient MODULES test_calc_gradient)
-py_test_modules(test_while_op MODULES test_while_op)
-py_test_modules(test_lod_array_length_op MODULES test_lod_array_length_op)
-py_test_modules(test_reorder_lod_tensor MODULES test_reorder_lod_tensor)
-py_test_modules(test_profiler MODULES test_profiler)
-py_test_modules(test_nvprof MODULES test_nvprof)
-py_test_modules(test_normalization_wrapper MODULES test_normalization_wrapper)
-py_test_modules(test_executor_and_mul MODULES test_executor_and_mul)
-py_test_modules(test_assign_value_op MODULES test_assign_value_op)
-py_test_modules(test_array_read_write_op MODULES test_array_read_write_op)
-py_test_modules(test_lod_rank_table MODULES test_lod_rank_table)
-py_test_modules(test_weight_normalization MODULES test_weight_normalization)
-py_test_modules(test_conditional_block MODULES test_conditional_block)
-py_test_modules(test_parameter MODULES test_parameter)
-py_test_modules(test_registry MODULES test_registry)
-py_test_modules(test_fetch_var MODULES test_fetch_var)
-py_test_modules(test_dynrnn_static_input MODULES test_dynrnn_static_input)
-py_test_modules(test_parallel_op MODULES test_parallel_op)
-py_test_modules(test_dist_train MODULES test_dist_train)
+list(REMOVE_ITEM TEST_OPS test_parallel_executor_crf)
+list(REMOVE_ITEM TEST_OPS test_parallel_executor_fetch_feed)
+# TODO(wuyi): this test hungs on CI, will add it back later
+list(REMOVE_ITEM TEST_OPS test_listen_and_serv_op)
+foreach(TEST_OP ${TEST_OPS})
+    py_test_modules(${TEST_OP} MODULES ${TEST_OP})
+endforeach(TEST_OP)
+py_test_modules(test_warpctc_op MODULES test_warpctc_op ENVS FLAGS_warpctc_dir=${WARPCTC_LIB_DIR} SERIAL)
+py_test_modules(test_dist_train MODULES test_dist_train SERIAL)
diff --git a/python/paddle/fluid/tests/unittests/benchmark.py b/python/paddle/fluid/tests/unittests/benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..e891ee932f1440001eb25b222f1f4613e97dfcb1
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/benchmark.py
@@ -0,0 +1,113 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import unittest
+import time
+import itertools
+
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from paddle.fluid.op import Operator
+from op_test import OpTest
+
+
+class BenchmarkSuite(OpTest):
+    def timeit_function(self, callback, iters, *args, **kwargs):
+        assert iters != 0, "Iters should >= 1"
+        start = time.time()
+        for i in range(iters):
+            callback(*args, **kwargs)
+        elapse = time.time() - start
+        return elapse / iters
+
+    def _assert_cpu_gpu_same(self, cpu_outs, gpu_outs, fetch_list, atol):
+        for item_cpu_out, item_gpu_out, variable in zip(cpu_outs, gpu_outs,
+                                                        fetch_list):
+            # the cpu version is baseline, expect gpu version keep same with cpu version.
+            expect = item_cpu_out
+            expect_t = np.array(item_cpu_out)
+            actual = item_gpu_out
+            actual_t = np.array(item_gpu_out)
+            var_name = variable if isinstance(variable,
+                                              basestring) else variable.name
+            self.assertTrue(
+                np.allclose(
+                    actual_t, expect_t, atol=atol),
+                "Output (" + var_name + ") has diff" + str(actual_t) + "\n" +
+                str(expect_t))
+            self.assertListEqual(actual.lod(),
+                                 expect.lod(),
+                                 "Output (" + var_name + ") has different lod")
+
+    def _get_input_names(self):
+        inputs = []
+        for name, value in self.inputs.iteritems():
+            if isinstance(value, list):
+                inputs.extend([sub_name for sub_name, _ in value])
+            inputs.append(name)
+        return inputs
+
+    def _get_output_names(self):
+        outputs = []
+        for var_name, var in self.outputs.iteritems():
+            if isinstance(var, list):
+                for sub_var_name, sub_var in var:
+                    outputs.append(sub_var_name)
+            else:
+                outputs.append(var_name)
+        if len(outputs) == 0:
+            for out_name, out_dup in Operator.get_op_outputs(self.op_type):
+                outputs.append(str(out_name))
+        return outputs
+
+    def check_output_stability(self, atol=1e-8):
+        places = self._get_places()
+        if len(places) < 2:
+            return
+        cpu_outs, fetch_list = self._calc_output(places[0])
+        gpu_outs, _ = self._calc_output(places[1])
+        self._assert_cpu_gpu_same(cpu_outs, gpu_outs, fetch_list, atol)
+
+    def timeit_output_with_place(self, place, iters):
+        return self.timeit_function(self.calc_output, iters, place)
+
+    def timeit_output(self, iters=100):
+        places = self._get_places()
+        elapses = []
+        for place in places:
+            elapses.append(self.timeit_output_with_place(place, iters))
+        for place, elapse in zip(places, elapses):
+            print("One pass of ({2}_op) at {0} cost {1}".format(
+                str(place), elapse, self.op_type))
+
+    def timeit_grad_with_place(self, place, iters=100):
+        inputs_to_check = self._get_input_names()
+        output_names = self._get_output_names()
+        return self.timeit_function(
+            self._get_gradient,
+            iters,
+            inputs_to_check,
+            place,
+            output_names,
+            no_grad_set=None)
+
+    def timeit_grad(self, iters=100):
+        places = self._get_places()
+        elapses = []
+        for place in places:
+            elapses.append(self.timeit_grad_with_place(place, iters))
+        for place, elapse in zip(places, elapses):
+            print("One pass of ({2}_grad_op) at {0} cost {1}".format(
+                str(place), elapse, self.op_type))
diff --git a/python/paddle/fluid/tests/unittests/benchmark_sum_op.py b/python/paddle/fluid/tests/unittests/benchmark_sum_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..91a5f1bca4441d80489a02eb9283928e38321826
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/benchmark_sum_op.py
@@ -0,0 +1,82 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+
+import paddle.fluid as fluid
+from benchmark import BenchmarkSuite
+from op_test import OpTest
+
+# This is a demo op test case for operator benchmarking and high resolution number stability alignment.
+
+
+class TestSumOp(BenchmarkSuite):
+    def setUp(self):
+        self.op_type = "sum"
+        self.customize_testcase()
+        self.customize_fetch_list()
+
+    def customize_fetch_list(self):
+        """
+        customize fetch list, configure the wanted variables.
+        >>> self.fetch_list = ["Out"]
+        """
+        self.fetch_list = ["Out"]
+        # pass
+
+    def customize_testcase(self):
+        # a test case
+        x0 = np.random.random((300, 400)).astype('float32')
+        x1 = np.random.random((300, 400)).astype('float32')
+        x2 = np.random.random((300, 400)).astype('float32')
+
+        # NOTE: if the output is empty, then it will autofilled by benchmarkSuite.
+        # only the output dtype is used, the shape, lod and data is computed from input.
+        self.inputs = {"X": [("x0", x0), ("x1", x1), ("x2", x2)]}
+        self.outputs = {"Out": x0 + x1 + x2}
+
+    def test_check_output(self):
+        """
+        compare the output with customized output. In this case,
+        you should set the correct output by hands.
+        >>> self.outputs = {"Out": x0 + x1 + x2}
+        """
+        self.check_output(atol=1e-8)
+
+    def test_output_stability(self):
+        # compare the cpu gpu output in high resolution.
+        self.check_output_stability()
+
+    def test_timeit_output(self):
+        """
+        perf the op, time cost will be averged in iters.
+        output example
+        >>> One pass of (sum_op) at CPUPlace cost 0.000461330413818
+        >>> One pass of (sum_op) at CUDAPlace(0) cost 0.000556070804596
+        """
+        self.timeit_output(iters=100)
+
+    def test_timeit_grad(self):
+        """
+        perf the op gradient, time cost will be averged in iters.
+        output example
+        >>> One pass of (sum_grad_op) at CPUPlace cost 0.00279935121536
+        >>> One pass of (sum_grad_op) at CUDAPlace(0) cost 0.00500632047653
+        """
+        self.timeit_grad(iters=100)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py
index 299ab8e51f017e1980a8b40e3830fc42b1ff7ccc..307caae4b0cf4869c1abb755215aa97795d47e15 100644
--- a/python/paddle/fluid/tests/unittests/op_test.py
+++ b/python/paddle/fluid/tests/unittests/op_test.py
@@ -15,13 +15,17 @@
 import unittest
 import numpy as np
 import random
+import time
 import itertools
-import paddle.fluid.core as core
 import collections
+
+import paddle.fluid as fluid
+import paddle.fluid.core as core
 from paddle.fluid.backward import append_backward
 from paddle.fluid.op import Operator
 from paddle.fluid.executor import Executor
-from paddle.fluid.framework import Program, OpProtoHolder
+from paddle.fluid.framework import Program, OpProtoHolder, Variable
+from testsuite import create_op, set_input, append_input_output, append_loss_ops
 
 
 def randomize_probability(batch_size, class_num, dtype='float32'):
@@ -33,67 +37,6 @@ def randomize_probability(batch_size, class_num, dtype='float32'):
     return prob
 
 
-def create_op(scope, op_type, inputs, outputs, attrs):
-    kwargs = dict()
-
-    def __create_var__(name, var_name):
-        scope.var(var_name).get_tensor()
-        kwargs[name].append(var_name)
-
-    for in_name, in_dup in Operator.get_op_inputs(op_type):
-        if in_name in inputs:
-            kwargs[in_name] = []
-            if in_dup:
-                sub_in = inputs[in_name]
-                for item in sub_in:
-                    sub_in_name, _ = item[0], item[1]
-                    __create_var__(in_name, sub_in_name)
-            else:
-                __create_var__(in_name, in_name)
-
-    for out_name, out_dup in Operator.get_op_outputs(op_type):
-        if out_name in outputs:
-            kwargs[out_name] = []
-            if out_dup:
-                sub_out = outputs[out_name]
-                for item in sub_out:
-                    sub_out_name, _ = item[0], item[1]
-                    __create_var__(out_name, sub_out_name)
-            else:
-                __create_var__(out_name, out_name)
-
-    for attr_name in Operator.get_op_attr_names(op_type):
-        if attr_name in attrs:
-            kwargs[attr_name] = attrs[attr_name]
-
-    return Operator(op_type, **kwargs)
-
-
-def set_input(scope, op, inputs, place):
-    def __set_input__(var_name, var):
-        if isinstance(var, tuple) or isinstance(var, np.ndarray):
-            tensor = scope.find_var(var_name).get_tensor()
-            if isinstance(var, tuple):
-                tensor.set_lod(var[1])
-                var = var[0]
-            tensor.set_dims(var.shape)
-            tensor.set(var, place)
-        elif isinstance(var, float):
-            scope.find_var(var_name).set_float(var)
-        elif isinstance(var, int):
-            scope.find_var(var_name).set_int(var)
-
-    for in_name, in_dup in Operator.get_op_inputs(op.type()):
-        if in_name in inputs:
-            if in_dup:
-                sub_in = inputs[in_name]
-                for item in sub_in:
-                    sub_in_name, sub_in_val = item[0], item[1]
-                    __set_input__(sub_in_name, sub_in_val)
-            else:
-                __set_input__(in_name, inputs[in_name])
-
-
 def get_numeric_gradient(place,
                          scope,
                          op,
@@ -167,54 +110,15 @@ def get_numeric_gradient(place,
     return gradient_flat.reshape(tensor_to_check.get_dims())
 
 
-def append_input_output(block, op_proto, np_list, is_input):
-    '''Insert VarDesc and generate Python variable instance'''
-    proto_list = op_proto.inputs if is_input else op_proto.outputs
-
-    def create_var(block, name, np_list, var_proto):
-        if name not in np_list:
-            assert var_proto.intermediate, "{} not found".format(name)
-            shape = None
-            lod_level = None
-        else:
-            np_value = np_list[name]
-            if isinstance(np_value, tuple):
-                shape = list(np_value[0].shape)
-                lod_level = len(np_value[1])
-            else:
-                shape = list(np_value.shape)
-                lod_level = 0
-        return block.create_var(
-            dtype="float32", shape=shape, lod_level=lod_level, name=name)
-
-    var_dict = {}
-    for var_proto in proto_list:
-        var_name = str(var_proto.name)
-        if is_input:
-            if (var_name not in np_list) and var_proto.dispensable:
-                continue
-            assert (var_name in np_list) or (var_proto.dispensable), \
-                "Missing {} as input".format(var_name)
-        if var_proto.duplicable:
-            assert isinstance(np_list[var_name], list), \
-                "Duplicable {} should be set as list".format(var_name)
-            var_list = []
-            for (name, np_value) in np_list[var_name]:
-                var_list.append(
-                    create_var(block, name, {name: np_value}, var_proto))
-            var_dict[var_name] = var_list
-        else:
-            var_dict[var_name] = create_var(block, var_name, np_list, var_proto)
-
-    return var_dict
-
-
 class OpTest(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
         '''Fix random seeds to remove randomness from tests'''
         cls._np_rand_state = np.random.get_state()
         cls._py_rand_state = random.getstate()
+        cls.call_once = False
+        cls.dtype = "float32"
+        cls.outputs = {}
 
         np.random.seed(123)
         random.seed(124)
@@ -225,6 +129,31 @@ class OpTest(unittest.TestCase):
         np.random.set_state(cls._np_rand_state)
         random.setstate(cls._py_rand_state)
 
+    def try_call_once(self, data_type):
+        if not self.call_once:
+            self.call_once = True
+            self.dtype = data_type
+
+    def infer_dtype_from_inputs_outputs(self, inputs, outputs):
+        def infer_dtype(numpy_dict):
+            assert isinstance(
+                numpy_dict,
+                dict), "self.inputs, self.outputs must be numpy_dict"
+            for var_name, var_value in numpy_dict.iteritems():
+                if isinstance(var_value, (np.ndarray, np.generic)):
+                    self.try_call_once(var_value.dtype)
+                elif isinstance(var_value, (list, tuple)):
+                    # the case of self.inputs = {"X": [("x0", x0), ("x1", x1), ("x2", x2)]}
+                    if len(var_value) > 1 and isinstance(var_value[1], (
+                            np.ndarray, np.generic)):
+                        instance = var_value[1]
+                        self.try_call_once(instance[1].dtype)
+                else:
+                    self.try_call_once("float32")
+
+        infer_dtype(inputs)
+        infer_dtype(outputs)
+
     def feed_var(self, input_vars, place):
         feed_map = {}
         for var_name in input_vars:
@@ -248,18 +177,14 @@ class OpTest(unittest.TestCase):
 
         return feed_map
 
-    def calc_output(self, place):
-        outs, _ = self._calc_output(place)
-        return outs
-
-    def _calc_output(self, place):
+    def _append_ops(self, block):
         op_proto = OpProtoHolder.instance().get_op_proto(self.op_type)
-
-        program = Program()
-        block = program.global_block()
-
-        inputs = append_input_output(block, op_proto, self.inputs, True)
-        outputs = append_input_output(block, op_proto, self.outputs, False)
+        "infer datatype from inputs and outputs for this test case"
+        self.infer_dtype_from_inputs_outputs(self.inputs, self.outputs)
+        inputs = append_input_output(block, op_proto, self.inputs, True,
+                                     self.dtype)
+        outputs = append_input_output(block, op_proto, self.outputs, False,
+                                      self.dtype)
         op = block.append_op(
             type=self.op_type,
             inputs=inputs,
@@ -269,22 +194,68 @@ class OpTest(unittest.TestCase):
         op.desc.infer_var_type(block.desc)
         op.desc.infer_shape(block.desc)
 
-        fetch_list = []
-        for var_name, var in outputs.iteritems():
-            if var_name in self.outputs:
+    def _get_io_vars(self, block, numpy_inputs):
+        inputs = {}
+        for name, value in numpy_inputs.iteritems():
+            if isinstance(value, list):
+                var_list = [
+                    block.var(sub_name) for sub_name, sub_value in value
+                ]
+                inputs[name] = var_list
+            else:
+                inputs[name] = block.var(name)
+        return inputs
+
+    def _get_inputs(self, block):
+        return self._get_io_vars(block, self.inputs)
+
+    def _get_outputs(self, block):
+        return self._get_io_vars(block, self.outputs)
+
+    def calc_output(self, place):
+        outs, _ = self._calc_output(place)
+        return outs
+
+    def _calc_output(self, place, parallel=False):
+
+        program = Program()
+        block = program.global_block()
+        self._append_ops(block)
+
+        inputs = self._get_inputs(block)
+        outputs = self._get_outputs(block)
+        feed_map = self.feed_var(inputs, place)
+
+        if parallel:
+            use_cuda = False
+            if isinstance(place, fluid.CUDAPlace(0)):
+                use_cuda = True
+            executor = fluid.ParallelExecutor(
+                use_cuda=use_cuda, loss_name=loss.name, main_program=program)
+        else:
+            executor = Executor(place)
+
+        fetch_list = getattr(self, "fetch_list", [])
+        # if the fetch_list is customized by user, we use it directly.
+        # if not, fill the fetch_list by the user configured outputs in test.
+        if len(fetch_list) == 0:
+            for var_name, var in outputs.iteritems():
                 if isinstance(var, list):
                     for v in var:
                         fetch_list.append(v)
                 else:
                     fetch_list.append(var)
-
-        feed_map = self.feed_var(inputs, place)
-
-        exe = Executor(place)
-        outs = exe.run(program,
-                       feed=feed_map,
-                       fetch_list=fetch_list,
-                       return_numpy=False)
+        # if the fetch_list still empty, fill the fetch_list by the operator output.
+        if len(fetch_list) == 0:
+            for out_name, out_dup in Operator.get_op_outputs(self.op_type):
+                fetch_list.append(str(out_name))
+        # fetch_list = map(block.var, fetch_list)
+        if not isinstance(fetch_list[0], Variable):
+            fetch_list = map(block.var, fetch_list)
+        outs = executor.run(program,
+                            feed=feed_map,
+                            fetch_list=fetch_list,
+                            return_numpy=False)
         return outs, fetch_list
 
     def check_output_with_place(self, place, atol):
@@ -340,17 +311,19 @@ class OpTest(unittest.TestCase):
                                          "Output (" + out_name +
                                          ") has different lod at " + str(place))
 
-    def check_output(self, atol=1e-5):
-        places = [core.CPUPlace()]
+    def _get_places(self):
+        places = [fluid.CPUPlace()]
         if core.is_compiled_with_cuda() and core.op_support_gpu(self.op_type):
             places.append(core.CUDAPlace(0))
+        return places
+
+    def check_output(self, atol=1e-5):
+        places = self._get_places()
         for place in places:
             self.check_output_with_place(place, atol)
 
     def check_output_customized(self, checker):
-        places = [core.CPUPlace()]
-        if core.is_compiled_with_cuda() and core.op_support_gpu(self.op_type):
-            places.append(core.CUDAPlace(0))
+        places = self._get_places()
         for place in places:
             outs = self.calc_output(place)
             outs = [np.array(out) for out in outs]
@@ -383,9 +356,7 @@ class OpTest(unittest.TestCase):
                    in_place=False,
                    max_relative_error=0.005,
                    user_defined_grads=None):
-        places = [core.CPUPlace()]
-        if core.is_compiled_with_cuda() and core.op_support_gpu(self.op_type):
-            places.append(core.CUDAPlace(0))
+        places = self._get_places()
         for place in places:
             self.check_grad_with_place(place, inputs_to_check, output_names,
                                        no_grad_set, numeric_grad_delta,
@@ -432,35 +403,6 @@ class OpTest(unittest.TestCase):
                                max_relative_error,
                                "Gradient Check On %s" % str(place))
 
-    @staticmethod
-    def _create_var_descs_(block, var_dict):
-        # FIXME: Try unify with `append_input_output`
-        for param_name in var_dict:
-            var = var_dict[param_name]
-            if not isinstance(var, list) and not isinstance(var, tuple):
-                var = [(param_name, var, None)]
-            if not isinstance(var[0], list) and not isinstance(var[0], tuple):
-                var = [(param_name, var[0], var[1])]
-
-            for i, item in enumerate(var):
-                if not isinstance(item[0], basestring):
-                    item = [[param_name] + list(item)]
-                if len(item) == 2:
-                    if isinstance(item[1], tuple):
-                        var[i] = [item[0], item[1][0], item[1][1]]
-                    else:
-                        # only set var name and value, set lod to None
-                        var[i] = list(item) + [None]
-            var_descs = [(block.create_var(
-                name=name, shape=each.shape, dtype=each.dtype), each, lod)
-                         for name, each, lod in var]
-
-            yield param_name, var_descs
-
-    @staticmethod
-    def _merge_list(iterable):
-        return reduce(lambda a, b: list(a) + list(b), iterable, [])
-
     @staticmethod
     def _numpy_to_lod_tensor(np_value, lod, place):
         tensor = core.LoDTensor()
@@ -473,9 +415,9 @@ class OpTest(unittest.TestCase):
     def np_dtype_to_fluid_dtype(input):
         """Change the dtype of float16 numpy array
 
-        numpy float16 is binded to paddle::platform::float16 
+        numpy float16 is binded to paddle::platform::float16
         in tensor_py.h via the help of uint16 data type since
-        the internal memory representation of float16 is 
+        the internal memory representation of float16 is
         uint16_t in paddle and np.uint16 in numpy, which are
         themselves binded together by pybind.
 
@@ -483,91 +425,39 @@ class OpTest(unittest.TestCase):
             input: input numpy array
 
         Returns:
-            input: The dtype of input will be changed to np.uint16 if 
+            input: The dtype of input will be changed to np.uint16 if
                 it is originally np.float16, such that the internal memory
-                of input will be reinterpreted as of dtype np.uint16. 
+                of input will be reinterpreted as of dtype np.uint16.
         """
         if input.dtype == np.float16:
             input.dtype = np.uint16
         return input
 
-    def _get_gradient(self, input_to_check, place, output_names, no_grad_set):
+    def _get_gradient(self,
+                      input_to_check,
+                      place,
+                      output_names,
+                      no_grad_set,
+                      parallel=False):
         prog = Program()
         block = prog.global_block()
-        inputs_with_np = {
-            key: value
-            for (key, value) in OpTest._create_var_descs_(
-                block, getattr(self, 'inputs', {}))
-        }
-        outputs_with_np = {
-            key: val
-            for (key, val) in OpTest._create_var_descs_(
-                block, getattr(self, 'outputs', {}))
-        }
-        inputs = {
-            k: [item[0] for item in inputs_with_np[k]]
-            for k in inputs_with_np
-        }
-        outputs = {
-            k: [item[0] for item in outputs_with_np[k]]
-            for k in outputs_with_np
-        }
-
-        op = block.append_op(
-            type=self.op_type,
-            inputs=inputs,
-            outputs=outputs,
-            attrs=getattr(self, 'attrs', {}))
-
-        # infer variable type and infer shape in compile-time
-        op.desc.infer_var_type(block.desc)
-        op.desc.infer_shape(block.desc)
-
-        mean_inputs = map(block.var, output_names)
-
-        if len(mean_inputs) == 1:
-            loss = block.create_var(dtype=mean_inputs[0].dtype, shape=[1])
-            op = block.append_op(
-                inputs={"X": mean_inputs}, outputs={"Out": loss}, type='mean')
-            op.desc.infer_var_type(block.desc)
-            op.desc.infer_shape(block.desc)
-        else:
-            avg_sum = []
-            for cur_loss in mean_inputs:
-                cur_avg_loss = block.create_var(dtype=cur_loss.dtype, shape=[1])
-                op = block.append_op(
-                    inputs={"X": [cur_loss]},
-                    outputs={"Out": [cur_avg_loss]},
-                    type="mean")
-                op.desc.infer_var_type(block.desc)
-                op.desc.infer_shape(block.desc)
-                avg_sum.append(cur_avg_loss)
-
-            loss_sum = block.create_var(dtype=avg_sum[0].dtype, shape=[1])
-            op_sum = block.append_op(
-                inputs={"X": avg_sum}, outputs={"Out": loss_sum}, type='sum')
-            op_sum.desc.infer_var_type(block.desc)
-            op_sum.desc.infer_shape(block.desc)
-
-            loss = block.create_var(dtype=loss_sum.dtype, shape=[1])
-            op_loss = block.append_op(
-                inputs={"X": loss_sum},
-                outputs={"Out": loss},
-                type='scale',
-                attrs={'scale': 1.0 / float(len(avg_sum))})
-            op_loss.desc.infer_var_type(block.desc)
-            op_loss.desc.infer_shape(block.desc)
-
+        self._append_ops(block)
+        loss = append_loss_ops(block, output_names)
         param_grad_list = append_backward(
             loss=loss, parameter_list=input_to_check, no_grad_set=no_grad_set)
 
-        feed_dict = {
-            item[0].name: OpTest._numpy_to_lod_tensor(item[1], item[2], place)
-            for p_name in inputs_with_np for item in inputs_with_np[p_name]
-        }
+        inputs = self._get_inputs(block)
+        feed_dict = self.feed_var(inputs, place)
 
         fetch_list = [g for p, g in param_grad_list]
-        executor = Executor(place)
+        if parallel:
+            use_cuda = False
+            if isinstance(place, fluid.CUDAPlace(0)):
+                use_cuda = True
+            executor = fluid.ParallelExecutor(
+                use_cuda=use_cuda, loss_name=loss.name, main_program=program)
+        else:
+            executor = Executor(place)
         return map(np.array,
                    executor.run(prog, feed_dict, fetch_list,
                                 return_numpy=False))
diff --git a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
new file mode 100644
index 0000000000000000000000000000000000000000..c9c3c648717814c28c39a401487925824e885946
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
@@ -0,0 +1,96 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import paddle.fluid as fluid
+import time
+import numpy as np
+
+__all__ = ['TestParallelExecutorBase']
+
+
+class TestParallelExecutorBase(unittest.TestCase):
+    def check_network_convergence(self,
+                                  method,
+                                  memory_opt=True,
+                                  iter=50,
+                                  batch_size=None,
+                                  allow_op_delay=False,
+                                  feed_dict=None,
+                                  seed=None,
+                                  use_parallel_executor=True,
+                                  balance_parameter_opt_between_cards=False):
+        def run_executor(exe, feed, fetch_list, program=None):
+            if isinstance(exe, fluid.ParallelExecutor):
+                res = exe.run(fetch_list=fetch_list, feed=feed)
+            elif isinstance(exe, fluid.Executor):
+                if program is None:
+                    program = fluid.default_main_program()
+                res = exe.run(program=program, feed=feed, fetch_list=fetch_list)
+            else:
+                raise ValueError('Unkown type exe')
+            return res
+
+        main = fluid.Program()
+        startup = fluid.Program()
+        startup.random_seed = 1  # Fix random seed
+        with fluid.program_guard(main, startup):
+            if seed is not None:
+                startup.random_seed = seed
+            loss = method(use_feed=feed_dict is not None)
+            adam = fluid.optimizer.Adam()
+            adam.minimize(loss)
+            if memory_opt:
+                fluid.memory_optimize(main)
+            place = fluid.CUDAPlace(0)
+            startup_exe = fluid.Executor(place)
+            startup_exe.run(startup)
+            exec_strategy = fluid.ExecutionStrategy()
+            exec_strategy.allow_op_delay = allow_op_delay
+
+            build_strategy = fluid.BuildStrategy()
+            build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce if balance_parameter_opt_between_cards else fluid.BuildStrategy.ReduceStrategy.AllReduce
+
+            if use_parallel_executor:
+                exe = fluid.ParallelExecutor(
+                    True,
+                    loss_name=loss.name,
+                    exec_strategy=exec_strategy,
+                    build_strategy=build_strategy)
+            else:
+                exe = fluid.Executor(place=place)
+
+            if batch_size is not None:
+                batch_size *= fluid.core.get_cuda_device_count()
+            begin = time.time()
+            first_loss, = run_executor(
+                exe=exe, feed=feed_dict, fetch_list=[loss.name])
+            first_loss = np.array(first_loss)
+
+            for i in xrange(iter):
+                run_executor(exe=exe, feed=feed_dict, fetch_list=[])
+
+            last_loss, = run_executor(
+                exe=exe, feed=feed_dict, fetch_list=[loss.name])
+            end = time.time()
+
+            if batch_size is not None:
+                print "%.4f Instance per second" % (
+                    (batch_size * iter + 2) / (end - begin))
+
+            last_loss = np.array(last_loss)
+
+            print first_loss, last_loss
+            # self.assertGreater(first_loss[0], last_loss[0])
+            return first_loss, last_loss
diff --git a/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py b/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py
index bffb4f3b666a7ddcc133b7c30fab132b49aa1d0e..87c11e7880e73b911f21dda77c1cc2b4850b3591 100644
--- a/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py
+++ b/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py
@@ -17,7 +17,10 @@ import numpy as np
 from op_test import OpTest
 
 
-def bilinear_interp_np(input, out_h, out_w):
+def bilinear_interp_np(input, out_h, out_w, out_size):
+    if out_size is not None:
+        out_h = out_size[0]
+        out_w = out_size[1]
     batch_size, channel, in_h, in_w = input.shape
     if out_h > 1:
         ratio_h = (in_h - 1.0) / (out_h - 1.0)
@@ -49,12 +52,15 @@ def bilinear_interp_np(input, out_h, out_w):
 
 class TestBilinearInterpOp(OpTest):
     def setUp(self):
+        self.out_size = None
         self.init_test_case()
         self.op_type = "bilinear_interp"
         input_np = np.random.random(self.input_shape).astype("float32")
-        output_np = bilinear_interp_np(input_np, self.out_h, self.out_w)
-
+        output_np = bilinear_interp_np(input_np, self.out_h, self.out_w,
+                                       self.out_size)
         self.inputs = {'X': input_np}
+        if self.out_size is not None:
+            self.inputs['OutSize'] = self.out_size
         self.attrs = {'out_h': self.out_h, 'out_w': self.out_w}
         self.outputs = {'Out': output_np}
 
@@ -68,6 +74,7 @@ class TestBilinearInterpOp(OpTest):
         self.input_shape = [2, 3, 4, 4]
         self.out_h = 2
         self.out_w = 2
+        self.out_size = np.array([3, 3]).astype("int32")
 
 
 class TestCase1(TestBilinearInterpOp):
@@ -91,5 +98,29 @@ class TestCase3(TestBilinearInterpOp):
         self.out_w = 128
 
 
+class TestCase4(TestBilinearInterpOp):
+    def init_test_case(self):
+        self.input_shape = [4, 1, 7, 8]
+        self.out_h = 1
+        self.out_w = 1
+        self.out_size = np.array([2, 2]).astype("int32")
+
+
+class TestCase5(TestBilinearInterpOp):
+    def init_test_case(self):
+        self.input_shape = [3, 3, 9, 6]
+        self.out_h = 12
+        self.out_w = 12
+        self.out_size = np.array([11, 11]).astype("int32")
+
+
+class TestCase6(TestBilinearInterpOp):
+    def init_test_case(self):
+        self.input_shape = [1, 1, 128, 64]
+        self.out_h = 64
+        self.out_w = 128
+        self.out_size = np.array([65, 129]).astype("int32")
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_box_coder_op.py b/python/paddle/fluid/tests/unittests/test_box_coder_op.py
index 56f5af91d8e58086c12fde6948229675569aa271..b4c48d85f2c564d877c0a29e64dd2944d2b26ea3 100644
--- a/python/paddle/fluid/tests/unittests/test_box_coder_op.py
+++ b/python/paddle/fluid/tests/unittests/test_box_coder_op.py
@@ -19,7 +19,8 @@ import math
 from op_test import OpTest
 
 
-def box_coder(target_box, prior_box, prior_box_var, output_box, code_type):
+def box_coder(target_box, prior_box, prior_box_var, output_box, code_type,
+              box_normalized):
     prior_box_x = (
         (prior_box[:, 2] + prior_box[:, 0]) / 2).reshape(1, prior_box.shape[0])
     prior_box_y = (
@@ -30,6 +31,9 @@ def box_coder(target_box, prior_box, prior_box_var, output_box, code_type):
         (prior_box[:, 3] - prior_box[:, 1])).reshape(1, prior_box.shape[0])
     prior_box_var = prior_box_var.reshape(1, prior_box_var.shape[0],
                                           prior_box_var.shape[1])
+    if not box_normalized:
+        prior_box_height = prior_box_height + 1
+        prior_box_width = prior_box_width + 1
 
     if (code_type == "EncodeCenterSize"):
         target_box_x = ((target_box[:, 2] + target_box[:, 0]) / 2).reshape(
@@ -40,6 +44,9 @@ def box_coder(target_box, prior_box, prior_box_var, output_box, code_type):
             target_box.shape[0], 1)
         target_box_height = ((target_box[:, 3] - target_box[:, 1])).reshape(
             target_box.shape[0], 1)
+        if not box_normalized:
+            target_box_height = target_box_height + 1
+            target_box_width = target_box_width + 1
 
         output_box[:,:,0] = (target_box_x - prior_box_x) / prior_box_width / \
                 prior_box_var[:,:,0]
@@ -64,9 +71,13 @@ def box_coder(target_box, prior_box, prior_box_var, output_box, code_type):
         output_box[:, :, 1] = target_box_y - target_box_height / 2
         output_box[:, :, 2] = target_box_x + target_box_width / 2
         output_box[:, :, 3] = target_box_y + target_box_height / 2
+        if not box_normalized:
+            output_box[:, :, 2] = output_box[:, :, 2] - 1
+            output_box[:, :, 3] = output_box[:, :, 3] - 1
 
 
-def batch_box_coder(prior_box, prior_box_var, target_box, lod, code_type):
+def batch_box_coder(prior_box, prior_box_var, target_box, lod, code_type,
+                    box_normalized):
     n = target_box.shape[0]
     m = prior_box.shape[0]
     output_box = np.zeros((n, m, 4), dtype=np.float32)
@@ -74,11 +85,11 @@ def batch_box_coder(prior_box, prior_box_var, target_box, lod, code_type):
         if (code_type == "EncodeCenterSize"):
             box_coder(target_box[lod[i]:lod[i + 1], :], prior_box,
                       prior_box_var, output_box[lod[i]:lod[i + 1], :, :],
-                      code_type)
+                      code_type, box_normalized)
         elif (code_type == "DecodeCenterSize"):
             box_coder(target_box[lod[i]:lod[i + 1], :, :], prior_box,
                       prior_box_var, output_box[lod[i]:lod[i + 1], :, :],
-                      code_type)
+                      code_type, box_normalized)
     return output_box
 
 
@@ -93,15 +104,45 @@ class TestBoxCoderOp(OpTest):
         prior_box_var = np.random.random((10, 4)).astype('float32')
         target_box = np.random.random((5, 10, 4)).astype('float32')
         code_type = "DecodeCenterSize"
+        box_normalized = False
         output_box = batch_box_coder(prior_box, prior_box_var, target_box,
-                                     lod[0], code_type)
+                                     lod[0], code_type, box_normalized)
 
         self.inputs = {
             'PriorBox': prior_box,
             'PriorBoxVar': prior_box_var,
             'TargetBox': target_box,
         }
-        self.attrs = {'code_type': 'decode_center_size'}
+        self.attrs = {
+            'code_type': 'decode_center_size',
+            'box_normalized': False
+        }
+        self.outputs = {'OutputBox': output_box}
+
+
+class TestBoxCoderOpWithoutBoxVar(OpTest):
+    def test_check_output(self):
+        self.check_output()
+
+    def setUp(self):
+        self.op_type = "box_coder"
+        lod = [[0, 1, 2, 3, 4, 5]]
+        prior_box = np.random.random((10, 4)).astype('float32')
+        prior_box_var = np.ones((10, 4)).astype('float32')
+        target_box = np.random.random((5, 10, 4)).astype('float32')
+        code_type = "DecodeCenterSize"
+        box_normalized = False
+        output_box = batch_box_coder(prior_box, prior_box_var, target_box,
+                                     lod[0], code_type, box_normalized)
+
+        self.inputs = {
+            'PriorBox': prior_box,
+            'TargetBox': target_box,
+        }
+        self.attrs = {
+            'code_type': 'decode_center_size',
+            'box_normalized': False
+        }
         self.outputs = {'OutputBox': output_box}
 
 
@@ -116,15 +157,16 @@ class TestBoxCoderOpWithLoD(OpTest):
         prior_box_var = np.random.random((10, 4)).astype('float32')
         target_box = np.random.random((20, 4)).astype('float32')
         code_type = "EncodeCenterSize"
+        box_normalized = True
         output_box = batch_box_coder(prior_box, prior_box_var, target_box,
-                                     lod[0], code_type)
+                                     lod[0], code_type, box_normalized)
 
         self.inputs = {
             'PriorBox': prior_box,
             'PriorBoxVar': prior_box_var,
             'TargetBox': (target_box, lod),
         }
-        self.attrs = {'code_type': 'encode_center_size'}
+        self.attrs = {'code_type': 'encode_center_size', 'box_normalized': True}
         self.outputs = {'OutputBox': output_box}
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py
index d864b9b348e961c585749d47d449d775b2dfebc9..ded2f130288a4a959a1c859b2cc8ccf0912efb12 100644
--- a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py
@@ -21,8 +21,11 @@ from op_test import OpTest
 
 def conv2dtranspose_forward_naive(input_, filter_, attrs):
     in_n, in_c, in_h, in_w = input_.shape
-    f_c, out_c, f_h, f_w = filter_.shape
+    f_c, f_out_c, f_h, f_w = filter_.shape
+    groups = attrs['groups']
     assert in_c == f_c
+    out_c = f_out_c * groups
+    sub_in_c = in_c / groups
 
     stride, pad, dilations = attrs['strides'], attrs['paddings'], attrs[
         'dilations']
@@ -36,15 +39,21 @@ def conv2dtranspose_forward_naive(input_, filter_, attrs):
     for n in range(in_n):
         for i in range(in_h):
             for j in range(in_w):
-                input_masked = input_[n, :, i, j]  # (c)
-                input_masked = np.reshape(input_masked, (in_c, 1, 1))
-                input_masked = np.tile(input_masked, (1, f_h, f_w))
-
-                for k in range(out_c):
-                    tmp_out = np.sum(input_masked * filter_[:, k, :, :], axis=0)
-                    i1, i2 = i * stride[0], i * stride[0] + d_bolck_h
-                    j1, j2 = j * stride[0], j * stride[0] + d_bolck_h
-                    out[n, k, i1:i2:dilations[0], j1:j2:dilations[1]] += tmp_out
+                for g in range(groups):
+                    input_masked = input_[n, g * sub_in_c:(g + 1) * sub_in_c, i,
+                                          j]  # (c)
+                    input_masked = np.reshape(input_masked, (sub_in_c, 1, 1))
+                    input_masked = np.tile(input_masked, (1, f_h, f_w))
+
+                    for k in range(f_out_c):
+                        tmp_out = np.sum(
+                            input_masked *
+                            filter_[g * sub_in_c:(g + 1) * sub_in_c, k, :, :],
+                            axis=0)
+                        i1, i2 = i * stride[0], i * stride[0] + d_bolck_h
+                        j1, j2 = j * stride[0], j * stride[0] + d_bolck_h
+                        out[n, g * f_out_c + k, i1:i2:dilations[0], j1:j2:
+                            dilations[1]] += tmp_out
 
     out = out[:, :, pad[0]:out_h - pad[0], pad[1]:out_w - pad[1]]
     return out
@@ -64,6 +73,7 @@ class TestConv2dTransposeOp(OpTest):
         self.attrs = {
             'strides': self.stride,
             'paddings': self.pad,
+            'groups': self.groups,
             'dilations': self.dilations,
             'use_cudnn': self.use_cudnn,
             'data_format': 'AnyLayout'  # TODO(dzhwinter) : should be fix latter
@@ -127,6 +137,7 @@ class TestConv2dTransposeOp(OpTest):
         self.pad = [0, 0]
         self.stride = [1, 1]
         self.dilations = [1, 1]
+        self.groups = 1
         self.input_size = [2, 3, 5, 5]  # NCHW
         f_c = self.input_size[1]
         self.filter_size = [f_c, 6, 3, 3]
@@ -140,16 +151,29 @@ class TestWithPad(TestConv2dTransposeOp):
         self.pad = [1, 1]
         self.stride = [1, 1]
         self.dilations = [1, 1]
+        self.groups = 1
         self.input_size = [2, 3, 5, 5]  # NCHW
         f_c = self.input_size[1]
         self.filter_size = [f_c, 6, 3, 3]
 
 
+class TestWithGroups(TestConv2dTransposeOp):
+    def init_test_case(self):
+        self.pad = [1, 1]
+        self.stride = [1, 1]
+        self.dilations = [1, 1]
+        self.groups = 2
+        self.input_size = [2, 4, 5, 5]  # NCHW
+        f_c = self.input_size[1]
+        self.filter_size = [f_c, 3, 3, 3]
+
+
 class TestWithStride(TestConv2dTransposeOp):
     def init_test_case(self):
         self.pad = [1, 1]
         self.stride = [2, 2]
         self.dilations = [1, 1]
+        self.groups = 1
         self.input_size = [2, 3, 5, 5]  # NCHW
         f_c = self.input_size[1]
         self.filter_size = [f_c, 6, 3, 3]
@@ -159,6 +183,7 @@ class TestWithDilation(TestConv2dTransposeOp):
     def init_test_case(self):
         self.pad = [1, 1]
         self.stride = [1, 1]
+        self.groups = 1
         self.dilations = [2, 2]
         self.input_size = [2, 3, 5, 5]  # NCHW
         f_c = self.input_size[1]
@@ -176,6 +201,7 @@ class TestCUDNNWithPad(TestWithPad):
     def init_test_case(self):
         self.pad = [1, 1]
         self.stride = [1, 1]
+        self.groups = 1
         self.dilations = [1, 1]
         self.input_size = [2, 3, 5, 5]  # NCHW
         f_c = self.input_size[1]
@@ -190,6 +216,7 @@ class TestCUDNNWithStride(TestWithStride):
     def init_test_case(self):
         self.pad = [1, 1]
         self.stride = [2, 2]
+        self.groups = 1
         self.dilations = [1, 1]
         self.input_size = [2, 3, 5, 5]  # NCHW
         f_c = self.input_size[1]
@@ -200,6 +227,21 @@ class TestCUDNNWithStride(TestWithStride):
         self.op_type = "conv2d_transpose"
 
 
+class TestCUDNNWithGroups(TestWithGroups):
+    def init_test_case(self):
+        self.pad = [1, 1]
+        self.stride = [1, 1]
+        self.dilations = [1, 1]
+        self.groups = 2
+        self.input_size = [2, 4, 5, 5]  # NCHW
+        f_c = self.input_size[1]
+        self.filter_size = [f_c, 3, 3, 3]
+
+    def init_op_type(self):
+        self.use_cudnn = True
+        self.op_type = "conv2d_transpose"
+
+
 # Please Don't remove the following code.
 # Currently, CI use cudnn V5.0 which not support dilation conv.
 # class TestCUDNNWithDilation(TestWithDilation):
diff --git a/python/paddle/fluid/tests/unittests/test_conv3d_transpose_op.py b/python/paddle/fluid/tests/unittests/test_conv3d_transpose_op.py
index 55ba238710c56dd0daea388cd2dcdb79243bb71e..c9f26d10df8ff39d6bd77b1597336600f676d362 100644
--- a/python/paddle/fluid/tests/unittests/test_conv3d_transpose_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv3d_transpose_op.py
@@ -21,8 +21,11 @@ from op_test import OpTest
 
 def conv3dtranspose_forward_naive(input_, filter_, attrs):
     in_n, in_c, in_d, in_h, in_w = input_.shape
-    f_c, out_c, f_d, f_h, f_w = filter_.shape
+    f_c, f_out_c, f_d, f_h, f_w = filter_.shape
+    groups = attrs['groups']
     assert in_c == f_c
+    out_c = f_out_c * groups
+    sub_in_c = in_c / groups
 
     stride, pad, dilations = attrs['strides'], attrs['paddings'], attrs[
         'dilations']
@@ -39,18 +42,23 @@ def conv3dtranspose_forward_naive(input_, filter_, attrs):
         for d in range(in_d):
             for i in range(in_h):
                 for j in range(in_w):
-                    input_masked = input_[n, :, d, i, j]  # (c)
-                    input_masked = np.reshape(input_masked, (in_c, 1, 1, 1))
-                    input_masked = np.tile(input_masked, (1, f_d, f_h, f_w))
-
-                    for k in range(out_c):
-                        tmp_out = np.sum(input_masked * filter_[:, k, :, :, :],
-                                         axis=0)
-                        d1, d2 = d * stride[0], d * stride[0] + d_bolck_d
-                        i1, i2 = i * stride[1], i * stride[1] + d_bolck_h
-                        j1, j2 = j * stride[2], j * stride[2] + d_bolck_w
-                        out[n, k, d1:d2:dilations[0], i1:i2:dilations[1], j1:j2:
-                            dilations[2]] += tmp_out
+                    for g in range(groups):
+                        input_masked = input_[n, g * sub_in_c:(g + 1
+                                                               ) * sub_in_c, d,
+                                              i, j]  # (c)
+                        input_masked = np.reshape(input_masked,
+                                                  (sub_in_c, 1, 1, 1))
+                        input_masked = np.tile(input_masked, (1, f_d, f_h, f_w))
+
+                        for k in range(f_out_c):
+                            tmp_out = np.sum(input_masked * filter_[
+                                g * sub_in_c:(g + 1) * sub_in_c, k, :, :, :],
+                                             axis=0)
+                            d1, d2 = d * stride[0], d * stride[0] + d_bolck_d
+                            i1, i2 = i * stride[1], i * stride[1] + d_bolck_h
+                            j1, j2 = j * stride[2], j * stride[2] + d_bolck_w
+                            out[n, g * f_out_c + k, d1:d2:dilations[0], i1:i2:
+                                dilations[1], j1:j2:dilations[2]] += tmp_out
 
     out = out[:, :, pad[0]:out_d - pad[0], pad[1]:out_h - pad[1], pad[2]:out_w -
               pad[2]]
@@ -72,6 +80,7 @@ class TestConv3dTransposeOp(OpTest):
             'strides': self.stride,
             'paddings': self.pad,
             'dilations': self.dilations,
+            'groups': self.groups,
             'use_cudnn': self.use_cudnn,
             'data_format': 'AnyLayout'  # TODO(dzhwinter) : should be fix latter
         }
@@ -134,6 +143,7 @@ class TestConv3dTransposeOp(OpTest):
         self.pad = [0, 0, 0]
         self.stride = [1, 1, 1]
         self.dilations = [1, 1, 1]
+        self.groups = 1
         self.input_size = [2, 3, 5, 5, 5]  # NCDHW
         f_c = self.input_size[1]
         self.filter_size = [f_c, 6, 3, 3, 3]
@@ -147,16 +157,29 @@ class TestWithPad(TestConv3dTransposeOp):
         self.pad = [1, 1, 1]
         self.stride = [1, 1, 1]
         self.dilations = [1, 1, 1]
+        self.groups = 1
         self.input_size = [2, 3, 5, 5, 5]  # NCDHW
         f_c = self.input_size[1]
         self.filter_size = [f_c, 6, 3, 3, 3]
 
 
+class TestWithGroups(TestConv3dTransposeOp):
+    def init_test_case(self):
+        self.pad = [1, 1, 1]
+        self.stride = [1, 1, 1]
+        self.dilations = [1, 1, 1]
+        self.groups = 2
+        self.input_size = [2, 4, 5, 5, 5]  # NCHW
+        f_c = self.input_size[1]
+        self.filter_size = [f_c, 3, 3, 3, 3]
+
+
 class TestWithStride(TestConv3dTransposeOp):
     def init_test_case(self):
         self.pad = [1, 1, 1]
         self.stride = [2, 2, 2]
         self.dilations = [1, 1, 1]
+        self.groups = 1
         self.input_size = [2, 3, 5, 5, 5]  # NCDHW
         f_c = self.input_size[1]
         self.filter_size = [f_c, 6, 3, 3, 3]
@@ -167,6 +190,7 @@ class TestWithDilation(TestConv3dTransposeOp):
         self.pad = [1, 1, 1]
         self.stride = [1, 1, 1]
         self.dilations = [2, 2, 2]
+        self.groups = 1
         self.input_size = [2, 3, 5, 5, 5]  # NCDHW
         f_c = self.input_size[1]
         self.filter_size = [f_c, 6, 3, 3, 3]
@@ -184,6 +208,7 @@ class TestCUDNNWithPad(TestWithPad):
         self.pad = [1, 1, 1]
         self.stride = [1, 1, 1]
         self.dilations = [1, 1, 1]
+        self.groups = 1
         self.input_size = [2, 3, 5, 5, 5]  # NCDHW
         f_c = self.input_size[1]
         self.filter_size = [f_c, 6, 3, 3, 3]
@@ -198,6 +223,7 @@ class TestCUDNNWithStride(TestWithStride):
         self.pad = [1, 1, 1]
         self.stride = [2, 2, 2]
         self.dilations = [1, 1, 1]
+        self.groups = 1
         self.input_size = [2, 3, 5, 5, 5]  # NCDHW
         f_c = self.input_size[1]
         self.filter_size = [f_c, 6, 3, 3, 3]
@@ -207,6 +233,21 @@ class TestCUDNNWithStride(TestWithStride):
         self.op_type = "conv3d_transpose"
 
 
+class TestCUDNNWithGroups(TestWithGroups):
+    def init_test_case(self):
+        self.pad = [1, 1, 1]
+        self.stride = [1, 1, 1]
+        self.dilations = [1, 1, 1]
+        self.groups = 2
+        self.input_size = [2, 4, 5, 5, 5]  # NCHW
+        f_c = self.input_size[1]
+        self.filter_size = [f_c, 3, 3, 3, 3]
+
+    def init_op_type(self):
+        self.use_cudnn = True
+        self.op_type = "conv3d_transpose"
+
+
 # Please Don't remove the following code.
 # Currently, CI use cudnn V5.0 which not support dilation conv.
 # class TestCUDNNWithDilation(TestWithDilation):
diff --git a/python/paddle/fluid/tests/unittests/test_crop_op.py b/python/paddle/fluid/tests/unittests/test_crop_op.py
index 20cc3a643f1adfc04faad15e1b7baad3e22d9d29..4016089c01644f0389855ab114360f90c50a1bbe 100644
--- a/python/paddle/fluid/tests/unittests/test_crop_op.py
+++ b/python/paddle/fluid/tests/unittests/test_crop_op.py
@@ -42,9 +42,9 @@ class TestCropOp(OpTest):
     def setUp(self):
         self.op_type = "crop"
         self.crop_by_input = False
+        self.offset_by_input = False
         self.attrs = {}
         self.initTestCase()
-        self.attrs['offsets'] = self.offsets
         if self.crop_by_input:
             self.inputs = {
                 'X': np.random.random(self.x_shape).astype("float32"),
@@ -55,6 +55,10 @@ class TestCropOp(OpTest):
             self.inputs = {
                 'X': np.random.random(self.x_shape).astype("float32"),
             }
+        if self.offset_by_input:
+            self.inputs['Offsets'] = np.array(self.offsets).astype('int32')
+        else:
+            self.attrs['offsets'] = self.offsets
         self.outputs = {
             'Out': crop(self.inputs['X'], self.offsets, self.crop_shape)
         }
@@ -101,5 +105,22 @@ class TestCase4(TestCropOp):
         self.crop_by_input = True
 
 
+class TestCase5(TestCropOp):
+    def initTestCase(self):
+        self.x_shape = (3, 4, 5)
+        self.crop_shape = [2, 2, 3]
+        self.offsets = [1, 0, 2]
+        self.offset_by_input = True
+
+
+class TestCase6(TestCropOp):
+    def initTestCase(self):
+        self.x_shape = (10, 9, 14)
+        self.crop_shape = [3, 3, 5]
+        self.offsets = [3, 5, 4]
+        self.crop_by_input = True
+        self.offset_by_input = True
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_debugger.py b/python/paddle/fluid/tests/unittests/test_debugger.py
index 67b03f635b6f8a3003efabe5425325080d47f61c..870952f2f916dcdec5991ac5c10d2da3a7ab18a8 100644
--- a/python/paddle/fluid/tests/unittests/test_debugger.py
+++ b/python/paddle/fluid/tests/unittests/test_debugger.py
@@ -15,7 +15,7 @@
 import unittest
 import paddle.fluid as fluid
 import paddle.fluid.core as core
-from paddle.fluid import debuger
+from paddle.fluid import debugger
 from paddle.fluid.framework import Program
 
 
@@ -51,9 +51,9 @@ class TestDebugger(unittest.TestCase):
             outputs={"Out": mul_out},
             attrs={"x_num_col_dims": 1})
 
-        print(debuger.pprint_program_codes(p))
+        print(debugger.pprint_program_codes(p))
 
-        debuger.draw_block_graphviz(p.block(0), path="./test.dot")
+        debugger.draw_block_graphviz(p.block(0), path="./test.dot")
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_detection_map_op.py b/python/paddle/fluid/tests/unittests/test_detection_map_op.py
index a905a854ad157ffa3d7816dfbd445f3e344a1249..f545ad155ccd28c2d34e424d307eed49b37f20fb 100644
--- a/python/paddle/fluid/tests/unittests/test_detection_map_op.py
+++ b/python/paddle/fluid/tests/unittests/test_detection_map_op.py
@@ -160,7 +160,9 @@ class TestDetectionMAPOp(OpTest):
         label_count, true_pos, false_pos = get_input_pos(
             self.class_pos_count, self.true_pos, self.true_pos_lod,
             self.false_pos, self.false_pos_lod)
-        for (label, difficult, xmin, ymin, xmax, ymax) in self.label:
+        for v in self.label:
+            label = v[0]
+            difficult = False if len(v) == 5 else v[1]
             if self.evaluate_difficult:
                 label_count[label] += 1
             elif not difficult:
@@ -245,6 +247,15 @@ class TestDetectionMAPOpSkipDiff(TestDetectionMAPOp):
                        [2, 0.8, 0, 1], [2, 0.1, 1, 0], [3, 0.2, 0, 1]]
 
 
+class TestDetectionMAPOpWithoutDiff(TestDetectionMAPOp):
+    def init_test_case(self):
+        super(TestDetectionMAPOpWithoutDiff, self).init_test_case()
+
+        # label xmin ymin xmax ymax
+        self.label = [[1, 0.1, 0.1, 0.3, 0.3], [1, 0.6, 0.6, 0.8, 0.8],
+                      [2, 0.3, 0.3, 0.6, 0.5], [1, 0.7, 0.1, 0.9, 0.3]]
+
+
 class TestDetectionMAPOp11Point(TestDetectionMAPOp):
     def init_test_case(self):
         super(TestDetectionMAPOp11Point, self).init_test_case()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_train.py b/python/paddle/fluid/tests/unittests/test_dist_train.py
index 77e9a8f7e72a9e0790ce1d1f48356abcca8eaccf..2314bb2ed8a4eeb34752fd5d040f8a8476798aa6 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_train.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_train.py
@@ -12,19 +12,21 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import os
+import time
 import unittest
+from multiprocessing import Process
+
+import numpy
 
 import paddle.fluid as fluid
-import paddle.fluid.core as core
 import paddle.fluid.layers as layers
-import numpy
-from multiprocessing import Process
-from threading import Thread
-import os, sys
-import time
 
 
 class TestSendOp(unittest.TestCase):
+    @unittest.skip(
+        "This test is buggy. We cannot use time.sleep to sync processes, the connection may fail in unittest."
+    )
     def test_send(self):
         # Run init_serv in a thread
         place = fluid.CPUPlace()
@@ -34,7 +36,7 @@ class TestSendOp(unittest.TestCase):
         p.start()
 
         time.sleep(10)
-        with open("/tmp/paddle.%d.selected_port" % p.pid, "r") as fn:
+        with open("/tmp/paddle.%d.port" % p.pid, "r") as fn:
             selected_port = int(fn.readlines()[0])
         self.init_client(place, selected_port)
 
@@ -52,15 +54,18 @@ class TestSendOp(unittest.TestCase):
             serv = layers.ListenAndServ(
                 "127.0.0.1:0", ["X"], optimizer_mode=False)
             with serv.do():
+                out_var = main.global_block().create_var(
+                    name="scale_0.tmp_0",
+                    psersistable=True,
+                    dtype="float32",
+                    shape=[32, 32])
                 x = layers.data(
                     shape=[32, 32],
                     dtype='float32',
                     name="X",
                     append_batch_size=False)
                 fluid.initializer.Constant(value=1.0)(x, main.global_block())
-                o = layers.scale(x=x, scale=10.0)
-            main.global_block().create_var(
-                name=o.name, psersistable=False, dtype=o.dtype, shape=o.shape)
+                layers.scale(x=x, scale=10.0, out=out_var)
 
         self.server_exe = fluid.Executor(place)
         self.server_exe.run(main)
diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
new file mode 100644
index 0000000000000000000000000000000000000000..32647f9aa81431a3ecc798df6f1360a14fd978af
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
@@ -0,0 +1,65 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.fluid as fluid
+from paddle.fluid.transpiler.distribute_transpiler import delete_ops
+
+from transpiler_test import TranspilerTest
+
+
+class TestDistTranspiler(TranspilerTest):
+    def setUp(self):
+        self.current_pserver_ep = "127.0.0.1:6174"
+
+    def test_transpiler(self):
+        trainer = self.get_trainer()
+        pserver, startup = self.get_pserver(self.current_pserver_ep)
+        self.assertEqual([op.type for op in trainer.global_block().ops],
+                         self.get_expect_trainer_ops())
+
+        self.assertEqual(len(pserver.blocks), 3)
+        # block0: listen_and_serv
+        self.assertEqual([op.type for op in pserver.blocks[0].ops],
+                         ["listen_and_serv"])
+        # block2: optimize pass
+        self.assertEqual([op.type for op in pserver.blocks[1].ops],
+                         ["sum", "scale", "sgd"])
+
+        # confirm startup program
+
+        self.assertEqual([op.type for op in startup.global_block().ops], [
+            "fill_constant", "fill_constant", "uniform_random", "uniform_random"
+        ])
+
+        # the variable #fc_w will be split into two blocks
+        fc_w_var = startup.global_block().var("fc_w.block1")
+        self.assertEqual(fc_w_var.shape, (500, 1000))
+
+    def get_expect_trainer_ops(self):
+        trainer = fluid.Program()
+
+        with fluid.program_guard(trainer):
+            optimize_ops, params_grads = self.net_conf()
+
+        delete_ops(trainer.global_block(), optimize_ops)
+        ops = [op.type for op in trainer.global_block().ops] + [
+            "split_byref", "send_vars", "send_barrier", "recv", "recv",
+            "fetch_barrier", "concat"
+        ]
+        ops.insert(ops.index("elementwise_add_grad") + 1, "send_vars")
+        return ops
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py
index 1f52bd90d0d49bda6c180019e90ebd923c91439c..96d47906a0606bba4b1d2207f7da85b058e42a2b 100644
--- a/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py
@@ -252,5 +252,25 @@ class TestFP16ElementwiseAddOp_rowwise_add_1(TestFP16ElementwiseAddOp):
         self.axis = 1
 
 
+class TestElementwiseAddOp_channelwise_add(TestElementwiseAddOp):
+    def init_input_output(self):
+        self.x = np.random.rand(3, 20, 20).astype(self.dtype)
+        self.y = np.random.rand(3, 1, 1).astype(self.dtype)
+        self.out = self.x + self.y
+
+    def init_axis(self):
+        self.axis = -1
+
+
+class TestFP16ElementwiseAddOp_channelwise_add(TestFP16ElementwiseAddOp):
+    def init_input_output(self):
+        self.x = np.random.rand(3, 10, 20).astype(self.dtype)
+        self.y = np.random.rand(3, 1, 1).astype(self.dtype)
+        self.out = self.x + self.y
+
+    def init_axis(self):
+        self.axis = -1
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py b/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..281068e945e76a42635868d19573498f79fde1f3
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py
@@ -0,0 +1,60 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+import math
+from op_test import OpTest
+
+
+def quantize_max_abs(x, num_bits):
+    range = math.pow(2, num_bits) - 1
+    scale = np.max(np.abs(x).flatten())
+    y = np.round(x / scale * range)
+    return y, scale
+
+
+def dequantize_max_abs(x, num_bits, scale):
+    range = math.pow(2, num_bits) - 1
+    y = (scale / range) * x
+    return y
+
+
+class TestFakeDequantizeMaxAbsOp(OpTest):
+    def set_args(self):
+        self.num_bits = 8
+
+    def setUp(self):
+        self.set_args()
+        self.op_type = "fake_dequantize_max_abs"
+        x = np.random.randn(31, 65).astype("float32")
+        yq, scale = quantize_max_abs(x, self.num_bits)
+        print 'scale ', scale
+        ydq = dequantize_max_abs(yq, self.num_bits, scale)
+
+        self.inputs = {'X': yq}
+        self.attrs = {'num_bits': self.num_bits, 'scale': float(scale)}
+        self.outputs = {'Out': ydq}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestFakeDequantizeMaxAbsOp5Bits(OpTest):
+    def set_args(self):
+        self.num_bits = 5
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fill_constant_batch_size_like_op.py b/python/paddle/fluid/tests/unittests/test_fill_constant_batch_size_like_op.py
index 66e3e2d51d118756d4881955b4df8eb4d2bbc094..533d8ccfac82a2e298af16181ab16bf7aa3db282 100644
--- a/python/paddle/fluid/tests/unittests/test_fill_constant_batch_size_like_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fill_constant_batch_size_like_op.py
@@ -50,5 +50,27 @@ class TestFillConstantBatchSizeLikeWhenSecondDimIsBatchSize(OpTest):
         self.check_output()
 
 
+class TestFillConstantBatchSizeLikeWithLoDTensor(OpTest):
+    def setUp(self):
+        self.op_type = "fill_constant_batch_size_like"
+        self.inputs = {
+            'Input': (np.random.random((31, 28)).astype("float32"),
+                      [[0, 9, 23, 31]])
+        }
+        self.attrs = {
+            'value': 3.5,
+            'shape': [-1, 16],
+            'input_dim_idx': 0,
+            'output_dim_idx': 0
+        }
+
+        out = np.random.random((3, 16)).astype("float32")
+        out.fill(3.5)
+        self.outputs = {'Out': out}
+
+    def test_check_output(self):
+        self.check_output()
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_gather_op.py b/python/paddle/fluid/tests/unittests/test_gather_op.py
index 6fd043c27e27db53c95be3630b6c08216e8e35f4..4ae90864806204197c52bbbdc5516f141afd4613 100644
--- a/python/paddle/fluid/tests/unittests/test_gather_op.py
+++ b/python/paddle/fluid/tests/unittests/test_gather_op.py
@@ -20,8 +20,9 @@ from op_test import OpTest
 class TestGatherOp(OpTest):
     def setUp(self):
         self.op_type = "gather"
-        xnp = np.random.random((10, 20)).astype("float32")
-        self.inputs = {'X': xnp, 'Index': np.array([1, 3, 5]).astype("int32")}
+        self.config()
+        xnp = np.random.random(self.x_shape).astype("float32")
+        self.inputs = {'X': xnp, 'Index': np.array(self.index).astype("int32")}
         self.outputs = {'Out': self.inputs["X"][self.inputs["Index"]]}
 
     def test_check_output(self):
@@ -30,6 +31,16 @@ class TestGatherOp(OpTest):
     def test_check_grad(self):
         self.check_grad(['X'], 'Out')
 
+    def config(self):
+        self.x_shape = (10, 20)
+        self.index = [1, 3, 5]
+
+
+class TestCase1(TestGatherOp):
+    def config(self):
+        self.x_shape = (10)
+        self.index = [1, 3, 5]
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_is_empty_op.py b/python/paddle/fluid/tests/unittests/test_is_empty_op.py
index 4d11cf226be2ba4ffbe015198fed3191f1e02f72..11121d9b65351eab639b7618fac0e54714cf4680 100644
--- a/python/paddle/fluid/tests/unittests/test_is_empty_op.py
+++ b/python/paddle/fluid/tests/unittests/test_is_empty_op.py
@@ -14,42 +14,24 @@
 
 import unittest
 import numpy as np
-from paddle.fluid.op import Operator
-import paddle.fluid.core as core
+from op_test import OpTest
 
 
-def create_tensor(scope, name, np_data):
-    tensor = scope.var(name).get_tensor()
-    tensor.set_dims(np_data.shape)
-    tensor.set(np_data, core.CPUPlace())
-    return tensor
-
-
-class TestIsEmptyOp(unittest.TestCase):
+class TestEmpty(OpTest):
     def setUp(self):
-        self.scope = core.Scope()
-        # create input variables
-        np_data0 = np.array([0, 1, 2])
-        create_tensor(self.scope, "X0", np_data0)
-
-        np_data1 = np.array([1])
-        t = create_tensor(self.scope, "X1", np_data1)
-        t.set_dims([0])
+        self.op_type = "is_empty"
+        self.inputs = {'X': np.array([1, 2, 3])}
+        self.outputs = {'Out': np.array([False])}
 
-        # create output variables
-        self.scope.var("out")
+    def test_check_output(self):
+        self.check_output()
 
-    def test_no_empty(self):
-        self.one_case("X0", False)
 
-    def test_empty(self):
-        self.one_case("X1", True)
-
-    def one_case(self, input, target):
-        op = Operator(type="is_empty", X=input, Out="out")
-        op.run(self.scope, core.CPUPlace())
-        out = self.scope.var("out").get_tensor()
-        self.assertEqual(np.array(out)[0], target)
+class TestNotEmpty(TestEmpty):
+    def setUp(self):
+        self.op_type = "is_empty"
+        self.inputs = {'X': np.array([])}
+        self.outputs = {'Out': np.array([True])}
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py
index c5414abf0fee6b686dccf7c97e9c6d5408ecf62a..621a450fa6a6a8f47e3f1c1de609614b2359c33b 100644
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
@@ -369,6 +369,24 @@ class TestBook(unittest.TestCase):
             self.assertIsNotNone(output)
         print(str(program))
 
+    def test_resize_bilinear(self):
+        program = Program()
+        with program_guard(program):
+            x = layers.data(name='x', shape=[3, 9, 6], dtype="float32")
+            output = layers.resize_bilinear(x, out_shape=[12, 12])
+            self.assertIsNotNone(output)
+            output = layers.resize_bilinear(x, scale=3)
+            self.assertIsNotNone(output)
+        print(str(program))
+
+    def test_polygon_box_transform(self):
+        program = Program()
+        with program_guard(program):
+            x = layers.data(name='x', shape=[8, 4, 4], dtype="float32")
+            output = layers.polygon_box_transform(input=x)
+            self.assertIsNotNone(output)
+        print(str(program))
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py b/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..1226027ddc9c0b9dce9cedc5d1d20c0708647b6f
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py
@@ -0,0 +1,106 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.fluid as fluid
+import os
+import signal
+import subprocess
+import time
+import unittest
+from multiprocessing import Process
+from op_test import OpTest
+
+
+def run_pserver(use_cuda, sync_mode, ip, port, trainers, trainer_id):
+    x = fluid.layers.data(name='x', shape=[1], dtype='float32')
+    y_predict = fluid.layers.fc(input=x, size=1, act=None)
+    y = fluid.layers.data(name='y', shape=[1], dtype='float32')
+
+    # loss function
+    cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+    avg_cost = fluid.layers.mean(cost)
+
+    # optimizer
+    sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
+    sgd_optimizer.minimize(avg_cost)
+
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    exe = fluid.Executor(place)
+
+    pserver_endpoints = ip + ":" + port
+    current_endpoint = ip + ":" + port
+    t = fluid.DistributeTranspiler()
+    t.transpile(
+        trainer_id,
+        pservers=pserver_endpoints,
+        trainers=trainers,
+        sync_mode=sync_mode)
+    pserver_prog = t.get_pserver_program(current_endpoint)
+    pserver_startup = t.get_startup_program(current_endpoint, pserver_prog)
+    exe.run(pserver_startup)
+    exe.run(pserver_prog)
+
+
+class TestListenAndServOp(OpTest):
+    def setUp(self):
+        self.ps_timeout = 5
+        self.ip = "127.0.0.1"
+        self.port = "6173"
+        self.trainers = 1
+        self.trainer_id = 1
+
+    def _start_pserver(self, use_cuda, sync_mode):
+        p = Process(
+            target=run_pserver,
+            args=(use_cuda, sync_mode, self.ip, self.port, self.trainers,
+                  self.trainer_id))
+        p.start()
+        return p.pid
+
+    def _wait_ps_ready(self, pid):
+        retry_times = self.ps_timeout
+        while True:
+            assert retry_times >= 0, "wait ps ready failed"
+            time.sleep(0.5)
+            try:
+                # the listen_and_serv_op would touch a file which contains the listen port
+                # on the /tmp directory until it was ready to process all the RPC call.
+                os.stat("/tmp/paddle.%d.port" % pid)
+                return
+            except os.error:
+                retry_times -= 1
+
+    def test_rpc_interfaces(self):
+        # TODO(Yancey1989): need to make sure the rpc interface correctly.
+        pass
+
+    def test_handle_signal_in_serv_op(self):
+        # run pserver on CPU in sync mode
+        pid = self._start_pserver(False, True)
+        self._wait_ps_ready(pid)
+
+        # raise SIGTERM to pserver
+        os.kill(pid, signal.SIGTERM)
+
+        # run pserver on CPU in async mode
+        pid = self._start_pserver(False, False)
+        self._wait_ps_ready(pid)
+
+        # raise SIGTERM to pserver
+        os.kill(pid, signal.SIGTERM)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_lstm_op.py b/python/paddle/fluid/tests/unittests/test_lstm_op.py
index f8ff5a3361af66612f08b2aa4eaffa363f04c594..e726f99d49877a1bc464090092ec80b97ab15d0c 100644
--- a/python/paddle/fluid/tests/unittests/test_lstm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_lstm_op.py
@@ -194,107 +194,104 @@ class TestLstmOp(OpTest):
             ['Input', 'Weight', 'Bias'], ['Hidden'], max_relative_error=5e-4)
 
 
-class TestLstmOpHasInitial(TestLstmOp):
-    def set_argument(self):
-        self.lod = [[0, 2, 5, 7]]
-        self.D = 16
-
-        self.act_gate = 'sigmoid'
-        self.act_cell = 'tanh'
-        self.act_cand = 'tanh'
-
-        self.has_initial_state = True
-        self.is_reverse = True
-        self.use_peepholes = True
-
-    def test_check_grad(self):
-        # TODO(qingqing) remove folowing lines after the check_grad is refined.
-        N = len(self.lod[0]) - 1
-        self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
-        self.outputs['BatchCellPreAct'] = np.zeros(
-            (N, self.D)).astype('float64')
-        self.check_grad(
-            ['Input', 'Weight', 'Bias', 'H0', 'C0'], ['Hidden'],
-            max_relative_error=5e-4)
-
-    def test_check_grad_ingore_bias(self):
-        N = len(self.lod[0]) - 1
-        self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
-        self.outputs['BatchCellPreAct'] = np.zeros(
-            (N, self.D)).astype('float64')
-        self.check_grad(
-            ['Input', 'Weight'], ['Hidden'],
-            max_relative_error=5e-4,
-            no_grad_set=set('Bias'))
-
-    def test_check_grad_ingore_weight(self):
-        N = len(self.lod[0]) - 1
-        self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
-        self.outputs['BatchCellPreAct'] = np.zeros(
-            (N, self.D)).astype('float64')
-        self.check_grad(
-            ['Input', 'Bias'], ['Hidden'],
-            max_relative_error=5e-4,
-            no_grad_set=set('Weight'))
-
-    def test_check_grad_ingore_input(self):
-        N = len(self.lod[0]) - 1
-        self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
-        self.outputs['BatchCellPreAct'] = np.zeros(
-            (N, self.D)).astype('float64')
-        self.check_grad(
-            ['Weight', 'Bias'], ['Hidden'],
-            max_relative_error=5e-4,
-            no_grad_set=set('Input'))
-
-    def test_check_grad_ingore_h0(self):
-        N = len(self.lod[0]) - 1
-        self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
-        self.outputs['BatchCellPreAct'] = np.zeros(
-            (N, self.D)).astype('float64')
-        self.check_grad(
-            ['Input', 'Weight', 'Bias', 'C0'], ['Hidden'],
-            max_relative_error=5e-4,
-            no_grad_set=set('H0'))
-
-    def test_check_grad_ingore_c0(self):
-        N = len(self.lod[0]) - 1
-        self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
-        self.outputs['BatchCellPreAct'] = np.zeros(
-            (N, self.D)).astype('float64')
-        self.check_grad(
-            ['Input', 'Weight', 'Bias', 'H0'], ['Hidden'],
-            max_relative_error=5e-4,
-            no_grad_set=set('C0'))
-
-
-class TestLstmOpRerverse(TestLstmOp):
-    def set_argument(self):
-        self.lod = [[0, 2, 5, 7]]
-        self.D = 16
-
-        self.act_gate = 'sigmoid'
-        self.act_cell = 'tanh'
-        self.act_cand = 'tanh'
-
-        self.has_initial_state = False
-        self.is_reverse = True
-        self.use_peepholes = True
-
-
-class TestLstmOpNotUsePeepholes(TestLstmOp):
-    def set_argument(self):
-        self.lod = [[0, 2, 5, 7]]
-        self.D = 16
-
-        self.act_gate = 'sigmoid'
-        self.act_cell = 'tanh'
-        self.act_cand = 'tanh'
-
-        self.has_initial_state = False
-        self.is_reverse = True
-        self.use_peepholes = False
-
+# class TestLstmOpHasInitial(TestLstmOp):
+#     def set_argument(self):
+#         self.lod = [[0, 2, 5, 7]]
+#         self.D = 16
+
+#         self.act_gate = 'sigmoid'
+#         self.act_cell = 'tanh'
+#         self.act_cand = 'tanh'
+
+#         self.has_initial_state = True
+#         self.is_reverse = True
+#         self.use_peepholes = True
+
+#     def test_check_grad(self):
+#         # TODO(qingqing) remove folowing lines after the check_grad is refined.
+#         N = len(self.lod[0]) - 1
+#         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
+#         self.outputs['BatchCellPreAct'] = np.zeros(
+#             (N, self.D)).astype('float64')
+#         self.check_grad(
+#             ['Input', 'Weight', 'Bias', 'H0', 'C0'], ['Hidden'],
+#             max_relative_error=5e-4)
+
+#     def test_check_grad_ingore_bias(self):
+#         N = len(self.lod[0]) - 1
+#         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
+#         self.outputs['BatchCellPreAct'] = np.zeros(
+#             (N, self.D)).astype('float64')
+#         self.check_grad(
+#             ['Input', 'Weight'], ['Hidden'],
+#             max_relative_error=5e-4,
+#             no_grad_set=set('Bias'))
+
+#     def test_check_grad_ingore_weight(self):
+#         N = len(self.lod[0]) - 1
+#         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
+#         self.outputs['BatchCellPreAct'] = np.zeros(
+#             (N, self.D)).astype('float64')
+#         self.check_grad(
+#             ['Input', 'Bias'], ['Hidden'],
+#             max_relative_error=5e-4,
+#             no_grad_set=set('Weight'))
+
+#     def test_check_grad_ingore_input(self):
+#         N = len(self.lod[0]) - 1
+#         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
+#         self.outputs['BatchCellPreAct'] = np.zeros(
+#             (N, self.D)).astype('float64')
+#         self.check_grad(
+#             ['Weight', 'Bias'], ['Hidden'],
+#             max_relative_error=5e-4,
+#             no_grad_set=set('Input'))
+
+#     def test_check_grad_ingore_h0(self):
+#         N = len(self.lod[0]) - 1
+#         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
+#         self.outputs['BatchCellPreAct'] = np.zeros(
+#             (N, self.D)).astype('float64')
+#         self.check_grad(
+#             ['Input', 'Weight', 'Bias', 'C0'], ['Hidden'],
+#             max_relative_error=5e-4,
+#             no_grad_set=set('H0'))
+
+#     def test_check_grad_ingore_c0(self):
+#         N = len(self.lod[0]) - 1
+#         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
+#         self.outputs['BatchCellPreAct'] = np.zeros(
+#             (N, self.D)).astype('float64')
+#         self.check_grad(
+#             ['Input', 'Weight', 'Bias', 'H0'], ['Hidden'],
+#             max_relative_error=5e-4,
+#             no_grad_set=set('C0'))
+
+# class TestLstmOpRerverse(TestLstmOp):
+#     def set_argument(self):
+#         self.lod = [[0, 2, 5, 7]]
+#         self.D = 16
+
+#         self.act_gate = 'sigmoid'
+#         self.act_cell = 'tanh'
+#         self.act_cand = 'tanh'
+
+#         self.has_initial_state = False
+#         self.is_reverse = True
+#         self.use_peepholes = True
+
+# class TestLstmOpNotUsePeepholes(TestLstmOp):
+#     def set_argument(self):
+#         self.lod = [[0, 2, 5, 7]]
+#         self.D = 16
+
+#         self.act_gate = 'sigmoid'
+#         self.act_cell = 'tanh'
+#         self.act_cand = 'tanh'
+
+#         self.has_initial_state = False
+#         self.is_reverse = True
+#         self.use_peepholes = False
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_matmul_op.py b/python/paddle/fluid/tests/unittests/test_matmul_op.py
index 44ac4683891ffd3141a126740f4fddb47550e183..cae2c8fa87d9857de8f26cf4962d9370eca66243 100644
--- a/python/paddle/fluid/tests/unittests/test_matmul_op.py
+++ b/python/paddle/fluid/tests/unittests/test_matmul_op.py
@@ -111,21 +111,24 @@ class Generator(object):
 
 
 # Generate test cases for all possibilities
-for dim_X in [1, 2, 3]:
-    for dim_Y in [1, 2, 3]:
-        for transpose_X in [False, True]:
-            for transpose_Y in [False, True]:
-                test_name = (
-                    'TestMatMulOp_dimX_{}_dim_Y_{}_transX_{}_transY_{}'.format(
-                        dim_X, dim_Y, transpose_X, transpose_Y))
-                shape_X, shape_Y = generate_compatible_shapes(
-                    dim_X, dim_Y, transpose_X, transpose_Y)
-                globals()[test_name] = type(test_name, (Generator, OpTest), {
-                    'shape_X': shape_X,
-                    'shape_Y': shape_Y,
-                    'transpose_X': transpose_X,
-                    'transpose_Y': transpose_Y,
-                })
+def inject_test(dim_x, dim_y, trans_x, trans_y):
+    test_name = ('TestMatMulOp_dimX_{}_dim_Y_{}_transX_{}_transY_{}'.format(
+        dim_x, dim_y, trans_x, trans_y))
+    shape_x, shape_y = generate_compatible_shapes(dim_x, dim_y, trans_x,
+                                                  trans_y)
+    globals()[test_name] = type(test_name, (Generator, OpTest), {
+        'shape_X': shape_x,
+        'shape_Y': shape_y,
+        'transpose_X': trans_x,
+        'transpose_Y': trans_y,
+    })
+
+
+for dim_X in (1, 2, 3):
+    for dim_Y in (1, 2, 3):
+        for transose_x in (False, True):
+            for transose_y in (False, True):
+                inject_test(dim_X, dim_Y, transose_x, transose_y)
 
 
 # Test case n-dim
@@ -149,7 +152,7 @@ def generate_compatible_shapes(dim, transpose_X, transpose_Y):
     return shape_X, shape_Y
 
 
-# Test case n-dim
+# # Test case n-dim
 for dim in [4]:
     for transpose_X in [False, True]:
         for transpose_Y in [False, True]:
diff --git a/python/paddle/fluid/tests/unittests/test_memory_optimization_transpiler.py b/python/paddle/fluid/tests/unittests/test_memory_optimization_transpiler.py
index f3dcca6b0107a9c4a6efcb0c0fd50324aaf92648..cfd6e63e12258a92447e68b4afbc7ead91b68cc1 100644
--- a/python/paddle/fluid/tests/unittests/test_memory_optimization_transpiler.py
+++ b/python/paddle/fluid/tests/unittests/test_memory_optimization_transpiler.py
@@ -18,7 +18,7 @@ import unittest
 import paddle.fluid.layers as layers
 import paddle.fluid.optimizer as optimizer
 from paddle.fluid.framework import Program, program_guard
-from paddle.fluid.memory_optimization_transpiler import memory_optimize
+from paddle.fluid.transpiler import memory_optimize
 
 
 class TestControlFlowGraph(unittest.TestCase):
diff --git a/python/paddle/fluid/tests/unittests/test_mul_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_mul_mkldnn_op.py
deleted file mode 100644
index 42d68ef376dc4a664a96ff5a24545c1997ee924a..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_mul_mkldnn_op.py
+++ /dev/null
@@ -1,44 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-from test_mul_op import TestMulOp, TestMulOp2, TestFP16MulOp1, TestFP16MulOp2
-
-
-class TestMKLDNNMulOp(TestMulOp):
-    def init_op_test(self):
-        super(TestMKLDNNMulOp, self).setUp()
-        self.attrs = {"use_mkldnn": True}
-
-
-class TestMKLDNNMulOp2(TestMulOp2):
-    def init_op_test(self):
-        super(TestMKLDNNMulOp2, self).setUp()
-        self.attrs = {"use_mkldnn": True}
-
-
-class TestMKLDNNFP16MulOp1(TestFP16MulOp1):
-    def init_op_test(self):
-        super(TestMKLDNNFP16MulOp1, self).setUp()
-        self.attrs = {"use_mkldnn": True}
-
-
-class TestMKLDNNFP16MulOp2(TestFP16MulOp2):
-    def init_op_test(self):
-        super(TestMKLDNNFP16MulOp2, self).setUp()
-        self.attrs = {"use_mkldnn": True}
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_mul_op.py b/python/paddle/fluid/tests/unittests/test_mul_op.py
index d984393c89f44f5b9679a22bf7bb6182599233e3..bbc782c1bce302df68ab30013f3a7667e51ed479 100644
--- a/python/paddle/fluid/tests/unittests/test_mul_op.py
+++ b/python/paddle/fluid/tests/unittests/test_mul_op.py
@@ -21,12 +21,10 @@ from op_test import OpTest
 class TestMulOp(OpTest):
     def setUp(self):
         self.op_type = "mul"
-        self.use_mkldnn = False
         self.inputs = {
-            'X': np.random.random((32, 84)).astype("float32"),
-            'Y': np.random.random((84, 100)).astype("float32")
+            'X': np.random.random((2, 5)).astype("float32"),
+            'Y': np.random.random((5, 3)).astype("float32")
         }
-        self.attrs = {'use_mkldnn': self.use_mkldnn}
         self.outputs = {'Out': np.dot(self.inputs['X'], self.inputs['Y'])}
 
     def test_check_output(self):
@@ -47,19 +45,17 @@ class TestMulOp(OpTest):
 class TestMulOp2(OpTest):
     def setUp(self):
         self.op_type = "mul"
-        self.use_mkldnn = False
         self.inputs = {
-            'X': np.random.random((15, 4, 12, 10)).astype("float32"),
-            'Y': np.random.random((4, 30, 8, 2, 9)).astype("float32")
+            'X': np.random.random((3, 4, 4, 3)).astype("float32"),
+            'Y': np.random.random((2, 6, 1, 2, 3)).astype("float32")
         }
         self.attrs = {
             'x_num_col_dims': 2,
             'y_num_col_dims': 2,
-            'use_mkldnn': self.use_mkldnn
         }
-        result = np.dot(self.inputs['X'].reshape(15 * 4, 12 * 10),
-                        self.inputs['Y'].reshape(4 * 30, 8 * 2 * 9))
-        result = result.reshape(15, 4, 8, 2, 9)
+        result = np.dot(self.inputs['X'].reshape(3 * 4, 4 * 3),
+                        self.inputs['Y'].reshape(2 * 6, 1 * 2 * 3))
+        result = result.reshape(3, 4, 1, 2, 3)
         self.outputs = {'Out': result}
 
     def test_check_output(self):
@@ -80,11 +76,9 @@ class TestMulOp2(OpTest):
 class TestFP16MulOp1(OpTest):
     def setUp(self):
         self.op_type = "mul"
-        self.use_mkldnn = False
-        x = np.random.random((32, 84)).astype("float16")
-        y = np.random.random((84, 100)).astype("float16")
-        self.inputs = {'X': x.view(np.uint16), 'Y': y.view(np.uint16)}
-        self.attrs = {'use_mkldnn': self.use_mkldnn}
+        x = np.random.random((3, 5)).astype("float16")
+        y = np.random.random((5, 4)).astype("float16")
+        self.inputs = {'X': x.view(np.float16), 'Y': y.view(np.float16)}
         self.outputs = {'Out': np.dot(x, y)}
 
     def test_check_output(self):
@@ -97,18 +91,15 @@ class TestFP16MulOp1(OpTest):
 class TestFP16MulOp2(OpTest):
     def setUp(self):
         self.op_type = "mul"
-        self.use_mkldnn = False
-        x = np.random.random((15, 4, 12, 10)).astype("float16")
-        y = np.random.random((4, 30, 8, 2, 9)).astype("float16")
-        self.inputs = {'X': x.view(np.uint16), 'Y': y.view(np.uint16)}
+        x = np.random.random((3, 4, 4, 3)).astype("float16")
+        y = np.random.random((2, 6, 1, 2, 3)).astype("float16")
+        self.inputs = {'X': x.view(np.float16), 'Y': y.view(np.float16)}
         self.attrs = {
             'x_num_col_dims': 2,
             'y_num_col_dims': 2,
-            'use_mkldnn': self.use_mkldnn
         }
-        result = np.dot(
-            x.reshape(15 * 4, 12 * 10), y.reshape(4 * 30, 8 * 2 * 9))
-        result = result.reshape(15, 4, 8, 2, 9)
+        result = np.dot(x.reshape(3 * 4, 4 * 3), y.reshape(2 * 6, 1 * 2 * 3))
+        result = result.reshape(3, 4, 1, 2, 3)
         self.outputs = {'Out': result}
 
     def test_check_output(self):
diff --git a/python/paddle/fluid/tests/unittests/test_network_with_dtype.py b/python/paddle/fluid/tests/unittests/test_network_with_dtype.py
new file mode 100644
index 0000000000000000000000000000000000000000..d4835dd18405fc7a0d508a780a734922e0abd12c
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_network_with_dtype.py
@@ -0,0 +1,74 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from paddle.fluid.executor import Executor
+
+BATCH_SIZE = 20
+
+
+class TestNetWithDtype(unittest.TestCase):
+    def setUp(self):
+        self.dtype = "float64"
+        self.init_dtype()
+
+    def run_net_on_place(self, place):
+        main = fluid.Program()
+        startup = fluid.Program()
+        with fluid.program_guard(main, startup):
+            x = fluid.layers.data(name='x', shape=[13], dtype=self.dtype)
+            y = fluid.layers.data(name='y', shape=[1], dtype=self.dtype)
+            y_predict = fluid.layers.fc(input=x, size=1, act=None)
+            cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+            avg_cost = fluid.layers.mean(cost)
+            sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
+            sgd_optimizer.minimize(avg_cost)
+
+        fetch_list = [avg_cost]
+        train_reader = paddle.batch(
+            paddle.dataset.uci_housing.train(), batch_size=BATCH_SIZE)
+        feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
+        exe = fluid.Executor(place)
+        exe.run(startup)
+        for data in train_reader():
+            exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list)
+            # the main program is runable, the datatype is fully supported
+            break
+
+    def init_dtype(self):
+        pass
+
+    def test_cpu(self):
+        place = fluid.CPUPlace()
+        self.run_net_on_place(place)
+
+    def test_gpu(self):
+        if not core.is_compiled_with_cuda():
+            return
+        place = fluid.CUDAPlace(0)
+        self.run_net_on_place(place)
+
+
+# TODO(dzhwinter): make sure the fp16 is runable
+# class TestFloat16(TestNetWithDtype):
+#     def init_dtype(self):
+#         self.dtype = "float16"
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_operator_desc.py b/python/paddle/fluid/tests/unittests/test_operator_desc.py
index 779ae388f04496a7be9a6d5aa4e39b8245022925..c098a5a0cb0364f9ec93c95c1ef50912e574b3d9 100644
--- a/python/paddle/fluid/tests/unittests/test_operator_desc.py
+++ b/python/paddle/fluid/tests/unittests/test_operator_desc.py
@@ -63,7 +63,7 @@ class TestOperator(unittest.TestCase):
         self.assertEqual(mul_op.output("Out"), ["mul.out"])
         self.assertEqual(
             set(mul_op.attr_names),
-            set(["x_num_col_dims", "y_num_col_dims", "use_mkldnn"]))
+            set(["x_num_col_dims", "y_num_col_dims", "op_role", "op_role_var"]))
         self.assertEqual(mul_op.has_attr("x_num_col_dims"), True)
         self.assertEqual(mul_op.attr_type("x_num_col_dims"), core.AttrType.INT)
         self.assertEqual(mul_op.attr("x_num_col_dims"), 1)
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor.py b/python/paddle/fluid/tests/unittests/test_parallel_executor.py
deleted file mode 100644
index 9056f5e66fceb42397c9a923d802320dd772725b..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor.py
+++ /dev/null
@@ -1,723 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import numpy
-import unittest
-
-import paddle.fluid as fluid
-import paddle
-import paddle.dataset.mnist as mnist
-import paddle.dataset.wmt16 as wmt16
-
-
-def simple_fc_net(use_feed):
-    if use_feed:
-        img = fluid.layers.data(name='image', shape=[784], dtype='float32')
-        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-    else:
-        reader = fluid.layers.open_files(
-            filenames=['./mnist.recordio'],
-            shapes=[[-1, 784], [-1, 1]],
-            lod_levels=[0, 0],
-            dtypes=['float32', 'int64'],
-            thread_num=1,
-            for_parallel=True)
-        reader = fluid.layers.io.double_buffer(reader)
-        img, label = fluid.layers.read_file(reader)
-    hidden = img
-    for _ in xrange(4):
-        hidden = fluid.layers.fc(
-            hidden,
-            size=200,
-            act='tanh',
-            bias_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(value=1.0)))
-    prediction = fluid.layers.fc(hidden, size=10, act='softmax')
-    loss = fluid.layers.cross_entropy(input=prediction, label=label)
-    loss = fluid.layers.mean(loss)
-    return loss
-
-
-def fc_with_batchnorm(use_feed):
-    if use_feed:
-        img = fluid.layers.data(name='image', shape=[784], dtype='float32')
-        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-    else:
-        reader = fluid.layers.open_files(
-            filenames=['mnist.recordio'],
-            shapes=[[-1, 784], [-1, 1]],
-            lod_levels=[0, 0],
-            dtypes=['float32', 'int64'],
-            thread_num=1,
-            for_parallel=True)
-        reader = fluid.layers.io.double_buffer(reader)
-        img, label = fluid.layers.read_file(reader)
-
-    hidden = img
-    for _ in xrange(1):
-        hidden = fluid.layers.fc(
-            hidden,
-            size=200,
-            act='tanh',
-            bias_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(value=1.0)))
-
-        hidden = fluid.layers.batch_norm(input=hidden)
-
-    prediction = fluid.layers.fc(hidden, size=10, act='softmax')
-    loss = fluid.layers.cross_entropy(input=prediction, label=label)
-    loss = fluid.layers.mean(loss)
-    return loss
-
-
-def squeeze_excitation(input, num_channels, reduction_ratio):
-    # pool = fluid.layers.pool2d(
-    #    input=input, pool_size=0, pool_type='avg', global_pooling=True)
-    conv = input
-    shape = conv.shape
-    reshape = fluid.layers.reshape(
-        x=conv, shape=[-1, shape[1], shape[2] * shape[3]])
-    pool = fluid.layers.reduce_mean(input=reshape, dim=2)
-
-    squeeze = fluid.layers.fc(input=pool,
-                              size=num_channels / reduction_ratio,
-                              act='relu')
-    excitation = fluid.layers.fc(input=squeeze,
-                                 size=num_channels,
-                                 act='sigmoid')
-    scale = fluid.layers.elementwise_mul(x=input, y=excitation, axis=0)
-    return scale
-
-
-def conv_bn_layer(input, num_filters, filter_size, stride=1, groups=1,
-                  act=None):
-    conv = fluid.layers.conv2d(
-        input=input,
-        num_filters=num_filters,
-        filter_size=filter_size,
-        stride=stride,
-        padding=(filter_size - 1) / 2,
-        groups=groups,
-        act=None,
-        bias_attr=False)
-    return fluid.layers.batch_norm(input=conv, act=act, momentum=0.1)
-
-
-def shortcut(input, ch_out, stride):
-    ch_in = input.shape[1]
-    if ch_in != ch_out:
-        if stride == 1:
-            filter_size = 1
-        else:
-            filter_size = 3
-        return conv_bn_layer(input, ch_out, filter_size, stride)
-    else:
-        return input
-
-
-def bottleneck_block(input, num_filters, stride, cardinality, reduction_ratio):
-    # The number of first 1x1 convolutional channels for each bottleneck build block
-    # was halved to reduce the compution cost.
-    conv0 = conv_bn_layer(
-        input=input, num_filters=num_filters, filter_size=1, act='relu')
-    conv1 = conv_bn_layer(
-        input=conv0,
-        num_filters=num_filters * 2,
-        filter_size=3,
-        stride=stride,
-        groups=cardinality,
-        act='relu')
-    conv2 = conv_bn_layer(
-        input=conv1, num_filters=num_filters * 2, filter_size=1, act=None)
-    scale = squeeze_excitation(
-        input=conv2,
-        num_channels=num_filters * 2,
-        reduction_ratio=reduction_ratio)
-
-    short = shortcut(input, num_filters * 2, stride)
-
-    return fluid.layers.elementwise_add(x=short, y=scale, act='relu')
-
-
-def SE_ResNeXt50Small(batch_size=2, use_feed=False):
-    assert not use_feed, "SE_ResNeXt doesn't support feed yet"
-
-    img = fluid.layers.fill_constant(
-        shape=[batch_size, 3, 224, 224], dtype='float32', value=0.0)
-    label = fluid.layers.fill_constant(
-        shape=[batch_size, 1], dtype='int64', value=0.0)
-
-    conv = conv_bn_layer(
-        input=img, num_filters=16, filter_size=3, stride=2, act='relu')
-    conv = conv_bn_layer(
-        input=conv, num_filters=16, filter_size=3, stride=1, act='relu')
-    conv = conv_bn_layer(
-        input=conv, num_filters=16, filter_size=3, stride=1, act='relu')
-    conv = fluid.layers.pool2d(
-        input=conv, pool_size=3, pool_stride=2, pool_padding=1, pool_type='max')
-
-    cardinality = 32
-    reduction_ratio = 16
-    depth = [3, 4, 6, 3]
-    num_filters = [128, 256, 512, 1024]
-
-    for block in range(len(depth)):
-        for i in range(depth[block]):
-            conv = bottleneck_block(
-                input=conv,
-                num_filters=num_filters[block],
-                stride=2 if i == 0 and block != 0 else 1,
-                cardinality=cardinality,
-                reduction_ratio=reduction_ratio)
-
-    shape = conv.shape
-    reshape = fluid.layers.reshape(
-        x=conv, shape=[-1, shape[1], shape[2] * shape[3]])
-    pool = fluid.layers.reduce_mean(input=reshape, dim=2)
-    dropout = fluid.layers.dropout(x=pool, dropout_prob=0.2)
-    # Classifier layer:
-    prediction = fluid.layers.fc(input=dropout, size=1000, act='softmax')
-    loss = fluid.layers.cross_entropy(input=prediction, label=label)
-    loss = fluid.layers.mean(loss)
-    return loss
-
-
-import time
-
-
-class TestParallelExecutorBase(unittest.TestCase):
-    def check_network_convergence(self,
-                                  method,
-                                  memory_opt=True,
-                                  iter=50,
-                                  batch_size=None,
-                                  allow_op_delay=False,
-                                  feed_dict=None,
-                                  seed=None,
-                                  use_parallel_executor=True):
-        def run_executor(exe, feed, fetch_list, program=None):
-            if isinstance(exe, fluid.ParallelExecutor):
-                res = exe.run(fetch_list=fetch_list, feed=feed)
-            elif isinstance(exe, fluid.Executor):
-                if program is None:
-                    program = fluid.default_main_program()
-                res = exe.run(program=program, feed=feed, fetch_list=fetch_list)
-            else:
-                raise ValueError('Unkown type exe')
-            return res
-
-        main = fluid.Program()
-        startup = fluid.Program()
-        startup.random_seed = 1  # Fix random seed
-        with fluid.program_guard(main, startup):
-            if seed is not None:
-                startup.random_seed = seed
-            loss = method(use_feed=feed_dict is not None)
-            adam = fluid.optimizer.Adam()
-            adam.minimize(loss)
-            if memory_opt:
-                fluid.memory_optimize(main)
-            place = fluid.CUDAPlace(0)
-            startup_exe = fluid.Executor(place)
-            startup_exe.run(startup)
-
-            if use_parallel_executor:
-                exe = fluid.ParallelExecutor(
-                    True, loss_name=loss.name, allow_op_delay=allow_op_delay)
-            else:
-                exe = fluid.Executor(place=place)
-
-            if batch_size is not None:
-                batch_size *= fluid.core.get_cuda_device_count()
-            begin = time.time()
-            first_loss, = run_executor(
-                exe=exe, feed=feed_dict, fetch_list=[loss.name])
-            first_loss = numpy.array(first_loss)
-
-            for i in xrange(iter):
-                run_executor(exe=exe, feed=feed_dict, fetch_list=[])
-
-            last_loss, = run_executor(
-                exe=exe, feed=feed_dict, fetch_list=[loss.name])
-            end = time.time()
-
-            if batch_size is not None:
-                print "%.4f Instance per second" % (
-                    (batch_size * iter + 2) / (end - begin))
-
-            last_loss = numpy.array(last_loss)
-
-            print first_loss, last_loss
-            # self.assertGreater(first_loss[0], last_loss[0])
-            return first_loss, last_loss
-
-
-class TestMNIST(TestParallelExecutorBase):
-    @classmethod
-    def setUpClass(cls):
-        # Convert mnist to recordio file
-        with fluid.program_guard(fluid.Program(), fluid.Program()):
-            reader = paddle.batch(mnist.train(), batch_size=4)
-            feeder = fluid.DataFeeder(
-                feed_list=[  # order is image and label
-                    fluid.layers.data(
-                        name='image', shape=[784]),
-                    fluid.layers.data(
-                        name='label', shape=[1], dtype='int64'),
-                ],
-                place=fluid.CPUPlace())
-            fluid.recordio_writer.convert_reader_to_recordio_file(
-                './mnist.recordio', reader, feeder)
-
-    def check_simple_fc_convergence(self):
-        self.check_network_convergence(simple_fc_net)
-        self.check_network_convergence(simple_fc_net, allow_op_delay=True)
-
-        img = numpy.zeros(shape=[32, 784], dtype='float32')
-        label = numpy.ones(shape=[32, 1], dtype='int64')
-        self.check_network_convergence(
-            simple_fc_net, feed_dict={"image": img,
-                                      "label": label})
-
-    def test_simple_fc(self):
-        self.check_simple_fc_convergence()
-
-    def check_simple_fc_parallel_accuracy(self):
-        img = numpy.zeros(shape=[32, 784], dtype='float32')
-        label = numpy.ones(shape=[32, 1], dtype='int64')
-        single_first_loss, single_last_loss = self.check_network_convergence(
-            method=simple_fc_net,
-            seed=1000,
-            feed_dict={"image": img,
-                       "label": label},
-            use_parallel_executor=False)
-        parallel_first_loss, parallel_last_loss = self.check_network_convergence(
-            method=simple_fc_net,
-            seed=1000,
-            feed_dict={"image": img,
-                       "label": label},
-            use_parallel_executor=True)
-
-        for p_f in parallel_first_loss:
-            self.assertAlmostEquals(p_f, single_first_loss[0], delta=1e-6)
-        for p_l in parallel_last_loss:
-            self.assertAlmostEquals(p_l, single_last_loss[0], delta=1e-6)
-
-    def test_simple_fc_parallel_accuracy(self):
-        self.check_simple_fc_parallel_accuracy()
-
-    def check_batchnorm_fc_convergence(self):
-        self.check_network_convergence(fc_with_batchnorm)
-        img = numpy.zeros(shape=[32, 784], dtype='float32')
-        label = numpy.ones(shape=[32, 1], dtype='int64')
-        self.check_network_convergence(
-            fc_with_batchnorm, feed_dict={"image": img,
-                                          "label": label})
-
-    def test_batchnorm_fc(self):
-        self.check_batchnorm_fc_convergence()
-
-
-class TestResnet(TestParallelExecutorBase):
-    # @classmethod
-    # def setUpClass(cls):
-    #     # import os
-    #     # if os.path.exists('./flowers.recordio'):
-    #     #     return
-    #     with fluid.program_guard(fluid.Program(), fluid.Program()):
-    #         reader = paddle.batch(flowers.train(), batch_size=4)
-    #         feeder = fluid.DataFeeder(
-    #             feed_list=[
-    #                 fluid.layers.data(
-    #                     name='image', shape=[3, 224, 224]),
-    #                 fluid.layers.data(
-    #                     name='label', shape=[1], dtype='int64'),
-    #             ],
-    #             place=fluid.CPUPlace())
-    #         fluid.recordio_writer.convert_reader_to_recordio_file(
-    #             "./flowers.recordio", reader, feeder, compressor=fluid.core.RecordIOWriter.Compressor.NoCompress)
-
-    def check_resnet_convergence(self):
-        import functools
-        batch_size = 2
-        self.check_network_convergence(
-            functools.partial(
-                SE_ResNeXt50Small, batch_size=batch_size),
-            iter=20,
-            batch_size=batch_size)
-
-    def test_resnet(self):
-        self.check_resnet_convergence()
-
-
-class ModelHyperParams(object):
-    # Dictionary size for source and target language. This model directly uses
-    # paddle.dataset.wmt16 in which <bos>, <eos> and <unk> token has
-    # alreay been added, but the <pad> token is not added. Transformer requires
-    # sequences in a mini-batch are padded to have the same length. A <pad> token is
-    # added into the original dictionary in paddle.dateset.wmt16.
-
-    # size of source word dictionary.
-    src_vocab_size = 10000
-    # index for <pad> token in source language.
-    src_pad_idx = src_vocab_size
-
-    # size of target word dictionay
-    trg_vocab_size = 10000
-    # index for <pad> token in target language.
-    trg_pad_idx = trg_vocab_size
-
-    # position value corresponding to the <pad> token.
-    pos_pad_idx = 0
-
-    # max length of sequences. It should plus 1 to include position
-    # padding token for position encoding.
-    max_length = 50
-
-    # the dimension for word embeddings, which is also the last dimension of
-    # the input and output of multi-head attention, position-wise feed-forward
-    # networks, encoder and decoder.
-
-    d_model = 512
-    # size of the hidden layer in position-wise feed-forward networks.
-    d_inner_hid = 1024
-    # the dimension that keys are projected to for dot-product attention.
-    d_key = 64
-    # the dimension that values are projected to for dot-product attention.
-    d_value = 64
-    # number of head used in multi-head attention.
-    n_head = 8
-    # number of sub-layers to be stacked in the encoder and decoder.
-    n_layer = 6
-    # dropout rate used by all dropout layers.
-    dropout = 0.1
-
-
-import numpy as np
-
-
-def prepare_batch_input(insts, src_pad_idx, trg_pad_idx, n_head):
-    """
-    Pad the instances to the max sequence length in batch, and generate the
-    corresponding position data and attention bias. Then, convert the numpy
-    data to tensors and return a dict mapping names to tensors.
-    """
-
-    def __pad_batch_data(insts,
-                         pad_idx,
-                         is_target=False,
-                         return_pos=True,
-                         return_attn_bias=True,
-                         return_max_len=True):
-        """
-        Pad the instances to the max sequence length in batch, and generate the
-        corresponding position data and attention bias.
-        """
-        return_list = []
-        max_len = max(len(inst) for inst in insts)
-        inst_data = np.array(
-            [inst + [pad_idx] * (max_len - len(inst)) for inst in insts])
-        return_list += [inst_data.astype("int64").reshape([-1, 1])]
-        if return_pos:
-            inst_pos = np.array([[
-                pos_i + 1 if w_i != pad_idx else 0
-                for pos_i, w_i in enumerate(inst)
-            ] for inst in inst_data])
-
-            return_list += [inst_pos.astype("int64").reshape([-1, 1])]
-        if return_attn_bias:
-            if is_target:
-                # This is used to avoid attention on paddings and subsequent
-                # words.
-                slf_attn_bias_data = np.ones((inst_data.shape[0], max_len,
-                                              max_len))
-                slf_attn_bias_data = np.triu(slf_attn_bias_data, 1).reshape(
-                    [-1, 1, max_len, max_len])
-                slf_attn_bias_data = np.tile(slf_attn_bias_data,
-                                             [1, n_head, 1, 1]) * [-1e9]
-            else:
-                # This is used to avoid attention on paddings.
-                slf_attn_bias_data = np.array([[0] * len(inst) + [-1e9] *
-                                               (max_len - len(inst))
-                                               for inst in insts])
-                slf_attn_bias_data = np.tile(
-                    slf_attn_bias_data.reshape([-1, 1, 1, max_len]),
-                    [1, n_head, max_len, 1])
-            return_list += [slf_attn_bias_data.astype("float32")]
-        if return_max_len:
-            return_list += [max_len]
-        return return_list if len(return_list) > 1 else return_list[0]
-
-    def data_to_tensor(data_list, name_list, input_dict, place):
-        assert len(data_list) == len(name_list)
-        for i in range(len(name_list)):
-            tensor = fluid.LoDTensor()
-            tensor.set(data_list[i], place)
-            input_dict[name_list[i]] = tensor
-
-    src_word, src_pos, src_slf_attn_bias, src_max_len = __pad_batch_data(
-        [inst[0] for inst in insts], src_pad_idx, is_target=False)
-    trg_word, trg_pos, trg_slf_attn_bias, trg_max_len = __pad_batch_data(
-        [inst[1] for inst in insts], trg_pad_idx, is_target=True)
-    trg_src_attn_bias = np.tile(src_slf_attn_bias[:, :, ::src_max_len, :],
-                                [1, 1, trg_max_len, 1]).astype("float32")
-    lbl_word = __pad_batch_data([inst[2] for inst in insts], trg_pad_idx, False,
-                                False, False, False)
-    lbl_weight = (lbl_word != trg_pad_idx).astype("float32").reshape([-1, 1])
-
-    return [
-        src_word, src_pos, trg_word, trg_pos, src_slf_attn_bias,
-        trg_slf_attn_bias, trg_src_attn_bias, lbl_word, lbl_weight
-    ]
-
-
-import transformer_model
-
-
-def transformer(use_feed):
-    assert not use_feed, "transfomer doesn't support feed yet"
-    return transformer_model.transformer(
-        ModelHyperParams.src_vocab_size + 1,
-        ModelHyperParams.trg_vocab_size + 1, ModelHyperParams.max_length + 1,
-        ModelHyperParams.n_layer, ModelHyperParams.n_head,
-        ModelHyperParams.d_key, ModelHyperParams.d_value,
-        ModelHyperParams.d_model, ModelHyperParams.d_inner_hid,
-        ModelHyperParams.dropout, ModelHyperParams.src_pad_idx,
-        ModelHyperParams.trg_pad_idx, ModelHyperParams.pos_pad_idx)
-
-
-class TestTransformer(TestParallelExecutorBase):
-    @classmethod
-    def setUpClass(cls):
-        reader = paddle.batch(
-            wmt16.train(ModelHyperParams.src_vocab_size,
-                        ModelHyperParams.trg_vocab_size),
-            batch_size=transformer_model.batch_size)
-
-        with fluid.recordio_writer.create_recordio_writer(
-                "./wmt16.recordio") as writer:
-            for batch in reader():
-                for tensor in prepare_batch_input(
-                        batch, ModelHyperParams.src_pad_idx,
-                        ModelHyperParams.trg_pad_idx, ModelHyperParams.n_head):
-                    t = fluid.LoDTensor()
-                    t.set(tensor, fluid.CPUPlace())
-                    writer.append_tensor(t)
-                writer.complete_append_tensor()
-
-    @unittest.skip("transformer is buggy in multi gpu")
-    def test_main(self):
-        self.check_network_convergence(transformer)
-
-
-class ParallelExecutorTestingDuringTraining(unittest.TestCase):
-    def check_network_convergence(self):
-        main = fluid.Program()
-        startup = fluid.Program()
-        with fluid.program_guard(main, startup):
-            loss = simple_fc_net(True)
-            test_program = main.clone(for_test=True)
-
-            opt = fluid.optimizer.SGD(learning_rate=0.001)
-            opt.minimize(loss)
-
-            batch_size = 32
-            image = numpy.random.normal(size=(batch_size,
-                                              784)).astype('float32')
-            label = numpy.random.randint(0, 10, (batch_size, 1), dtype="int64")
-
-            place = fluid.CUDAPlace(0)
-            exe = fluid.Executor(place)
-            exe.run(startup)
-            feed_dict = {'image': image, 'label': label}
-
-            train_exe = fluid.ParallelExecutor(
-                use_cuda=True, loss_name=loss.name, main_program=main)
-
-            test_exe = fluid.ParallelExecutor(
-                use_cuda=True,
-                main_program=test_program,
-                share_vars_from=train_exe)
-
-            for i in xrange(5):
-                test_loss, = test_exe.run([loss.name], feed=feed_dict)
-                test_loss = numpy.array(test_loss)
-
-                train_loss, = train_exe.run([loss.name], feed=feed_dict)
-                train_loss = numpy.array(train_loss)
-                self.assertTrue(
-                    numpy.allclose(
-                        train_loss, test_loss, atol=1e-8),
-                    "Train loss: " + str(train_loss) + "\n Test loss:" +
-                    str(test_loss))
-
-    def test_parallel(self):
-        self.check_network_convergence()
-
-
-import paddle.dataset.conll05 as conll05
-import paddle.fluid as fluid
-
-word_dict, verb_dict, label_dict = conll05.get_dict()
-word_dict_len = len(word_dict)
-label_dict_len = len(label_dict)
-pred_dict_len = len(verb_dict)
-mark_dict_len = 2
-word_dim = 32
-mark_dim = 5
-hidden_dim = 512
-depth = 8
-mix_hidden_lr = 1e-3
-embedding_name = 'emb'
-
-
-def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark,
-            is_sparse, **ignored):
-    # 8 features
-    predicate_embedding = fluid.layers.embedding(
-        input=predicate,
-        is_sparse=is_sparse,
-        size=[pred_dict_len, word_dim],
-        dtype='float32',
-        param_attr='vemb')
-
-    mark_embedding = fluid.layers.embedding(
-        input=mark,
-        is_sparse=is_sparse,
-        size=[mark_dict_len, mark_dim],
-        dtype='float32')
-
-    word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2]
-    emb_layers = [
-        fluid.layers.embedding(
-            size=[word_dict_len, word_dim],
-            is_sparse=is_sparse,
-            input=x,
-            param_attr=fluid.ParamAttr(
-                name=embedding_name, trainable=False)) for x in word_input
-    ]
-    emb_layers.append(predicate_embedding)
-    emb_layers.append(mark_embedding)
-
-    hidden_0_layers = [
-        fluid.layers.fc(input=emb, size=hidden_dim, act='tanh')
-        for emb in emb_layers
-    ]
-
-    hidden_0 = fluid.layers.sums(input=hidden_0_layers)
-
-    lstm_0 = fluid.layers.dynamic_lstm(
-        input=hidden_0,
-        size=hidden_dim,
-        candidate_activation='relu',
-        gate_activation='sigmoid',
-        cell_activation='sigmoid')
-
-    # stack L-LSTM and R-LSTM with direct edges
-    input_tmp = [hidden_0, lstm_0]
-
-    for i in range(1, depth):
-        mix_hidden = fluid.layers.sums(input=[
-            fluid.layers.fc(input=input_tmp[0], size=hidden_dim, act='tanh'),
-            fluid.layers.fc(input=input_tmp[1], size=hidden_dim, act='tanh')
-        ])
-
-        lstm = fluid.layers.dynamic_lstm(
-            input=mix_hidden,
-            size=hidden_dim,
-            candidate_activation='relu',
-            gate_activation='sigmoid',
-            cell_activation='sigmoid',
-            is_reverse=((i % 2) == 1))
-
-        input_tmp = [mix_hidden, lstm]
-
-    feature_out = fluid.layers.sums(input=[
-        fluid.layers.fc(input=input_tmp[0], size=label_dict_len, act='tanh'),
-        fluid.layers.fc(input=input_tmp[1], size=label_dict_len, act='tanh')
-    ])
-
-    return feature_out
-
-
-class TestCRFModel(unittest.TestCase):
-    def check_network_convergence(self, is_sparse):
-        main = fluid.Program()
-        startup = fluid.Program()
-        with fluid.program_guard(main, startup):
-            word = fluid.layers.data(
-                name='word_data', shape=[1], dtype='int64', lod_level=1)
-            predicate = fluid.layers.data(
-                name='verb_data', shape=[1], dtype='int64', lod_level=1)
-            ctx_n2 = fluid.layers.data(
-                name='ctx_n2_data', shape=[1], dtype='int64', lod_level=1)
-            ctx_n1 = fluid.layers.data(
-                name='ctx_n1_data', shape=[1], dtype='int64', lod_level=1)
-            ctx_0 = fluid.layers.data(
-                name='ctx_0_data', shape=[1], dtype='int64', lod_level=1)
-            ctx_p1 = fluid.layers.data(
-                name='ctx_p1_data', shape=[1], dtype='int64', lod_level=1)
-            ctx_p2 = fluid.layers.data(
-                name='ctx_p2_data', shape=[1], dtype='int64', lod_level=1)
-            mark = fluid.layers.data(
-                name='mark_data', shape=[1], dtype='int64', lod_level=1)
-
-            feature_out = db_lstm(**locals())
-            target = fluid.layers.data(
-                name='target', shape=[1], dtype='int64', lod_level=1)
-            crf_cost = fluid.layers.linear_chain_crf(
-                input=feature_out,
-                label=target,
-                param_attr=fluid.ParamAttr(
-                    name='crfw', learning_rate=1e-1))
-            avg_cost = fluid.layers.mean(crf_cost)
-
-            sgd_optimizer = fluid.optimizer.SGD(
-                learning_rate=fluid.layers.exponential_decay(
-                    learning_rate=0.01,
-                    decay_steps=100000,
-                    decay_rate=0.5,
-                    staircase=True))
-            sgd_optimizer.minimize(avg_cost)
-
-            train_data = paddle.batch(
-                paddle.reader.shuffle(
-                    paddle.dataset.conll05.test(), buf_size=8192),
-                batch_size=16)
-
-            place = fluid.CUDAPlace(0)
-            exe = fluid.Executor(place)
-            exe.run(startup)
-
-            pe = fluid.ParallelExecutor(use_cuda=True, loss_name=avg_cost.name)
-
-            feeder = fluid.DataFeeder(
-                feed_list=[
-                    word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, predicate,
-                    mark, target
-                ],
-                place=fluid.CPUPlace())
-
-            data = train_data()
-            for i in xrange(10):
-                cur_batch = next(data)
-                print map(numpy.array,
-                          pe.run(feed=feeder.feed(cur_batch),
-                                 fetch_list=[avg_cost.name]))[0]
-
-    def test_update_sparse_parameter(self):
-        self.check_network_convergence(is_sparse=True)
-
-    def test_update_dense_parameter(self):
-        self.check_network_convergence(is_sparse=False)
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py
new file mode 100644
index 0000000000000000000000000000000000000000..66e138b03f3b170aca4fb2207438eb9af1783c33
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py
@@ -0,0 +1,197 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.dataset.conll05 as conll05
+import paddle.fluid as fluid
+import unittest
+import paddle
+import numpy as np
+
+word_dict, verb_dict, label_dict = conll05.get_dict()
+word_dict_len = len(word_dict)
+label_dict_len = len(label_dict)
+pred_dict_len = len(verb_dict)
+mark_dict_len = 2
+word_dim = 32
+mark_dim = 5
+hidden_dim = 512
+depth = 8
+mix_hidden_lr = 1e-3
+embedding_name = 'emb'
+
+
+def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark,
+            is_sparse, **ignored):
+    # 8 features
+    predicate_embedding = fluid.layers.embedding(
+        input=predicate,
+        is_sparse=is_sparse,
+        size=[pred_dict_len, word_dim],
+        dtype='float32',
+        param_attr='vemb')
+
+    mark_embedding = fluid.layers.embedding(
+        input=mark,
+        is_sparse=is_sparse,
+        size=[mark_dict_len, mark_dim],
+        dtype='float32')
+
+    word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2]
+    emb_layers = [
+        fluid.layers.embedding(
+            size=[word_dict_len, word_dim],
+            is_sparse=is_sparse,
+            input=x,
+            param_attr=fluid.ParamAttr(
+                name=embedding_name, trainable=False)) for x in word_input
+    ]
+    emb_layers.append(predicate_embedding)
+    emb_layers.append(mark_embedding)
+
+    hidden_0_layers = [
+        fluid.layers.fc(input=emb, size=hidden_dim, act='tanh')
+        for emb in emb_layers
+    ]
+
+    hidden_0 = fluid.layers.sums(input=hidden_0_layers)
+
+    lstm_0 = fluid.layers.dynamic_lstm(
+        input=hidden_0,
+        size=hidden_dim,
+        candidate_activation='relu',
+        gate_activation='sigmoid',
+        cell_activation='sigmoid')
+
+    # stack L-LSTM and R-LSTM with direct edges
+    input_tmp = [hidden_0, lstm_0]
+
+    for i in range(1, depth):
+        mix_hidden = fluid.layers.sums(input=[
+            fluid.layers.fc(input=input_tmp[0], size=hidden_dim, act='tanh'),
+            fluid.layers.fc(input=input_tmp[1], size=hidden_dim, act='tanh')
+        ])
+
+        lstm = fluid.layers.dynamic_lstm(
+            input=mix_hidden,
+            size=hidden_dim,
+            candidate_activation='relu',
+            gate_activation='sigmoid',
+            cell_activation='sigmoid',
+            is_reverse=((i % 2) == 1))
+
+        input_tmp = [mix_hidden, lstm]
+
+    feature_out = fluid.layers.sums(input=[
+        fluid.layers.fc(input=input_tmp[0], size=label_dict_len, act='tanh'),
+        fluid.layers.fc(input=input_tmp[1], size=label_dict_len, act='tanh')
+    ])
+
+    return feature_out
+
+
+class TestCRFModel(unittest.TestCase):
+    def check_network_convergence(self, is_sparse, build_strategy=None):
+        main = fluid.Program()
+        startup = fluid.Program()
+        with fluid.program_guard(main, startup):
+            word = fluid.layers.data(
+                name='word_data', shape=[1], dtype='int64', lod_level=1)
+            predicate = fluid.layers.data(
+                name='verb_data', shape=[1], dtype='int64', lod_level=1)
+            ctx_n2 = fluid.layers.data(
+                name='ctx_n2_data', shape=[1], dtype='int64', lod_level=1)
+            ctx_n1 = fluid.layers.data(
+                name='ctx_n1_data', shape=[1], dtype='int64', lod_level=1)
+            ctx_0 = fluid.layers.data(
+                name='ctx_0_data', shape=[1], dtype='int64', lod_level=1)
+            ctx_p1 = fluid.layers.data(
+                name='ctx_p1_data', shape=[1], dtype='int64', lod_level=1)
+            ctx_p2 = fluid.layers.data(
+                name='ctx_p2_data', shape=[1], dtype='int64', lod_level=1)
+            mark = fluid.layers.data(
+                name='mark_data', shape=[1], dtype='int64', lod_level=1)
+
+            feature_out = db_lstm(**locals())
+            target = fluid.layers.data(
+                name='target', shape=[1], dtype='int64', lod_level=1)
+            crf_cost = fluid.layers.linear_chain_crf(
+                input=feature_out,
+                label=target,
+                param_attr=fluid.ParamAttr(
+                    name='crfw', learning_rate=1e-1))
+            avg_cost = fluid.layers.mean(crf_cost)
+
+            sgd_optimizer = fluid.optimizer.SGD(
+                learning_rate=fluid.layers.exponential_decay(
+                    learning_rate=0.01,
+                    decay_steps=100000,
+                    decay_rate=0.5,
+                    staircase=True))
+            sgd_optimizer.minimize(avg_cost)
+
+            train_data = paddle.batch(
+                paddle.reader.shuffle(
+                    paddle.dataset.conll05.test(), buf_size=8192),
+                batch_size=16)
+
+            place = fluid.CUDAPlace(0)
+            exe = fluid.Executor(place)
+            exe.run(startup)
+
+            pe = fluid.ParallelExecutor(
+                use_cuda=True,
+                loss_name=avg_cost.name,
+                build_strategy=build_strategy)
+
+            feeder = fluid.DataFeeder(
+                feed_list=[
+                    word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, predicate,
+                    mark, target
+                ],
+                place=fluid.CPUPlace())
+
+            data = train_data()
+            for i in xrange(10):
+                cur_batch = next(data)
+                print map(np.array,
+                          pe.run(feed=feeder.feed(cur_batch),
+                                 fetch_list=[avg_cost.name]))[0]
+
+    def test_update_sparse_parameter_all_reduce(self):
+        build_strategy = fluid.BuildStrategy()
+        build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce
+        self.check_network_convergence(
+            is_sparse=True, build_strategy=build_strategy)
+
+    def test_update_dense_parameter_all_reduce(self):
+        build_strategy = fluid.BuildStrategy()
+        build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce
+        self.check_network_convergence(
+            is_sparse=False, build_strategy=build_strategy)
+
+    def test_update_sparse_parameter_reduce(self):
+        build_strategy = fluid.BuildStrategy()
+        build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
+        self.check_network_convergence(
+            is_sparse=True, build_strategy=build_strategy)
+
+    def test_update_dense_parameter_reduce(self):
+        build_strategy = fluid.BuildStrategy()
+        build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
+        self.check_network_convergence(
+            is_sparse=False, build_strategy=build_strategy)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py
new file mode 100644
index 0000000000000000000000000000000000000000..24f8d28c0304a77a99213374b25d0db728eca265
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py
@@ -0,0 +1,132 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.dataset.flowers as flowers
+import math
+import paddle.fluid as fluid
+import unittest
+import numpy as np
+import paddle
+
+
+def Lenet(data, class_dim):
+    conv1 = fluid.layers.conv2d(data, 32, 5, 1, act=None)
+    bn1 = fluid.layers.batch_norm(conv1, act='relu')
+    pool1 = fluid.layers.pool2d(bn1, 2, 'max', 2)
+    conv2 = fluid.layers.conv2d(pool1, 50, 5, 1, act=None)
+    bn2 = fluid.layers.batch_norm(conv2, act='relu')
+    pool2 = fluid.layers.pool2d(bn2, 2, 'max', 2)
+
+    fc1 = fluid.layers.fc(pool2, size=500, act='relu')
+    fc2 = fluid.layers.fc(fc1, size=class_dim, act='softmax')
+
+    return fc2
+
+
+class TestFetchOp(unittest.TestCase):
+    def parallel_exe(self, train_inputs, seed):
+        main = fluid.Program()
+        startup = fluid.Program()
+        startup.random_seed = seed
+        with fluid.program_guard(main, startup):
+            data = fluid.layers.data(
+                name='image', shape=[3, 224, 224], dtype='float32')
+            label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+            out = Lenet(data, class_dim=102)
+            loss = fluid.layers.cross_entropy(input=out, label=label)
+            loss = fluid.layers.mean(loss)
+
+            opt = fluid.optimizer.Momentum(
+                learning_rate=0.1,
+                momentum=0.9,
+                regularization=fluid.regularizer.L2Decay(1e-4))
+
+            opt.minimize(loss)
+
+            # TODO(zcd): I found that onece the memory optimizer is open,
+            # parallel_exe doesn't fetch some variable, such as conv2d_0.b_0@GRAD,
+            # conv2d_1.b_0@GRAD. Those variables should not be pruned.
+            # fluid.memory_optimize(main)
+
+            place = fluid.CUDAPlace(0)
+            exe = fluid.Executor(place)
+            exe.run(startup)
+
+            feeder = fluid.DataFeeder(place=place, feed_list=[data, label])
+            pe = fluid.ParallelExecutor(
+                use_cuda=True, loss_name=loss.name, main_program=main)
+
+            fetch_list = []
+            all_vars = main.global_block().vars
+            for k, v in all_vars.iteritems():
+                if 'tmp' not in k and k[0] is not '_' or v.persistable:
+                    fetch_list.append(k)
+
+            for data in train_inputs:
+                ret = pe.run(fetch_list, feed=feeder.feed(data))
+                for i in range(len(fetch_list)):
+                    assert not math.isnan(np.sum(ret[i])) and \
+                           not math.isinf(np.sum(ret[i]))
+
+    def test_fetch_op(self):
+        tst_reader = paddle.batch(flowers.test(use_xmap=False), batch_size=16)
+        tst_reader_iter = tst_reader()
+
+        iters = 3
+        train_inputs = []
+        for i in range(iters):
+            train_inputs.append(tst_reader_iter.next())
+
+        self.parallel_exe(train_inputs, seed=1)
+
+
+class TestFeedParallel(unittest.TestCase):
+    def test_main(self):
+        main = fluid.Program()
+        startup = fluid.Program()
+        startup.random_seed = 1
+        with fluid.scope_guard(fluid.core.Scope()):
+            with fluid.program_guard(main, startup):
+                data = fluid.layers.data(
+                    name='image', shape=[3, 224, 224], dtype='float32')
+                label = fluid.layers.data(
+                    name='label', shape=[1], dtype='int64')
+                out = Lenet(data, class_dim=102)
+                loss = fluid.layers.cross_entropy(input=out, label=label)
+                loss = fluid.layers.mean(loss)
+                opt = fluid.optimizer.Momentum(
+                    learning_rate=0.1,
+                    momentum=0.9,
+                    regularization=fluid.regularizer.L2Decay(1e-4))
+
+                opt.minimize(loss)
+        place = fluid.CUDAPlace(0)
+        feeder = fluid.DataFeeder(place=place, feed_list=[data, label])
+        reader = feeder.decorate_reader(
+            paddle.batch(
+                flowers.train(), batch_size=16), multi_devices=True)
+        exe = fluid.Executor(place)
+        exe.run(startup)
+        pe = fluid.ParallelExecutor(
+            use_cuda=True, loss_name=loss.name, main_program=main)
+
+        for batch_id, data in enumerate(reader()):
+            loss_np = np.array(pe.run(feed=data, fetch_list=[loss.name])[0])
+            print batch_id, loss_np
+            if batch_id == 2:
+                break
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
new file mode 100644
index 0000000000000000000000000000000000000000..015703c3e25f4e11e64ab6a7de99da12bee608f6
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
@@ -0,0 +1,171 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from parallel_executor_test_base import TestParallelExecutorBase
+import paddle.fluid as fluid
+import numpy as np
+import paddle
+import paddle.dataset.mnist as mnist
+import unittest
+
+MNIST_RECORDIO_FILE = "./mnist_test_pe.recordio"
+
+
+def simple_fc_net(use_feed):
+    if use_feed:
+        img = fluid.layers.data(name='image', shape=[784], dtype='float32')
+        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+    else:
+        reader = fluid.layers.open_files(
+            filenames=[MNIST_RECORDIO_FILE],
+            shapes=[[-1, 784], [-1, 1]],
+            lod_levels=[0, 0],
+            dtypes=['float32', 'int64'],
+            thread_num=1,
+            for_parallel=True)
+        reader = fluid.layers.io.double_buffer(reader)
+        img, label = fluid.layers.read_file(reader)
+    hidden = img
+    for _ in xrange(4):
+        hidden = fluid.layers.fc(
+            hidden,
+            size=200,
+            act='tanh',
+            bias_attr=fluid.ParamAttr(
+                initializer=fluid.initializer.Constant(value=1.0)))
+    prediction = fluid.layers.fc(hidden, size=10, act='softmax')
+    loss = fluid.layers.cross_entropy(input=prediction, label=label)
+    loss = fluid.layers.mean(loss)
+    return loss
+
+
+def fc_with_batchnorm(use_feed):
+    if use_feed:
+        img = fluid.layers.data(name='image', shape=[784], dtype='float32')
+        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+    else:
+        reader = fluid.layers.open_files(
+            filenames=[MNIST_RECORDIO_FILE],
+            shapes=[[-1, 784], [-1, 1]],
+            lod_levels=[0, 0],
+            dtypes=['float32', 'int64'],
+            thread_num=1,
+            for_parallel=True)
+        reader = fluid.layers.io.double_buffer(reader)
+        img, label = fluid.layers.read_file(reader)
+
+    hidden = img
+    for _ in xrange(1):
+        hidden = fluid.layers.fc(
+            hidden,
+            size=200,
+            act='tanh',
+            bias_attr=fluid.ParamAttr(
+                initializer=fluid.initializer.Constant(value=1.0)))
+
+        hidden = fluid.layers.batch_norm(input=hidden)
+
+    prediction = fluid.layers.fc(hidden, size=10, act='softmax')
+    loss = fluid.layers.cross_entropy(input=prediction, label=label)
+    loss = fluid.layers.mean(loss)
+    return loss
+
+
+class TestMNIST(TestParallelExecutorBase):
+    @classmethod
+    def setUpClass(cls):
+        # Convert mnist to recordio file
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            reader = paddle.batch(mnist.train(), batch_size=4)
+            feeder = fluid.DataFeeder(
+                feed_list=[  # order is image and label
+                    fluid.layers.data(
+                        name='image', shape=[784]),
+                    fluid.layers.data(
+                        name='label', shape=[1], dtype='int64'),
+                ],
+                place=fluid.CPUPlace())
+            fluid.recordio_writer.convert_reader_to_recordio_file(
+                MNIST_RECORDIO_FILE, reader, feeder)
+
+    def check_simple_fc_convergence(self, balance_parameter_opt_between_cards):
+        self.check_network_convergence(simple_fc_net)
+        self.check_network_convergence(simple_fc_net, allow_op_delay=True)
+
+        img = np.zeros(shape=[32, 784], dtype='float32')
+        label = np.ones(shape=[32, 1], dtype='int64')
+        self.check_network_convergence(
+            simple_fc_net,
+            feed_dict={"image": img,
+                       "label": label},
+            balance_parameter_opt_between_cards=balance_parameter_opt_between_cards
+        )
+
+    def test_simple_fc(self):
+        self.check_simple_fc_convergence(False)
+
+    def test_simple_fc_with_new_strategy(self):
+        self.check_simple_fc_convergence(True)
+
+    def check_simple_fc_parallel_accuracy(self,
+                                          balance_parameter_opt_between_cards):
+        img = np.zeros(shape=[32, 784], dtype='float32')
+        label = np.ones(shape=[32, 1], dtype='int64')
+        single_first_loss, single_last_loss = self.check_network_convergence(
+            method=simple_fc_net,
+            seed=1000,
+            feed_dict={"image": img,
+                       "label": label},
+            use_parallel_executor=False)
+        parallel_first_loss, parallel_last_loss = self.check_network_convergence(
+            method=simple_fc_net,
+            seed=1000,
+            feed_dict={"image": img,
+                       "label": label},
+            use_parallel_executor=True,
+            balance_parameter_opt_between_cards=balance_parameter_opt_between_cards
+        )
+
+        for p_f in parallel_first_loss:
+            self.assertAlmostEquals(p_f, single_first_loss[0], delta=1e-6)
+        for p_l in parallel_last_loss:
+            self.assertAlmostEquals(p_l, single_last_loss[0], delta=1e-6)
+
+    def test_simple_fc_parallel_accuracy(self):
+        self.check_simple_fc_parallel_accuracy(False)
+
+    def test_simple_fc_parallel_accuracy_with_new_strategy(self):
+        self.check_simple_fc_parallel_accuracy(True)
+
+    def check_batchnorm_fc_convergence(self,
+                                       balance_parameter_opt_between_cards):
+        self.check_network_convergence(fc_with_batchnorm)
+        img = np.zeros(shape=[32, 784], dtype='float32')
+        label = np.ones(shape=[32, 1], dtype='int64')
+        self.check_network_convergence(
+            fc_with_batchnorm,
+            feed_dict={"image": img,
+                       "label": label},
+            balance_parameter_opt_between_cards=balance_parameter_opt_between_cards
+        )
+
+    def test_batchnorm_fc(self):
+        self.check_batchnorm_fc_convergence(False)
+
+    def test_batchnorm_fc_with_new_strategy(self):
+        self.check_batchnorm_fc_convergence(True)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py
new file mode 100644
index 0000000000000000000000000000000000000000..a3fa140cbb7994a36d2cbee26d598165f1f771d2
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py
@@ -0,0 +1,152 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.fluid as fluid
+from parallel_executor_test_base import TestParallelExecutorBase
+import unittest
+
+
+def squeeze_excitation(input, num_channels, reduction_ratio):
+    # pool = fluid.layers.pool2d(
+    #    input=input, pool_size=0, pool_type='avg', global_pooling=True)
+    conv = input
+    shape = conv.shape
+    reshape = fluid.layers.reshape(
+        x=conv, shape=[-1, shape[1], shape[2] * shape[3]])
+    pool = fluid.layers.reduce_mean(input=reshape, dim=2)
+
+    squeeze = fluid.layers.fc(input=pool,
+                              size=num_channels / reduction_ratio,
+                              act='relu')
+    excitation = fluid.layers.fc(input=squeeze,
+                                 size=num_channels,
+                                 act='sigmoid')
+    scale = fluid.layers.elementwise_mul(x=input, y=excitation, axis=0)
+    return scale
+
+
+def conv_bn_layer(input, num_filters, filter_size, stride=1, groups=1,
+                  act=None):
+    conv = fluid.layers.conv2d(
+        input=input,
+        num_filters=num_filters,
+        filter_size=filter_size,
+        stride=stride,
+        padding=(filter_size - 1) / 2,
+        groups=groups,
+        act=None,
+        bias_attr=False)
+    return fluid.layers.batch_norm(input=conv, act=act, momentum=0.1)
+
+
+def shortcut(input, ch_out, stride):
+    ch_in = input.shape[1]
+    if ch_in != ch_out:
+        if stride == 1:
+            filter_size = 1
+        else:
+            filter_size = 3
+        return conv_bn_layer(input, ch_out, filter_size, stride)
+    else:
+        return input
+
+
+def bottleneck_block(input, num_filters, stride, cardinality, reduction_ratio):
+    # The number of first 1x1 convolutional channels for each bottleneck build block
+    # was halved to reduce the compution cost.
+    conv0 = conv_bn_layer(
+        input=input, num_filters=num_filters, filter_size=1, act='relu')
+    conv1 = conv_bn_layer(
+        input=conv0,
+        num_filters=num_filters * 2,
+        filter_size=3,
+        stride=stride,
+        groups=cardinality,
+        act='relu')
+    conv2 = conv_bn_layer(
+        input=conv1, num_filters=num_filters * 2, filter_size=1, act=None)
+    scale = squeeze_excitation(
+        input=conv2,
+        num_channels=num_filters * 2,
+        reduction_ratio=reduction_ratio)
+
+    short = shortcut(input, num_filters * 2, stride)
+
+    return fluid.layers.elementwise_add(x=short, y=scale, act='relu')
+
+
+def SE_ResNeXt50Small(batch_size=2, use_feed=False):
+    assert not use_feed, "SE_ResNeXt doesn't support feed yet"
+
+    img = fluid.layers.fill_constant(
+        shape=[batch_size, 3, 224, 224], dtype='float32', value=0.0)
+    label = fluid.layers.fill_constant(
+        shape=[batch_size, 1], dtype='int64', value=0.0)
+
+    conv = conv_bn_layer(
+        input=img, num_filters=16, filter_size=3, stride=2, act='relu')
+    conv = conv_bn_layer(
+        input=conv, num_filters=16, filter_size=3, stride=1, act='relu')
+    conv = conv_bn_layer(
+        input=conv, num_filters=16, filter_size=3, stride=1, act='relu')
+    conv = fluid.layers.pool2d(
+        input=conv, pool_size=3, pool_stride=2, pool_padding=1, pool_type='max')
+
+    cardinality = 32
+    reduction_ratio = 16
+    depth = [3, 4, 6, 3]
+    num_filters = [128, 256, 512, 1024]
+
+    for block in range(len(depth)):
+        for i in range(depth[block]):
+            conv = bottleneck_block(
+                input=conv,
+                num_filters=num_filters[block],
+                stride=2 if i == 0 and block != 0 else 1,
+                cardinality=cardinality,
+                reduction_ratio=reduction_ratio)
+
+    shape = conv.shape
+    reshape = fluid.layers.reshape(
+        x=conv, shape=[-1, shape[1], shape[2] * shape[3]])
+    pool = fluid.layers.reduce_mean(input=reshape, dim=2)
+    dropout = fluid.layers.dropout(x=pool, dropout_prob=0.2)
+    # Classifier layer:
+    prediction = fluid.layers.fc(input=dropout, size=1000, act='softmax')
+    loss = fluid.layers.cross_entropy(input=prediction, label=label)
+    loss = fluid.layers.mean(loss)
+    return loss
+
+
+class TestResnet(TestParallelExecutorBase):
+    def check_resnet_convergence(self, balance_parameter_opt_between_cards):
+        import functools
+        batch_size = 2
+        self.check_network_convergence(
+            functools.partial(
+                SE_ResNeXt50Small, batch_size=batch_size),
+            iter=20,
+            batch_size=batch_size,
+            balance_parameter_opt_between_cards=balance_parameter_opt_between_cards
+        )
+
+    def test_resnet(self):
+        self.check_resnet_convergence(False)
+
+    def test_resnet_with_new_strategy(self):
+        self.check_resnet_convergence(True)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py
new file mode 100644
index 0000000000000000000000000000000000000000..93a5f767867d68110cf7b8f441cc740ecd843cf9
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py
@@ -0,0 +1,93 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.fluid as fluid
+import numpy as np
+import unittest
+
+
+def simple_fc_net():
+    img = fluid.layers.data(name='image', shape=[784], dtype='float32')
+    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+    hidden = img
+    for _ in xrange(4):
+        hidden = fluid.layers.fc(
+            hidden,
+            size=200,
+            act='tanh',
+            bias_attr=fluid.ParamAttr(
+                initializer=fluid.initializer.Constant(value=1.0)))
+    prediction = fluid.layers.fc(hidden, size=10, act='softmax')
+    loss = fluid.layers.cross_entropy(input=prediction, label=label)
+    loss = fluid.layers.mean(loss)
+    return loss
+
+
+class ParallelExecutorTestingDuringTraining(unittest.TestCase):
+    def check_network_convergence(self, build_strategy=None):
+        main = fluid.Program()
+        startup = fluid.Program()
+        with fluid.program_guard(main, startup):
+            loss = simple_fc_net()
+            test_program = main.clone(for_test=True)
+
+            opt = fluid.optimizer.SGD(learning_rate=0.001)
+            opt.minimize(loss)
+
+            batch_size = 32
+            image = np.random.normal(size=(batch_size, 784)).astype('float32')
+            label = np.random.randint(0, 10, (batch_size, 1), dtype="int64")
+
+            place = fluid.CUDAPlace(0)
+            exe = fluid.Executor(place)
+            exe.run(startup)
+            feed_dict = {'image': image, 'label': label}
+
+            train_exe = fluid.ParallelExecutor(
+                use_cuda=True,
+                loss_name=loss.name,
+                main_program=main,
+                build_strategy=build_strategy)
+
+            test_exe = fluid.ParallelExecutor(
+                use_cuda=True,
+                main_program=test_program,
+                share_vars_from=train_exe,
+                build_strategy=build_strategy)
+
+            for i in xrange(5):
+                test_loss, = test_exe.run([loss.name], feed=feed_dict)
+                test_loss = np.array(test_loss)
+
+                train_loss, = train_exe.run([loss.name], feed=feed_dict)
+                train_loss = np.array(train_loss)
+                self.assertTrue(
+                    np.allclose(
+                        train_loss, test_loss, atol=1e-8),
+                    "Train loss: " + str(train_loss) + "\n Test loss:" +
+                    str(test_loss))
+
+    def test_parallel_testing(self):
+        build_strategy = fluid.BuildStrategy()
+        build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce
+        self.check_network_convergence(build_strategy)
+
+    def test_parallel_testing_with_new_strategy(self):
+        build_strategy = fluid.BuildStrategy()
+        build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
+        self.check_network_convergence(build_strategy)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..c81df66d987f3d3856af0e19fc935df7de2edacc
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py
@@ -0,0 +1,174 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.fluid as fluid
+import transformer_model
+import numpy as np
+from parallel_executor_test_base import TestParallelExecutorBase
+import unittest
+import paddle
+import paddle.dataset.wmt16 as wmt16
+
+WMT16_RECORDIO_FILE = "./wmt16_test_pe.recordio"
+
+
+class ModelHyperParams(object):
+    # Dictionary size for source and target language. This model directly uses
+    # paddle.dataset.wmt16 in which <bos>, <eos> and <unk> token has
+    # alreay been added, but the <pad> token is not added. Transformer requires
+    # sequences in a mini-batch are padded to have the same length. A <pad> token is
+    # added into the original dictionary in paddle.dateset.wmt16.
+
+    # size of source word dictionary.
+    src_vocab_size = 10000
+    # index for <pad> token in source language.
+    src_pad_idx = src_vocab_size
+
+    # size of target word dictionay
+    trg_vocab_size = 10000
+    # index for <pad> token in target language.
+    trg_pad_idx = trg_vocab_size
+
+    # position value corresponding to the <pad> token.
+    pos_pad_idx = 0
+
+    # max length of sequences. It should plus 1 to include position
+    # padding token for position encoding.
+    max_length = 50
+
+    # the dimension for word embeddings, which is also the last dimension of
+    # the input and output of multi-head attention, position-wise feed-forward
+    # networks, encoder and decoder.
+
+    d_model = 512
+    # size of the hidden layer in position-wise feed-forward networks.
+    d_inner_hid = 1024
+    # the dimension that keys are projected to for dot-product attention.
+    d_key = 64
+    # the dimension that values are projected to for dot-product attention.
+    d_value = 64
+    # number of head used in multi-head attention.
+    n_head = 8
+    # number of sub-layers to be stacked in the encoder and decoder.
+    n_layer = 6
+    # dropout rate used by all dropout layers.
+    dropout = 0.1
+
+
+def prepare_batch_input(insts, src_pad_idx, trg_pad_idx, n_head):
+    """
+    Pad the instances to the max sequence length in batch, and generate the
+    corresponding position data and attention bias. Then, convert the numpy
+    data to tensors and return a dict mapping names to tensors.
+    """
+
+    def __pad_batch_data(insts,
+                         pad_idx,
+                         is_target=False,
+                         return_pos=True,
+                         return_attn_bias=True,
+                         return_max_len=True):
+        """
+        Pad the instances to the max sequence length in batch, and generate the
+        corresponding position data and attention bias.
+        """
+        return_list = []
+        max_len = max(len(inst) for inst in insts)
+        inst_data = np.array(
+            [inst + [pad_idx] * (max_len - len(inst)) for inst in insts])
+        return_list += [inst_data.astype("int64").reshape([-1, 1])]
+        if return_pos:
+            inst_pos = np.array([[
+                pos_i + 1 if w_i != pad_idx else 0
+                for pos_i, w_i in enumerate(inst)
+            ] for inst in inst_data])
+
+            return_list += [inst_pos.astype("int64").reshape([-1, 1])]
+        if return_attn_bias:
+            if is_target:
+                # This is used to avoid attention on paddings and subsequent
+                # words.
+                slf_attn_bias_data = np.ones((inst_data.shape[0], max_len,
+                                              max_len))
+                slf_attn_bias_data = np.triu(slf_attn_bias_data, 1).reshape(
+                    [-1, 1, max_len, max_len])
+                slf_attn_bias_data = np.tile(slf_attn_bias_data,
+                                             [1, n_head, 1, 1]) * [-1e9]
+            else:
+                # This is used to avoid attention on paddings.
+                slf_attn_bias_data = np.array([[0] * len(inst) + [-1e9] *
+                                               (max_len - len(inst))
+                                               for inst in insts])
+                slf_attn_bias_data = np.tile(
+                    slf_attn_bias_data.reshape([-1, 1, 1, max_len]),
+                    [1, n_head, max_len, 1])
+            return_list += [slf_attn_bias_data.astype("float32")]
+        if return_max_len:
+            return_list += [max_len]
+        return return_list if len(return_list) > 1 else return_list[0]
+
+    src_word, src_pos, src_slf_attn_bias, src_max_len = __pad_batch_data(
+        [inst[0] for inst in insts], src_pad_idx, is_target=False)
+    trg_word, trg_pos, trg_slf_attn_bias, trg_max_len = __pad_batch_data(
+        [inst[1] for inst in insts], trg_pad_idx, is_target=True)
+    trg_src_attn_bias = np.tile(src_slf_attn_bias[:, :, ::src_max_len, :],
+                                [1, 1, trg_max_len, 1]).astype("float32")
+    lbl_word = __pad_batch_data([inst[2] for inst in insts], trg_pad_idx, False,
+                                False, False, False)
+    lbl_weight = (lbl_word != trg_pad_idx).astype("float32").reshape([-1, 1])
+
+    return [
+        src_word, src_pos, trg_word, trg_pos, src_slf_attn_bias,
+        trg_slf_attn_bias, trg_src_attn_bias, lbl_word, lbl_weight
+    ]
+
+
+def transformer(use_feed):
+    assert not use_feed, "transfomer doesn't support feed yet"
+    return transformer_model.transformer(
+        ModelHyperParams.src_vocab_size + 1,
+        ModelHyperParams.trg_vocab_size + 1, ModelHyperParams.max_length + 1,
+        ModelHyperParams.n_layer, ModelHyperParams.n_head,
+        ModelHyperParams.d_key, ModelHyperParams.d_value,
+        ModelHyperParams.d_model, ModelHyperParams.d_inner_hid,
+        ModelHyperParams.dropout, ModelHyperParams.src_pad_idx,
+        ModelHyperParams.trg_pad_idx, ModelHyperParams.pos_pad_idx)
+
+
+class TestTransformer(TestParallelExecutorBase):
+    @classmethod
+    def setUpClass(cls):
+        reader = paddle.batch(
+            wmt16.train(ModelHyperParams.src_vocab_size,
+                        ModelHyperParams.trg_vocab_size),
+            batch_size=transformer_model.batch_size)
+
+        with fluid.recordio_writer.create_recordio_writer(
+                WMT16_RECORDIO_FILE) as writer:
+            for batch in reader():
+                for tensor in prepare_batch_input(
+                        batch, ModelHyperParams.src_pad_idx,
+                        ModelHyperParams.trg_pad_idx, ModelHyperParams.n_head):
+                    t = fluid.LoDTensor()
+                    t.set(tensor, fluid.CPUPlace())
+                    writer.append_tensor(t)
+                writer.complete_append_tensor()
+
+    @unittest.skip("transformer is buggy in multi gpu")
+    def test_main(self):
+        self.check_network_convergence(transformer)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_polygon_box_transform.py b/python/paddle/fluid/tests/unittests/test_polygon_box_transform.py
new file mode 100644
index 0000000000000000000000000000000000000000..2105d320665367e3ec1bfd7b3a353a144c91244f
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_polygon_box_transform.py
@@ -0,0 +1,68 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+def PolygonBoxRestore(input):
+    shape = input.shape
+    batch_size = shape[0]
+    geo_channels = shape[1]
+    h = shape[2]
+    w = shape[3]
+    h_indexes = np.array(range(h) * w).reshape(
+        [w, h]).transpose()[np.newaxis, :]  # [1, h, w]
+    w_indexes = np.array(range(w) * h).reshape(
+        [h, w])[np.newaxis, :]  # [1, h, w]
+    indexes = np.concatenate(
+        (w_indexes, h_indexes))[np.newaxis, :]  # [1, 2, h, w]
+    indexes = indexes.repeat(
+        [geo_channels / 2],
+        axis=0)[np.newaxis, :]  # [1, geo_channels/2, 2, h, w]
+    indexes = indexes.repeat(
+        [batch_size], axis=0)  # [batch_size, geo_channels/2, 2, h, w]
+    return indexes.reshape(
+        input.shape) - input  # [batch_size, geo_channels, h, w]
+
+
+class TestPolygonBoxRestoreOp(OpTest):
+    def config(self):
+        self.input_shape = (1, 8, 2, 2)
+
+    def setUp(self):
+        self.config()
+        self.op_type = "polygon_box_transform"
+        input = np.random.random(self.input_shape).astype("float32")
+        self.inputs = {'Input': input}
+        output = PolygonBoxRestore(input)
+        self.outputs = {'Output': output}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestCase1(TestPolygonBoxRestoreOp):
+    def config(self):
+        self.input_shape = (2, 10, 3, 2)
+
+
+class TestCase2(TestPolygonBoxRestoreOp):
+    def config(self):
+        self.input_shape = (3, 12, 4, 5)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_preprocessor.py b/python/paddle/fluid/tests/unittests/test_preprocessor.py
new file mode 100644
index 0000000000000000000000000000000000000000..cbf1a7e0c50a87cd43507ffdb94109873cf4e5d9
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_preprocessor.py
@@ -0,0 +1,93 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+
+import paddle.fluid as fluid
+import paddle.v2 as paddle
+import paddle.v2.dataset.mnist as mnist
+
+
+class TestPreprocessor(unittest.TestCase):
+    def setUp(self):
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            reader = paddle.batch(mnist.train(), batch_size=32)
+            feeder = fluid.DataFeeder(
+                feed_list=[  # order is image and label
+                    fluid.layers.data(
+                        name='image', shape=[784]),
+                    fluid.layers.data(
+                        name='label', shape=[1], dtype='int64'),
+                ],
+                place=fluid.CPUPlace())
+            self.num_batches = fluid.recordio_writer.convert_reader_to_recordio_file(
+                './mnist_for_preprocessor_test.recordio', reader, feeder)
+
+    def test_main(self):
+        N = 10
+
+        img_expected_res = []
+        lbl_expected_res = []
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            data_file = fluid.layers.io.open_recordio_file(
+                './mnist_for_preprocessor_test.recordio',
+                shapes=[[-1, 784], [-1, 1]],
+                lod_levels=[0, 0],
+                dtypes=['float32', 'int64'])
+            img, lbl = fluid.layers.io.read_file(data_file)
+
+            if fluid.core.is_compiled_with_cuda():
+                place = fluid.CUDAPlace(0)
+            else:
+                place = fluid.CPUPlace()
+            exe = fluid.Executor(place)
+            exe.run(fluid.default_startup_program())
+            for _ in range(N):
+                img_v, lbl_v = exe.run(fetch_list=[img, lbl])
+                img_expected_res.append(img_v / 2)
+                lbl_expected_res.append(lbl_v + 1)
+
+        img_actual_res = []
+        lbl_actual_res = []
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            data_file = fluid.layers.io.open_recordio_file(
+                './mnist_for_preprocessor_test.recordio',
+                shapes=[[-1, 784], [-1, 1]],
+                lod_levels=[0, 0],
+                dtypes=['float32', 'int64'])
+            preprocessor = fluid.layers.io.Preprocessor(reader=data_file)
+            with preprocessor.block():
+                img, lbl = preprocessor.inputs()
+                img_out = img / 2
+                lbl_out = lbl + 1
+                preprocessor.outputs(img_out, lbl_out)
+
+            data_file = fluid.layers.io.double_buffer(preprocessor())
+            img, lbl = fluid.layers.io.read_file(data_file)
+
+            if fluid.core.is_compiled_with_cuda():
+                place = fluid.CUDAPlace(0)
+            else:
+                place = fluid.CPUPlace()
+            exe = fluid.Executor(place)
+            exe.run(fluid.default_startup_program())
+            for _ in range(N):
+                img_v, lbl_v = exe.run(fetch_list=[img, lbl])
+                img_actual_res.append(img_v)
+                lbl_actual_res.append(lbl_v)
+
+        for idx in range(N):
+            np.allclose(img_expected_res[idx], img_actual_res[idx])
+            np.allclose(lbl_expected_res[idx], lbl_actual_res[idx])
diff --git a/python/paddle/fluid/tests/unittests/test_random_crop_op.py b/python/paddle/fluid/tests/unittests/test_random_crop_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..1c708d0386da4028f1f3d177d0a3fd494c077c6e
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_random_crop_op.py
@@ -0,0 +1,46 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+import paddle.fluid.core as core
+from op_test import OpTest
+
+
+class TestRandomCropOp(OpTest):
+    def setUp(self):
+        to_crop = np.array([[[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]]] *
+                           5).astype("float32")
+        self.possible_res = [
+            np.array([[1, 2, 3], [5, 6, 7]]), np.array([[2, 3, 4], [6, 7, 8]]),
+            np.array([[5, 6, 7], [9, 10, 11]]),
+            np.array([[6, 7, 8], [10, 11, 12]])
+        ]
+        self.op_type = "random_crop"
+        self.inputs = {'X': to_crop, 'Seed': np.array([10])}
+        self.outputs = {'Out': np.array([]), 'SeedOut': np.array([])}
+        self.attrs = {'shape': [2, 3]}
+
+    def test_check_output(self):
+        self.check_output_customized(self.verify_output)
+
+    def verify_output(self, outs):
+        out = np.array(outs[1])
+        for ins in out[:]:
+            is_equal = [(ins == res).all() for res in self.possible_res]
+            self.assertIn(True, is_equal)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_reduce_op.py b/python/paddle/fluid/tests/unittests/test_reduce_op.py
index 9b0cc3534dc551e7fdf7ef8111cad1c172f8bfa4..865c2b7df085aa6a6cb0d6eb461c342ce08695cd 100644
--- a/python/paddle/fluid/tests/unittests/test_reduce_op.py
+++ b/python/paddle/fluid/tests/unittests/test_reduce_op.py
@@ -34,8 +34,10 @@ class TestMeanOp(OpTest):
     def setUp(self):
         self.op_type = "reduce_mean"
         self.inputs = {'X': np.random.random((5, 6, 2, 10)).astype("float64")}
-        self.attrs = {'dim': 1}
-        self.outputs = {'Out': self.inputs['X'].mean(axis=self.attrs['dim'])}
+        self.attrs = {'dim': [1]}
+        self.outputs = {
+            'Out': self.inputs['X'].mean(axis=tuple(self.attrs['dim']))
+        }
 
     def test_check_output(self):
         self.check_output()
@@ -50,8 +52,10 @@ class TestMaxOp(OpTest):
     def setUp(self):
         self.op_type = "reduce_max"
         self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")}
-        self.attrs = {'dim': -1}
-        self.outputs = {'Out': self.inputs['X'].max(axis=self.attrs['dim'])}
+        self.attrs = {'dim': [-1]}
+        self.outputs = {
+            'Out': self.inputs['X'].max(axis=tuple(self.attrs['dim']))
+        }
 
     def test_check_output(self):
         self.check_output()
@@ -63,8 +67,10 @@ class TestMinOp(OpTest):
     def setUp(self):
         self.op_type = "reduce_min"
         self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")}
-        self.attrs = {'dim': 2}
-        self.outputs = {'Out': self.inputs['X'].min(axis=self.attrs['dim'])}
+        self.attrs = {'dim': [2]}
+        self.outputs = {
+            'Out': self.inputs['X'].min(axis=tuple(self.attrs['dim']))
+        }
 
     def test_check_output(self):
         self.check_output()
@@ -87,9 +93,10 @@ class TestKeepDimReduce(OpTest):
     def setUp(self):
         self.op_type = "reduce_sum"
         self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")}
-        self.attrs = {'dim': -2, 'keep_dim': True}
+        self.attrs = {'dim': [-2], 'keep_dim': True}
         self.outputs = {
-            'Out': self.inputs['X'].sum(axis=self.attrs['dim'], keepdims=True)
+            'Out':
+            self.inputs['X'].sum(axis=tuple(self.attrs['dim']), keepdims=True)
         }
 
     def test_check_output(self):
@@ -126,5 +133,67 @@ class TestReduceAll(OpTest):
         self.check_grad(['X'], 'Out')
 
 
+## reduction in multi dims
+class TestReduceMeanOpMultiAxises(OpTest):
+    def setUp(self):
+        self.op_type = "reduce_mean"
+        self.inputs = {'X': np.random.random((5, 6, 2, 10)).astype("float64")}
+        self.attrs = {'dim': [1, 2]}
+        self.outputs = {'Out': self.inputs['X'].mean(axis=(1, 2))}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+class TestReduceMaxOpMultiAxises(OpTest):
+    """Remove Max with subgradient from gradient check to confirm the success of CI."""
+
+    def setUp(self):
+        self.op_type = "reduce_max"
+        self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")}
+        self.attrs = {'dim': [-2, -1]}
+        self.outputs = {
+            'Out': self.inputs['X'].max(axis=tuple(self.attrs['dim']))
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestReduceMinOpMultiAxises(OpTest):
+    """Remove Min with subgradient from gradient check to confirm the success of CI."""
+
+    def setUp(self):
+        self.op_type = "reduce_min"
+        self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")}
+        self.attrs = {'dim': [1, 2]}
+        self.outputs = {
+            'Out': self.inputs['X'].min(axis=tuple(self.attrs['dim']))
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestKeepDimReduceSumMultiAxises(OpTest):
+    def setUp(self):
+        self.op_type = "reduce_sum"
+        self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")}
+        self.attrs = {'dim': [-2, -1], 'keep_dim': True}
+        self.outputs = {
+            'Out':
+            self.inputs['X'].sum(axis=tuple(self.attrs['dim']), keepdims=True)
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_reverse_op.py b/python/paddle/fluid/tests/unittests/test_reverse_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..f845575a02869f08299d76b5600074598ca27f6c
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_reverse_op.py
@@ -0,0 +1,67 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestReverseOp(OpTest):
+    def initTestCase(self):
+        self.x = np.random.random((3, 4)).astype('float32')
+        self.axis = [0]
+
+    def setUp(self):
+        self.initTestCase()
+        self.op_type = "reverse"
+        self.inputs = {"X": self.x}
+        self.attrs = {'axis': self.axis}
+        out = self.x
+        for a in self.axis:
+            out = np.flip(out, axis=a)
+        self.outputs = {'Out': out}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+class TestCase0(TestReverseOp):
+    def initTestCase(self):
+        self.x = np.random.random((3, 4)).astype('float32')
+        self.axis = [1]
+
+
+class TestCase1(TestReverseOp):
+    def initTestCase(self):
+        self.x = np.random.random((3, 4)).astype('float32')
+        self.axis = [0, 1]
+
+
+class TestCase2(TestReverseOp):
+    def initTestCase(self):
+        self.x = np.random.random((3, 4, 5)).astype('float32')
+        self.axis = [0, 2]
+
+
+class TestCase3(TestReverseOp):
+    def initTestCase(self):
+        self.x = np.random.random((3, 4, 5)).astype('float32')
+        self.axis = [1, 2]
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_shape_op.py b/python/paddle/fluid/tests/unittests/test_shape_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..a62ee050075cb8c9f8817c142825a89c24bdfedf
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_shape_op.py
@@ -0,0 +1,47 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestShapeOp(OpTest):
+    def setUp(self):
+        self.op_type = "shape"
+        self.config()
+        self.shape = [2, 3]
+        input = np.zeros(self.shape)
+        self.inputs = {'Input': input}
+        self.outputs = {'Out': np.array(self.shape)}
+
+    def config(self):
+        self.shape = [2, 3]
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class case1(TestShapeOp):
+    def config(self):
+        self.shape = [2]
+
+
+class case2(TestShapeOp):
+    def config(self):
+        self.shape = [1, 2, 3]
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_simple_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_simple_dist_transpiler.py
new file mode 100644
index 0000000000000000000000000000000000000000..5ae2844e295194f95701e1cdccd43bf919bf964f
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_simple_dist_transpiler.py
@@ -0,0 +1,80 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+
+import paddle.fluid as fluid
+from paddle.fluid.transpiler.distribute_transpiler import delete_ops
+
+from transpiler_test import TranspilerTest
+
+
+class TestSimpleDistTranspiler(TranspilerTest):
+    def setUp(self):
+        self.current_pserver_ep = "127.0.0.1:6175"
+
+    def test_simple_transpiler(self):
+        np.random.seed(1)
+
+        trainer = self.get_trainer()
+        pserver, startup = self.get_pserver(self.current_pserver_ep)
+        self.assertEqual([op.type for op in trainer.global_block().ops],
+                         self.get_expect_trainer_ops())
+
+        self.assertEqual(len(pserver.blocks), 2)
+        # block0: listen_and_serv
+        self.assertEqual([op.type for op in pserver.blocks[0].ops],
+                         ["listen_and_serv"])
+        # block1: optimize pass
+        self.assertEqual([op.type for op in pserver.blocks[1].ops],
+                         ["sum", "scale", "sgd"])
+
+        # confirm startup program
+        self.assertEqual([op.type for op in startup.global_block().ops],
+                         ["fill_constant", "uniform_random", "uniform_random"])
+
+        # the variable #fc_w will NOT be splited
+        fc_w_var = startup.global_block().var("fc_w@GRAD")
+        self.assertEqual(fc_w_var.shape, (1000, 1000))
+
+        fc_w_var = startup.global_block().var("fc_w@GRAD.trainer_0")
+        self.assertEqual(fc_w_var.shape, (1000, 1000))
+
+    def get_expect_trainer_ops(self):
+        trainer = fluid.Program()
+
+        with fluid.program_guard(trainer):
+            optimize_ops, params_grads = self.net_conf()
+
+        delete_ops(trainer.global_block(), optimize_ops)
+        ops = [op.type for op in trainer.global_block().ops] + [
+            "send_vars", "send_barrier", "recv", "recv", "fetch_barrier"
+        ]
+        ops.insert(ops.index("elementwise_add_grad") + 1, "send_vars")
+        return ops
+
+    def _transpiler_instance(self):
+        main = self.get_main_program()
+        t = fluid.DistributeTranspiler()
+        t.transpile(
+            self.trainer_id,
+            program=main,
+            pservers=self.pserver_eps,
+            trainers=self.trainers,
+            slice_var_up=False)
+        return t
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_split_var.py b/python/paddle/fluid/tests/unittests/test_slice_var.py
similarity index 71%
rename from python/paddle/fluid/tests/unittests/test_split_var.py
rename to python/paddle/fluid/tests/unittests/test_slice_var.py
index 104ceb4fe7beb70b9016f57cef0ef895a3eb8ba6..82305b23a1a1e2cee8cef6b291d848581fe5b509 100644
--- a/python/paddle/fluid/tests/unittests/test_split_var.py
+++ b/python/paddle/fluid/tests/unittests/test_slice_var.py
@@ -14,22 +14,14 @@
 
 import math
 import unittest
-from paddle.fluid.distribute_transpiler import split_dense_variable
+from paddle.fluid.transpiler.distribute_transpiler import slice_variable
 import paddle.fluid as fluid
 import paddle.fluid.core as core
 import random
 
 
-class TestSplitVar(unittest.TestCase):
-    def test_check_output(self):
-        # split below shapes to 10 servers
-        shapes = [[3, 5], [1024], [28, 784], [8, 1020], [800, 10]]
-        expected_sizes = [
-            [15], [1024],
-            [2352, 2352, 2352, 2352, 2352, 2352, 2352, 2352, 2352, 784],
-            [2040, 2040, 2040, 2040],
-            [1150, 1150, 1150, 1150, 1150, 1150, 1100]
-        ]
+class TestSliceVar(unittest.TestCase):
+    def check_slice_output(self, shapes, expected_sizes, min_size):
         var_list = []
         program = fluid.Program()
         for shape in shapes:
@@ -39,7 +31,7 @@ class TestSplitVar(unittest.TestCase):
                 # dtype=core.VarDesc.VarType.LOD_TENSOR,
                 shape=shape)
             var_list.append(var)
-        blocks = split_dense_variable(var_list, 10)
+        blocks = slice_variable(var_list, 10, min_size)
         all_sizes = []
         for s in expected_sizes:
             for s2 in s:
@@ -48,6 +40,25 @@ class TestSplitVar(unittest.TestCase):
             varname, block_id, size = block_str.split(":")
             self.assertEqual(int(size), all_sizes[i])
 
+    def test_1k(self):
+        shapes = [[3, 5], [1024], [28, 784], [8, 1020], [800, 10]]
+        expected_sizes = [
+            [15], [1024],
+            [2352, 2352, 2352, 2352, 2352, 2352, 2352, 2352, 2352, 784],
+            [2040, 2040, 2040, 2040],
+            [1150, 1150, 1150, 1150, 1150, 1150, 1100]
+        ]
+
+        self.check_slice_output(shapes, expected_sizes, 1024)
+
+    def test_check_output_8k(self):
+        shapes = [[3, 5], [1024], [28, 784], [8, 1020], [800, 10],
+                  [6, 33, 33, 33]]
+        expected_sizes = [[15], [1024], [10976, 10976], [8160], [8000],
+                          [35937, 35937, 35937, 35937, 35937, 35937]]
+
+        self.check_slice_output(shapes, expected_sizes, 8192)
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/testsuite.py b/python/paddle/fluid/tests/unittests/testsuite.py
new file mode 100644
index 0000000000000000000000000000000000000000..1dc94a80c9d3999d34fdf0edbf82ffe297bd95d7
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/testsuite.py
@@ -0,0 +1,182 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+
+import paddle.fluid.core as core
+from paddle.fluid.op import Operator
+
+
+def as_lodtensor(np_array, lod, place):
+    tensor = core.LoDTensor()
+    tensor.set(np_value, place)
+    if lod is not None:
+        tensor.set_lod(lod)
+    return tensor
+
+
+def create_op(scope, op_type, inputs, outputs, attrs):
+    kwargs = dict()
+
+    op_maker = core.op_proto_and_checker_maker
+    op_role_attr_name = op_maker.kOpRoleAttrName()
+
+    if op_role_attr_name not in attrs:
+        attrs[op_role_attr_name] = int(op_maker.OpRole.Forward)
+
+    def __create_var__(name, var_name):
+        scope.var(var_name).get_tensor()
+        kwargs[name].append(var_name)
+
+    for in_name, in_dup in Operator.get_op_inputs(op_type):
+        if in_name in inputs:
+            kwargs[in_name] = []
+            if in_dup:
+                sub_in = inputs[in_name]
+                for item in sub_in:
+                    sub_in_name, _ = item[0], item[1]
+                    __create_var__(in_name, sub_in_name)
+            else:
+                __create_var__(in_name, in_name)
+
+    for out_name, out_dup in Operator.get_op_outputs(op_type):
+        if out_name in outputs:
+            kwargs[out_name] = []
+            if out_dup:
+                sub_out = outputs[out_name]
+                for item in sub_out:
+                    sub_out_name, _ = item[0], item[1]
+                    __create_var__(out_name, sub_out_name)
+            else:
+                __create_var__(out_name, out_name)
+
+    for attr_name in Operator.get_op_attr_names(op_type):
+        if attr_name in attrs:
+            kwargs[attr_name] = attrs[attr_name]
+
+    return Operator(op_type, **kwargs)
+
+
+def set_input(scope, op, inputs, place):
+    def __set_input__(var_name, var):
+        if isinstance(var, tuple) or isinstance(var, np.ndarray):
+            tensor = scope.find_var(var_name).get_tensor()
+            if isinstance(var, tuple):
+                tensor.set_lod(var[1])
+                var = var[0]
+            tensor.set_dims(var.shape)
+            tensor.set(var, place)
+        elif isinstance(var, float):
+            scope.find_var(var_name).set_float(var)
+        elif isinstance(var, int):
+            scope.find_var(var_name).set_int(var)
+
+    for in_name, in_dup in Operator.get_op_inputs(op.type()):
+        if in_name in inputs:
+            if in_dup:
+                sub_in = inputs[in_name]
+                for item in sub_in:
+                    sub_in_name, sub_in_val = item[0], item[1]
+                    __set_input__(sub_in_name, sub_in_val)
+            else:
+                __set_input__(in_name, inputs[in_name])
+
+
+def append_input_output(block, op_proto, np_list, is_input, dtype):
+    '''Insert VarDesc and generate Python variable instance'''
+    proto_list = op_proto.inputs if is_input else op_proto.outputs
+
+    def create_var(block, name, np_list, var_proto):
+        dtype = None
+        shape = None
+        lod_level = None
+        if name not in np_list:
+            assert var_proto.intermediate, "{} not found".format(name)
+        else:
+            np_value = np_list[name]
+            if isinstance(np_value, tuple):
+                dtype = np_value[0].dtype
+                # output shape, lod should be infered from input.
+                if is_input:
+                    shape = list(np_value[0].shape)
+                    lod_level = len(np_value[1])
+            else:
+                dtype = np_value.dtype
+                if is_input:
+                    shape = list(np_value.shape)
+                    lod_level = 0
+        return block.create_var(
+            dtype=dtype, shape=shape, lod_level=lod_level, name=name)
+
+    var_dict = {}
+    for var_proto in proto_list:
+        var_name = str(var_proto.name)
+        if is_input:
+            if (var_name not in np_list) and var_proto.dispensable:
+                continue
+            assert (var_name in np_list) or (var_proto.dispensable), \
+                "Missing {} as input".format(var_name)
+        if var_proto.duplicable:
+            assert isinstance(np_list[var_name], list), \
+                "Duplicable {} should be set as list".format(var_name)
+            var_list = []
+            for (name, np_value) in np_list[var_name]:
+                var_list.append(
+                    create_var(block, name, {name: np_value}, var_proto))
+            var_dict[var_name] = var_list
+        else:
+            var_dict[var_name] = create_var(block, var_name, np_list, var_proto)
+
+    return var_dict
+
+
+def append_loss_ops(block, output_names):
+    mean_inputs = map(block.var, output_names)
+    # for item in mean_inputs:
+    #     print(item)
+    #     print("Item", item.dtype)
+
+    if len(mean_inputs) == 1:
+        loss = block.create_var(dtype=mean_inputs[0].dtype, shape=[1])
+        op = block.append_op(
+            inputs={"X": mean_inputs}, outputs={"Out": loss}, type='mean')
+        op.desc.infer_var_type(block.desc)
+        op.desc.infer_shape(block.desc)
+    else:
+        avg_sum = []
+        for cur_loss in mean_inputs:
+            cur_avg_loss = block.create_var(dtype=cur_loss.dtype, shape=[1])
+            op = block.append_op(
+                inputs={"X": [cur_loss]},
+                outputs={"Out": [cur_avg_loss]},
+                type="mean")
+            op.desc.infer_var_type(block.desc)
+            op.desc.infer_shape(block.desc)
+            avg_sum.append(cur_avg_loss)
+
+        loss_sum = block.create_var(dtype=avg_sum[0].dtype, shape=[1])
+        op_sum = block.append_op(
+            inputs={"X": avg_sum}, outputs={"Out": loss_sum}, type='sum')
+        op_sum.desc.infer_var_type(block.desc)
+        op_sum.desc.infer_shape(block.desc)
+
+        loss = block.create_var(dtype=loss_sum.dtype, shape=[1])
+        op_loss = block.append_op(
+            inputs={"X": loss_sum},
+            outputs={"Out": loss},
+            type='scale',
+            attrs={'scale': 1.0 / float(len(avg_sum))})
+        op_loss.desc.infer_var_type(block.desc)
+        op_loss.desc.infer_shape(block.desc)
+    return loss
diff --git a/python/paddle/fluid/tests/unittests/transpiler_test.py b/python/paddle/fluid/tests/unittests/transpiler_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..d84c5d9c41c705cf6d14cc0b5a8c692b0d646337
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/transpiler_test.py
@@ -0,0 +1,73 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+import paddle.fluid.layers as layers
+
+
+class TranspilerTest(unittest.TestCase):
+    @classmethod
+    def setUpClass(self):
+        self.trainer_id = 0
+        self.trainers = 2
+        self.pservers = 2
+        self.pserver_eps = "127.0.0.1:6174,127.0.0.1:6175"
+
+    def net_conf(self):
+        x = fluid.layers.data(name='x', shape=[1000], dtype='float32')
+
+        y_predict = fluid.layers.fc(input=x,
+                                    size=1000,
+                                    act=None,
+                                    param_attr=fluid.ParamAttr(name='fc_w'))
+
+        y = fluid.layers.data(name='y', shape=[1], dtype='float32')
+
+        cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+        avg_cost = fluid.layers.mean(cost)
+        sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.1)
+
+        optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost)
+        return optimize_ops, params_grads
+
+    def get_main_program(self):
+        main = fluid.Program()
+
+        with fluid.program_guard(main):
+            self.net_conf()
+
+        return main
+
+    def get_trainer(self):
+        return self._transpiler_instance().get_trainer_program()
+
+    def get_pserver(self, ep):
+        t = self._transpiler_instance()
+        pserver = t.get_pserver_program(ep)
+        startup = t.get_startup_program(ep, pserver)
+        return pserver, startup
+
+    def _transpiler_instance(self):
+        main = self.get_main_program()
+        t = fluid.DistributeTranspiler()
+        t.transpile(
+            self.trainer_id,
+            program=main,
+            pservers=self.pserver_eps,
+            trainers=self.trainers)
+        return t
diff --git a/python/paddle/fluid/trainer.py b/python/paddle/fluid/trainer.py
index 1cbecd69e59882212d623b7fcaf741f0370a7a15..cdacb419863518cc0606903ed8eb79f0d2bc9e40 100644
--- a/python/paddle/fluid/trainer.py
+++ b/python/paddle/fluid/trainer.py
@@ -12,17 +12,18 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import contextlib
 import os
+
 import core
-import framework
-import executor
+
 import data_feeder
-import contextlib
+import executor
+import framework
 import io
-import transpiler
-
 # optimizer is same as the parameter of Trainer.__init__. Rename it to opt_module
 import optimizer as opt_module
+import parallel_executor
 from transpiler import distribute_transpiler
 
 __all__ = [
@@ -48,43 +49,86 @@ class BeginStepEvent(object):
     def __init__(self, epoch_id, step_id):
         self.epoch = epoch_id
         self.step = step_id
+        self.fetch_metrics = True
 
 
 class EndStepEvent(object):
-    def __init__(self, epoch_id, step_id):
+    def __init__(self, epoch_id, step_id, metrics):
         self.epoch = epoch_id
         self.step = step_id
+        self.metrics = metrics
+
+
+def check_and_get_place(place):
+    """
+    Check the type of place or get the default place
+    Args:
+        place(None|core.CUDAPlace|core.CPUPlace): the place that trainer will be executed on.
+
+    Raises:
+        TypeError if the type mismatched.
+
+    Returns:
+        the original place if it is not None.
+        if fluid is compiled with CUDA, returns CUDAPlace(0) by default.
+        Otherwise returns CPUPlace by default.
+    """
+    if place is None:
+        if core.is_compiled_with_cuda():
+            return core.CUDAPlace(0)
+        else:
+            return core.CPUPlace()
+    else:
+        if not isinstance(place, core.CUDAPlace) and not isinstance(
+                place, core.CPUPlace):
+            raise TypeError("Place should be either CUDAPlace or CPUPlace")
+        return place
 
 
 class Trainer(object):
     """
 
     Args:
-        program_func(callable): A function which will return loss. The loss must be a scaler.
-        optimizer(optimizer.Optimizer): The optimizer should be an instance of Optimizer
+        train_func(callable): A function which will return loss. The loss must be a scalar.
+        optimizer_func(callable): A function that returns an Optimizer object.
         place: The device place of this trainer.
     """
 
-    def __init__(self, program_func, optimizer, param_path=None, place=None):
+    def __init__(self,
+                 train_func,
+                 optimizer_func,
+                 param_path=None,
+                 place=None,
+                 parallel=False):
+        self.__stop = False
+        self.parallel = parallel
         # 1. we need to generate a framework.Program by calling
         # program_func. Reference: fluid.program_guard in
         # test_word2vec.py
+
         self.scope = core.Scope()
 
         self.startup_program = framework.Program()
         self.train_program = framework.Program()
 
         with framework.program_guard(self.train_program, self.startup_program):
-            loss = program_func()
+            program_func_outs = train_func()
+            self.train_func_outputs = program_func_outs if isinstance(
+                program_func_outs, list) else [program_func_outs]
+            self.test_program = self.train_program.clone()
+
+            # The fisrt element of program_func_outs is loss.
+            loss = self.train_func_outputs[0]
+
+            optimizer = optimizer_func()
             if not isinstance(optimizer, opt_module.Optimizer):
                 raise TypeError(
                     "The optimizer should be an instance of Optimizer")
-
             optimize_ops, params_grads = optimizer.minimize(loss)
 
-        self.place = Trainer._check_and_get_place(place)
+        self.place = check_and_get_place(place)
 
-        self.dist_transpile_if_necessary(optimize_ops, params_grads)
+        self._dist_transpile_if_necessary(optimize_ops, params_grads)
 
         # 2. move the default_main_program to self.program and run the
         # default_startup program on an empty core.Scope()
@@ -97,7 +141,40 @@ class Trainer(object):
             # load params from param_path into scope
             io.load_persistables(exe, dirname=param_path)
 
-    def dist_transpile_if_necessary(self, optimize_ops, params_grads):
+    def _transpile_nccl2_dist(self):
+        # PADDLE_TRAINER_IPS
+        if "PADDLE_TRAINER_IPS" not in os.environ:
+            self.nccl_id_var = None
+        else:
+            self.trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
+            port = os.getenv("PADDLE_PSERVER_PORT")
+            worker_ips = os.getenv("PADDLE_TRAINER_IPS")
+            worker_endpoints = []
+            for ip in worker_ips.split(","):
+                worker_endpoints.append(':'.join([ip, port]))
+            self.num_trainers = len(worker_endpoints)
+            current_endpoint = os.getenv("POD_IP") + ":" + port
+            worker_endpoints.remove(current_endpoint)
+            # TODO(wuyi): use self.nccl_id_var, self.num_trainers and self.trainer_id
+            # in ParallelExecutor to start
+            # distributed training using NCCL2
+            self.nccl_id_var = self.startup_program.global_block().create_var(
+                name="NCCLID", persistable=True, type=core.VarDesc.VarType.RAW)
+            self.startup_program.global_block().append_op(
+                type="gen_nccl_id",
+                inputs={},
+                outputs={"NCCLID": self.nccl_id_var},
+                attrs={
+                    "endpoint": current_endpoint,
+                    "endpoint_list": worker_endpoints,
+                    "trainer_id": self.trainer_id
+                })
+
+    def _dist_transpile_if_necessary(self, optimize_ops, params_grads):
+        self._transpile_nccl2_dist()
+        if self.nccl_id_var != None:
+            return
+
         if "PADDLE_TRAINING_ROLE" not in os.environ:
             return
 
@@ -135,12 +212,13 @@ class Trainer(object):
                     'TRAINING_ROLE environment variable must be either TRAINER or PSERVER'
                 )
 
-    def train(self,
-              num_epochs,
-              event_handler,
-              reader=None,
-              parallel=False,
-              feed_order=None):
+    def stop(self):
+        """
+        stop training
+        """
+        self.__stop = True
+
+    def train(self, num_epochs, event_handler, reader=None, feed_order=None):
         """
         Train the model.
 
@@ -148,60 +226,43 @@ class Trainer(object):
             num_epochs: The number of epoch. An epoch will process all data in reader
             event_handler: The event handler. A function with type (ev:Event)->void
             reader:
-            parallel: True if use multi-CPUs or multi-GPUs
             feed_order: Feeding order of reader. None will following the defining
                 order in program
 
         Returns:
 
         """
-        if parallel:
-            raise NotImplementedError(
-                "Parallel Executor version of trainer is not implemented")
-
         training_role = os.getenv("PADDLE_TRAINING_ROLE", "")
         if training_role == "PSERVER":
             with self._prog_and_scope_guard():
                 exe = executor.Executor(self.place)
                 exe.run()
                 return
+        if self.parallel:
+            self._train_by_parallel_executor(num_epochs, event_handler, reader,
+                                             feed_order)
+        else:
+            self._train_by_executor(num_epochs, event_handler, reader,
+                                    feed_order)
 
-        self._train_by_executor(num_epochs, event_handler, reader, feed_order)
-
-    def test(self, reader):
-        pass
-
-    def save_params(self, param_path):
-        # reference: save_persistables in io.py
-        exe = executor.Executor(self.place)
-        io.save_persistables(
-            exe, dirname=param_path, main_program=self.startup_program)
-
-    @staticmethod
-    def _check_and_get_place(place):
+    def test(self, reader, feed_order):
         """
-        Check the type of place or get the default place
+        Test the model on given test data
+
         Args:
-            place(None|core.CUDAPlace|core.CPUPlace): the place that trainer will be executed on.
+            reader: The reader that yields test data.
+            feed_order: Feeding order of reader. None will following the defining
+                order in program
+        """
 
-        Raises:
-            TypeError if the type mismatched.
+        return self._test_by_executor(reader, feed_order,
+                                      self.train_func_outputs)
 
-        Returns:
-            the original place if it is not None.
-            if fluid is compiled with CUDA, returns CUDAPlace(0) by default.
-            Otherwise returns CPUPlace by default.
-        """
-        if place is None:
-            if core.is_compiled_with_cuda():
-                return core.CUDAPlace(0)
-            else:
-                return core.CPUPlace()
-        else:
-            if not isinstance(place, core.CUDAPlace) and not isinstance(
-                    place, core.CPUPlace):
-                raise TypeError("Place should be either CUDAPlace or CPUPlace")
-            return place
+    def save_params(self, param_path):
+        # reference: save_persistables in io.py
+        with self._prog_and_scope_guard():
+            exe = executor.Executor(self.place)
+            io.save_persistables(exe, dirname=param_path)
 
     @contextlib.contextmanager
     def _prog_and_scope_guard(self):
@@ -225,26 +286,88 @@ class Trainer(object):
 
         """
         with self._prog_and_scope_guard():
+            feed_var_list = build_feed_var_list(self.train_program, feed_order)
+            feeder = data_feeder.DataFeeder(
+                feed_list=feed_var_list, place=self.place)
             exe = executor.Executor(self.place)
-            if feed_order is None:
-                feed_var_list = [
-                    var
-                    for var in self.train_program.global_block(
-                    ).vars.itervalues()
-                    if hasattr(var, 'is_data') and var.is_data
-                ]
-            else:
-                feed_var_list = [
-                    self.train_program.global_block().var(var_name)
-                    for var_name in feed_order
-                ]
-
+            reader = feeder.decorate_reader(reader, multi_devices=False)
+            self._train_by_any_executor(event_handler, exe, num_epochs, reader)
+
+    def _train_by_any_executor(self, event_handler, exe, num_epochs, reader):
+        for epoch_id in range(num_epochs):
+            event_handler(BeginEpochEvent(epoch_id))
+            for step_id, data in enumerate(reader()):
+                if self.__stop:
+                    return
+                begin_event = BeginStepEvent(epoch_id, step_id)
+                event_handler(begin_event)
+                if begin_event.fetch_metrics:
+                    metrics = exe.run(feed=data,
+                                      fetch_list=[
+                                          var.name
+                                          for var in self.train_func_outputs
+                                      ])
+                else:
+                    metrics = exe.run(feed=data, fetch_list=[])
+                event_handler(EndStepEvent(epoch_id, step_id, metrics))
+            event_handler(EndEpochEvent(epoch_id))
+
+    def _test_by_executor(self, reader, feed_order, fetch_list):
+        with executor.scope_guard(self.scope):
+            feed_var_list = build_feed_var_list(self.test_program, feed_order)
+            feeder = data_feeder.DataFeeder(
+                feed_list=feed_var_list, place=self.place)
+            exe = executor.Executor(self.place)
+            accumulated = len(fetch_list) * [0]
+            count = 0
+            for data in reader():
+                outs = exe.run(program=self.test_program,
+                               feed=feeder.feed(data),
+                               fetch_list=fetch_list)
+                accumulated = [x[0] + x[1][0] for x in zip(accumulated, outs)]
+                count += 1
+
+            return [x / count for x in accumulated]
+
+    def _train_by_parallel_executor(self, num_epochs, event_handler, reader,
+                                    feed_order):
+        with self._prog_and_scope_guard():
+            pe = self._get_or_create_parallel_executor()
+            feed_var_list = build_feed_var_list(self.train_program, feed_order)
             feeder = data_feeder.DataFeeder(
                 feed_list=feed_var_list, place=self.place)
-            for epoch_id in range(num_epochs):
-                event_handler(BeginEpochEvent(epoch_id))
-                for step_id, data in enumerate(reader()):
-                    event_handler(BeginStepEvent(epoch_id, step_id))
-                    exe.run(feed=feeder.feed(data), fetch_list=[])
-                    event_handler(EndStepEvent(epoch_id, step_id))
-                event_handler(EndEpochEvent(epoch_id))
+            reader = feeder.decorate_reader(reader, multi_devices=True)
+            self._train_by_any_executor(event_handler, pe, num_epochs, reader)
+
+    def _get_parallel_executor(self):
+        return getattr(self, 'parallel_executor', None)
+
+    def _get_or_create_parallel_executor(self):
+        if self._get_parallel_executor() is None:
+            self.parallel_executor = parallel_executor.ParallelExecutor(
+                use_cuda=isinstance(self.place, core.CUDAPlace),
+                loss_name=self.train_func_outputs[0].name)
+        return self._get_parallel_executor()
+
+
+def build_feed_var_list(program, feed_order):
+    if not isinstance(program, framework.Program):
+        raise TypeError("The 'program' should be an object of Program")
+
+    if isinstance(feed_order, list):
+        feed_var_list = [
+            program.global_block().var(var_name) for var_name in feed_order
+        ]
+    else:
+        if not isinstance(feed_order, dict):
+            raise TypeError(
+                "The 'feed_order' should be either None, list or dict.")
+        if not sorted(feed_order.values()) == range(len(feed_order)):
+            raise ValueError(
+                "The values of 'feed_order' should be a permutation of [0, len(feed_order))"
+            )
+        sorted_pair_list = sorted(feed_order.items(), key=lambda item: item[1])
+        feed_var_list = [
+            program.global_block().var(pair[0]) for pair in sorted_pair_list
+        ]
+    return feed_var_list
diff --git a/python/paddle/fluid/transpiler/__init__.py b/python/paddle/fluid/transpiler/__init__.py
index 6d3c1b947f4acb1335b25e6eb0099d5d532c895a..cf18090f71f34be5105498f5846dbcdf15ab2e3f 100644
--- a/python/paddle/fluid/transpiler/__init__.py
+++ b/python/paddle/fluid/transpiler/__init__.py
@@ -11,12 +11,13 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
 from distribute_transpiler import DistributeTranspiler
 from inference_transpiler import InferenceTranspiler
 from memory_optimization_transpiler import memory_optimize, release_memory
-from distribute_transpiler_simple import SimpleDistributeTranspiler
+from ps_dispatcher import HashName, RoundRobin
 
 __all__ = [
-    "DistributeTranspiler", "InferenceTranspiler", "SimpleDistributeTranspiler",
-    "memory_optimize", "release_memory"
+    "DistributeTranspiler", "InferenceTranspiler", "memory_optimize",
+    "release_memory", "HashName", "RoundRobin"
 ]
diff --git a/python/paddle/fluid/transpiler/details/__init__.py b/python/paddle/fluid/transpiler/details/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..dc597c33849dc06cc975b245099672f64c3539d3
--- /dev/null
+++ b/python/paddle/fluid/transpiler/details/__init__.py
@@ -0,0 +1,16 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from program_utils import *
+from ufind import *
diff --git a/python/paddle/fluid/transpiler/details/program_utils.py b/python/paddle/fluid/transpiler/details/program_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..f10b496306a002ee131d01798a0698b807d379ca
--- /dev/null
+++ b/python/paddle/fluid/transpiler/details/program_utils.py
@@ -0,0 +1,37 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+def delete_ops(block, ops):
+    try:
+        start = list(block.ops).index(ops[0])
+        end = list(block.ops).index(ops[-1])
+        [block.remove_op(start) for _ in xrange(end - start + 1)]
+    except Exception, e:
+        raise e
+    block.program.sync_with_cpp()
+
+
+def find_op_by_input_arg(block, arg_name):
+    for index, op in enumerate(block.ops):
+        if arg_name in op.input_arg_names:
+            return index
+    return -1
+
+
+def find_op_by_output_arg(block, arg_name):
+    for index, op in enumerate(block.ops):
+        if arg_name in op.output_arg_names:
+            return index
+    return -1
diff --git a/python/paddle/fluid/transpiler/details/ufind.py b/python/paddle/fluid/transpiler/details/ufind.py
new file mode 100644
index 0000000000000000000000000000000000000000..0e30d0e3f9c5712c494daf17b2b4bcec86f69c23
--- /dev/null
+++ b/python/paddle/fluid/transpiler/details/ufind.py
@@ -0,0 +1,64 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+class UnionFind(object):
+    """ Union-find data structure.
+
+    Union-find is a data structure that keeps track of a set of elements partitioned
+    into a number of disjoint (non-overlapping) subsets.
+
+    Reference:
+    https://en.wikipedia.org/wiki/Disjoint-set_data_structure
+
+    Args:
+      elements(list): The initialize element list.
+    """
+
+    def __init__(self, elementes=None):
+        self._parents = []  # index -> parent index
+        self._index = {}  # element -> index
+        self._curr_idx = 0
+        if not elementes:
+            elementes = []
+        for ele in elementes:
+            self._parents.append(self._curr_idx)
+            self._index.update({ele: self._curr_idx})
+            self._curr_idx += 1
+
+    def find(self, x):
+        # Find the root index of given element x,
+        # execute the path compress while findind the root index
+        if not x in self._index:
+            return -1
+        idx = self._index[x]
+        while idx != self._parents[idx]:
+            t = self._parents[idx]
+            self._parents[idx] = self._parents[t]
+            idx = t
+        return idx
+
+    def union(self, x, y):
+        # Union two given element
+        x_root = self.find(x)
+        y_root = self.find(y)
+
+        if x_root == y_root:
+            return
+        self._parents[x_root] = y_root
+
+    def is_connected(self, x, y):
+        # If two given elements have the same root index,
+        # then they are connected.
+        return self.find(x) == self.find(y)
diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py
index 640ac9f085e6dc83bb04faafadf4846089ad3e29..27992df462ffd00ddf445538cc508b4232712481 100644
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -11,18 +11,49 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+"""
+Transpile the program to distributed data-parallelism programs.
+The main_program will be transformed to use a remote parameter server
+to do parameter optimization. And the optimization graph will be put
+into a parameter server program.
+
+Use different methods to split trainable variables to different
+parameter servers.
+
+Steps to transpile trainer:
+1. split variable to multiple blocks, aligned by product(dim[1:]) (width).
+2. rename splited grad variables to add trainer_id suffix ".trainer_%d".
+3. modify trainer program add split_op to each grad variable.
+4. append send_op to send splited variables to server and fetch
+    params(splited blocks or origin param) from server.
+5. append concat_op to merge splited blocks to update local weights.
+
+Steps to transpile pserver:
+1. create new program for parameter server.
+2. create params and grad variables that assigned to current server instance.
+3. create a sub-block in the server side program
+4. append ops that should run on current server instance.
+5. add listen_and_serv op
+"""
 
 from __future__ import print_function
 
 import math
+import numpy as np
 
-import distributed_splitter as splitter
-from .. import core
-from ..framework import Program, default_main_program, Variable, Parameter
+from ps_dispatcher import RoundRobin, HashName, PSDispatcher
+from .. import core, framework
+from ..framework import Program, default_main_program, \
+                        default_startup_program, \
+                        Variable, Parameter, grad_var_name
+from details import *
 
 LOOKUP_TABLE_TYPE = "lookup_table"
 LOOKUP_TABLE_GRAD_TYPE = "lookup_table_grad"
-RPC_CLIENT_VAR_NAME = "RPC_CLIENT_VAR"
+OP_ROLE_VAR_ATTR_NAME = core.op_proto_and_checker_maker.kOpRoleVarAttrName()
+RPC_OP_ROLE_ATTR_NAME = op_role_attr_name = core.op_proto_and_checker_maker.kOpRoleAttrName(
+)
+RPC_OP_ROLE_ATTR_VALUE = core.op_proto_and_checker_maker.OpRole.RPC
 
 
 class VarBlock:
@@ -36,85 +67,37 @@ class VarBlock:
         return "%s:%d:%d" % (self.varname, self.offset, self.size)
 
 
-class UnionFind(object):
-    """ Union-find data structure.
-
-    Union-find is a data structure that keeps track of a set of elements partitioned
-    into a number of disjoint (non-overlapping) subsets.
-
-    Reference:
-    https://en.wikipedia.org/wiki/Disjoint-set_data_structure
-
-    Args:
-      elements(list): The initialize element list.
-    """
-
-    def __init__(self, elementes=None):
-        self._parents = []  # index -> parent index
-        self._index = {}  # element -> index
-        self._curr_idx = 0
-        if not elementes:
-            elementes = []
-        for ele in elementes:
-            self._parents.append(self._curr_idx)
-            self._index.update({ele: self._curr_idx})
-            self._curr_idx += 1
-
-    def find(self, x):
-        # Find the root index of given element x,
-        # execute the path compress while findind the root index
-        if not x in self._index:
-            return -1
-        idx = self._index[x]
-        while idx != self._parents[idx]:
-            t = self._parents[idx]
-            self._parents[idx] = self._parents[t]
-            idx = t
-        return idx
-
-    def union(self, x, y):
-        # Union two given element
-        x_root = self.find(x)
-        y_root = self.find(y)
-
-        if x_root == y_root:
-            return
-        self._parents[x_root] = y_root
-
-    def is_connected(self, x, y):
-        # If two given elements have the same root index,
-        # then they are connected.
-        return self.find(x) == self.find(y)
-
-
 def same_or_split_var(p_name, var_name):
     return p_name == var_name or p_name.startswith(var_name + ".block")
 
 
-def split_dense_variable(var_list,
-                         pserver_count,
-                         min_block_size=1024,
-                         max_block_size=1048576):
+def slice_variable(var_list, slice_count, min_block_size=8192):
     """
-        We may need to split dense tensor to one or more blocks and put
-        them equally onto parameter server. One block is a sub-tensor
-        aligned by dim[0] of the tensor.
-
-        We need to have a minimal block size so that the calculations in
-        the parameter server side can gain better performance. By default
-        minimum block size is 1024. The max block size is used to prevent
-        very large blocks that may cause send error.
-        :return: A list of VarBlocks. Each VarBlock specifies a shard of
-           the var.
+    We may need to split dense tensor to one or more blocks and put
+    them equally onto parameter server. One block is a sub-tensor
+    aligned by dim[0] of the tensor.
+
+    We need to have a minimal block size so that the calculations in
+    the parameter server side can gain better performance. By default
+    minimum block size 8K elements (maybe 16bit or 32bit or 64bit).
+
+    Args:
+        var_list (list): List of variables.
+        slice_count (int): Numel of count that variables will be sliced, which
+            could be the pserver services' count.
+        min_block_size (int): Minimum splitted block size.
+    Returns:
+        blocks (list[(varname, block_id, current_block_size)]): A list
+            of VarBlocks. Each VarBlock specifies a shard of the var.
     """
     blocks = []
     for var in var_list:
-        split_count = pserver_count
+        split_count = slice_count
         var_numel = reduce(lambda x, y: x * y, var.shape)
         max_pserver_count = int(math.floor(var_numel / float(min_block_size)))
         if max_pserver_count == 0:
             max_pserver_count = 1
-        if max_pserver_count < pserver_count:
+        if max_pserver_count < slice_count:
             split_count = max_pserver_count
         block_size = int(math.ceil(var_numel / float(split_count)))
 
@@ -134,84 +117,15 @@ def split_dense_variable(var_list,
     return blocks
 
 
-def delete_ops(block, ops):
-    try:
-        start = list(block.ops).index(ops[0])
-        end = list(block.ops).index(ops[-1])
-        [block.remove_op(start) for _ in xrange(end - start + 1)]
-    except Exception, e:
-        raise e
-    block.program.sync_with_cpp()
-
-
 class DistributeTranspiler:
-    def transpile(self,
-                  trainer_id,
-                  program=None,
-                  pservers="127.0.0.1:6174",
-                  trainers=1,
-                  split_method=splitter.round_robin,
-                  sync_mode=True):
-        """
-            Transpile the program to distributed data-parallelism programs.
-            The main_program will be transformed to use a remote parameter server
-            to do parameter optimization. And the optimization graph will be put
-            into a parameter server program.
-
-            Use different methods to split trainable variables to different
-            parameter servers.
-
-            Steps to transpile trainer:
-            1. split variable to multiple blocks, aligned by product(dim[1:]) (width).
-            2. rename splited grad variables to add trainer_id suffix ".trainer_%d".
-            3. modify trainer program add split_op to each grad variable.
-            4. append send_op to send splited variables to server and fetch
-               params(splited blocks or origin param) from server.
-            5. append concat_op to merge splited blocks to update local weights.
-
-            Steps to transpile pserver:
-            1. create new program for parameter server.
-            2. create params and grad variables that assigned to current server instance.
-            3. create a sub-block in the server side program
-            4. append ops that should run on current server instance.
-            5. add listen_and_serv op
-
-            :param trainer_id: one unique id for each trainer in a job.
-            :type trainer_id: int
-            :param program: program to transpile, default is default_main_program
-            :type program: Program
-            :param pservers: parameter server endpoints like "m1:6174,m2:6174"
-            :type pservers: string
-            :param trainers: total number of workers/trainers in the job
-            :type trainers: int
-            :param split_method: A function to determin how to split variables
-                to different servers equally.
-            :type split_method: function
-            :type sync_mode: boolean default True
-            :param sync_mode: if sync_mode is set True, it means that dist transpiler
-            will transpile the program into sync_mode pserver and trainer program.
-        """
-        assert (callable(split_method))
-        if program is None:
-            program = default_main_program()
-        self.origin_program = program
-        self.trainer_num = trainers
-        self.sync_mode = sync_mode
-        # TODO(typhoonzero): currently trainer_id is fetched from cluster system
-        # like Kubernetes, we should port this to use etcd later when developing
-        # fluid distributed training with fault-tolerance.
-        self.trainer_id = trainer_id
-        pserver_endpoints = pservers.split(",")
-        self.pserver_endpoints = pserver_endpoints
-        self.optimize_ops, params_grads = self._get_optimize_pass()
-
+    def _has_distributed_lookup_table(self):
         # process lookup_table_op
         # 1. check all lookup_table_op is distributed
         # 2. check all lookup_table_op share the same table.
         distributed_lookup_table_ops = []
         # support only one distributed_lookup_table now
         self.table_name = None
-        for op in program.global_block().ops:
+        for op in self.origin_program.global_block().ops:
             if op.type == LOOKUP_TABLE_TYPE:
                 if op.attrs['is_distributed'] is True:
                     if self.table_name is None:
@@ -224,91 +138,242 @@ class DistributeTranspiler:
                     if self.table_name is not None:
                         assert op.input("W")[0] != self.table_name
 
-        self.has_distributed_lookup_table = len(
-            distributed_lookup_table_ops) > 0
-
-        # step1: For large parameters and gradients, split them into smaller
-        # blocks.
-        param_list = []
-        grad_list = []
-        for p, g in params_grads:
-            # skip parameter marked not trainable
-            if type(p) == Parameter and p.trainable == False:
-                continue
-            param_list.append(p)
-            grad_list.append(g)
+        return len(distributed_lookup_table_ops) > 0
 
+    def _update_dist_lookup_table_vars(self, param_list, grad_list,
+                                       params_grads):
+        # TODO(wuyi): put find a way to put dist lookup table stuff all together.
+        # update self.table_param_grad and self.trainer_side_table_grad_list
+        program = self.origin_program
         if self.has_distributed_lookup_table:
             param_list = [
                 param for param in param_list if param.name != self.table_name
             ]
             grad_list = [
                 grad for grad in grad_list
-                if grad.name != framework.grad_var_name(self.table_name)
+                if grad.name != grad_var_name(self.table_name)
             ]
             self.table_param_grad = [
                 param_grad for param_grad in params_grads
                 if param_grad[0].name == self.table_name
             ][0]
             table_grad_var = self.table_param_grad[1]
-            self.table_grad_list = [
-                program.global_block().create_var(
-                    name="%s.trainer_%d.pserver_%d" %
-                    (table_grad_var.name, trainer_id, index),
-                    type=table_grad_var.type,
-                    shape=table_grad_var.shape,
-                    dtype=table_grad_var.dtype)
-                for index in range(len(self.pserver_endpoints))
-            ]
+            if self.sync_mode:
+                self.trainer_side_table_grad_list = [
+                    program.global_block().create_var(
+                        name="%s.trainer_%d.pserver_%d" %
+                        (table_grad_var.name, self.trainer_id, index),
+                        type=table_grad_var.type,
+                        shape=table_grad_var.shape,
+                        dtype=table_grad_var.dtype)
+                    for index in range(len(self.pserver_endpoints))
+                ]
+            else:
+                self.trainer_side_table_grad_list = [
+                    program.global_block().create_var(
+                        name="%s.pserver_%d" % (table_grad_var.name, index),
+                        type=table_grad_var.type,
+                        shape=table_grad_var.shape,
+                        dtype=table_grad_var.dtype)
+                    for index in range(len(self.pserver_endpoints))
+                ]
+
+    def _init_splited_vars(self, slice_var_up):
+        # update these mappings for further transpile:
+        # 1. param_var_mapping: param var name -> [splited params vars]
+        # 2. grad_var_mapping: grad var name -> [splited grads vars]
+        # 3. grad_param_mapping: grad.blockx -> param.blockx
+        # 4. param_grad_ep_mapping: ep -> {"params": [], "grads": []}
+
+        param_list = []
+        grad_list = []
+        param_grad_set = set()
+        for p, g in self.params_grads:
+            # skip parameter marked not trainable
+            if type(p) == Parameter and p.trainable == False:
+                continue
+            if p.name not in param_grad_set:
+                param_list.append(p)
+                param_grad_set.add(p.name)
+            if g.name not in param_grad_set:
+                grad_list.append(g)
+                param_grad_set.add(g.name)
+
+        self._update_dist_lookup_table_vars(param_list, grad_list,
+                                            self.params_grads)
+
+        if slice_var_up:
+            # when we slice var up into blocks, we will slice the var according to
+            # pserver services' count. A pserver may have two or more listening ports.
+            grad_blocks = slice_variable(grad_list, len(self.pserver_endpoints))
+            param_blocks = slice_variable(param_list,
+                                          len(self.pserver_endpoints))
+        else:
+            # when we do NOT slice var up into blocks, we will always slice params
+            # grads into one block.
+            grad_blocks = slice_variable(grad_list, 1)
+            param_blocks = slice_variable(param_list, 1)
+        assert (len(grad_blocks) == len(param_blocks))
+
+        # origin_varname -> [splited_var]
+        self.param_var_mapping = self._create_vars_from_blocklist(
+            self.origin_program, param_blocks)
+        self.grad_var_mapping = self._create_vars_from_blocklist(
+            self.origin_program,
+            grad_blocks,
+            add_trainer_suffix=self.trainer_num > 1)
+        self.grad_param_mapping = dict()
+        for g, p in zip(grad_blocks, param_blocks):
+            g_name, g_bid, _ = g.split(":")
+            p_name, p_bid, _ = p.split(":")
+            self.grad_param_mapping[self.grad_var_mapping[g_name][int(g_bid)]] =  \
+                    self.param_var_mapping[p_name][int(p_bid)]
 
-        grad_blocks = split_dense_variable(grad_list, len(pserver_endpoints))
-        param_blocks = split_dense_variable(param_list, len(pserver_endpoints))
-        # step2: Create new vars for the parameters and gradients blocks and
-        # add ops to do the split.
-        grad_var_mapping = self._append_split_op(program, grad_blocks)
-        param_var_mapping = self._create_vars_from_blocklist(program,
-                                                             param_blocks)
-        # step3: Add gradients as send op inputs and parameters as send
-        # op outputs.
-        send_inputs = []
-        send_outputs = []
-        for b in grad_blocks:  # append by order
-            varname, block_id, _ = b.split(":")
-            send_inputs.append(grad_var_mapping[varname][int(block_id)])
-        for b in param_blocks:
-            varname, block_id, _ = b.split(":")
-            send_outputs.append(param_var_mapping[varname][int(block_id)])
-        # let send_op know which endpoint to send which var to, eplist has the same
-        # order as send_inputs.
-        eplist = split_method(send_inputs, pserver_endpoints)
         # create mapping of endpoint -> split var to create pserver side program
         self.param_grad_ep_mapping = dict()
+        [
+            self.param_grad_ep_mapping.update({
+                ep: {
+                    "params": [],
+                    "grads": []
+                }
+            }) for ep in self.pserver_endpoints
+        ]
+
+    def transpile(self,
+                  trainer_id,
+                  program=None,
+                  pservers="127.0.0.1:6174",
+                  trainers=1,
+                  slice_var_up=True,
+                  split_method=RoundRobin,
+                  sync_mode=True):
+        """
+        :param trainer_id: one unique id for each trainer in a job.
+        :type trainer_id: int
+        :param program: program to transpile, default is default_main_program
+        :type program: Program
+        :param pservers: parameter server endpoints like "m1:6174,m2:6174"
+        :type pservers: string
+        :param trainers: total number of workers/trainers in the job
+        :type trainers: int
+        :param split_method: A function to determin how to split variables
+            to different servers equally.
+        :type split_method: function
+        :type sync_mode: boolean default True
+        :param sync_mode: if sync_mode is set True, it means that dist transpiler
+        will transpile the program into sync_mode pserver and trainer program.
+        """
+        assert (split_method.__bases__[0] == PSDispatcher)
+        if program is None:
+            program = default_main_program()
+        self.origin_program = program
+        self.trainer_num = trainers
+        self.sync_mode = sync_mode
+        self.trainer_id = trainer_id
+        pserver_endpoints = pservers.split(",")
+        self.pserver_endpoints = pserver_endpoints
+        self.optimize_ops, self.params_grads = self._get_optimize_pass()
+
+        ps_dispatcher = split_method(self.pserver_endpoints)
+        self.has_distributed_lookup_table = self._has_distributed_lookup_table()
+
+        # split and create vars, then put splited vars in dicts for later use.
+        self._init_splited_vars(slice_var_up)
+
+        # step 3.1: insert send op to send gradient vars to parameter servers
+        ps_dispatcher.reset()
+        send_vars = []
+
+        # in general cases, the number of pservers is times of 2, and this
+        # will lead to uneven distribution among weights and bias:
+        #       fc_w@GRAD_trainer_0, fc_w@GRAD_trainer_1 --> pserver1
+        #       fc_b@GRAD_trainer_0, fc_b@GRAD_trainer_1 --> pserver2
+        # shuffle the map will avoid the uneven distribution above
+        grad_var_mapping_items = self.grad_var_mapping.items()
+        if not slice_var_up:
+            np.random.shuffle(grad_var_mapping_items)
+
+        for orig_varname, splited_vars in grad_var_mapping_items:
+            eplist = ps_dispatcher.dispatch(splited_vars)
+
+            if not slice_var_up:
+                assert (len(splited_vars) == 1)
+
+            if len(splited_vars) == 1:
+                orig_varname = splited_vars[0].name
+                index = find_op_by_output_arg(program.global_block(),
+                                              orig_varname)
+            elif len(splited_vars) > 1:
+                orig_var = program.global_block().vars[orig_varname]
+                index = find_op_by_output_arg(program.global_block(),
+                                              orig_varname)
+                self._insert_split_op(program, orig_var, index, splited_vars)
+                index += 1
+            else:
+                AssertionError("Can not insert the send op by original "
+                               "variable name :", orig_varname)
+
+            program.global_block().insert_op(
+                index=index + 1,
+                type="send_vars",
+                inputs={"X": splited_vars},
+                outputs={},
+                attrs={
+                    "epmap": eplist,
+                    RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
+                })
+            for _, var in enumerate(splited_vars):
+                send_vars.append(var)
+
+        if self.sync_mode:
+            program.global_block().append_op(
+                type="send_barrier",
+                inputs={},
+                outputs={},
+                attrs={
+                    "endpoints": pserver_endpoints,
+                    "sync_mode": self.sync_mode,
+                    RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
+                })
+
+        # step 3.2: insert recv op to receive parameters from parameter server
+        recv_vars = []
+        for _, var in enumerate(send_vars):
+            recv_vars.append(self.grad_param_mapping[var])
+        ps_dispatcher.reset()
+        eplist = ps_dispatcher.dispatch(recv_vars)
+
         for i, ep in enumerate(eplist):
-            param = send_outputs[i]
-            grad = send_inputs[i]
-            if not self.param_grad_ep_mapping.has_key(ep):
-                self.param_grad_ep_mapping[ep] = {"params": [], "grads": []}
-            self.param_grad_ep_mapping[ep]["params"].append(param)
-            self.param_grad_ep_mapping[ep]["grads"].append(grad)
-
-        rpc_client_var = program.global_block().create_var(
-            name=RPC_CLIENT_VAR_NAME,
-            persistable=True,
-            type=core.VarDesc.VarType.RAW)
-
-        # create send_op
+            self.param_grad_ep_mapping[ep]["params"].append(recv_vars[i])
+            self.param_grad_ep_mapping[ep]["grads"].append(send_vars[i])
+
+        # step4: Concat the parameters splits together after recv.
+        for varname, splited_var in self.param_var_mapping.iteritems():
+            eps = []
+            for var in splited_var:
+                index = [v.name for v in recv_vars].index(var.name)
+                eps.append(eplist[index])
+
+            program.global_block().append_op(
+                type="recv",
+                inputs={},
+                outputs={"Out": splited_var},
+                attrs={
+                    "epmap": eps,
+                    RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
+                })
+
         program.global_block().append_op(
-            type="send",
-            inputs={"X": send_inputs},
-            outputs={"Out": send_outputs,
-                     "RPCClient": rpc_client_var},
+            type="fetch_barrier",
+            inputs={},
+            outputs={},
             attrs={
                 "endpoints": pserver_endpoints,
-                "epmap": eplist,
-                "sync_mode": self.sync_mode
+                RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
             })
-        # step4: Concat the parameters splits together after recv.
-        for varname, splited_var in param_var_mapping.iteritems():
+
+        for varname, splited_var in self.param_var_mapping.iteritems():
             if len(splited_var) <= 1:
                 continue
             orig_param = program.global_block().vars[varname]
@@ -319,10 +384,9 @@ class DistributeTranspiler:
                 attrs={"axis": 0})
 
         if self.has_distributed_lookup_table:
-            self._replace_lookup_table_op_with_prefetch(program, rpc_client_var,
-                                                        eplist)
-            self._split_table_grad_and_add_send_vars(program, rpc_client_var,
-                                                     pserver_endpoints)
+            self._replace_lookup_table_op_with_prefetch(program,
+                                                        pserver_endpoints)
+            self._split_table_grad_and_add_send_vars(program, pserver_endpoints)
 
     def get_trainer_program(self):
         # remove optimize ops and add a send op to main_program
@@ -350,7 +414,6 @@ class DistributeTranspiler:
             # we don't need to create them when grad arrives.
             # change client side var name to origin name by
             # removing ".trainer_%d" suffix
-
             suff_idx = v.name.find(".trainer_")
             if suff_idx >= 0:
                 orig_var_name = v.name[:suff_idx]
@@ -387,24 +450,14 @@ class DistributeTranspiler:
         # located on current pserver
         opt_op_on_pserver = []
         for _, op in enumerate(self.optimize_ops):
-            if self._is_opt_op(op) and self._is_opt_op_on_pserver(endpoint, op):
+            if self._is_optimizer_op(op) and self._is_opt_op_on_pserver(
+                    endpoint, op):
                 opt_op_on_pserver.append(op)
         # step 3.3
         # Iterate through the ops, and if an op and the optimize ops
         # which located on current pserver are in one set, then
         # append it into the sub program.
 
-        # We try to put optimization program run parallelly, assume
-        # optimization program always looks like:
-        #
-        # prevop -> prevop -> opt op -> following op -> following op; ->
-        # prevop -> prevop -> opt op -> following op -> following op; ->
-        # global op -> global op
-        #
-        # we put operators that can run parallelly to many program blocks.
-        # in above example, we seperate ops by the ";". Global ops must run
-        # after all the optimize ops finished.
-
         global_ops = []
         # HACK: optimization global ops only used to scale beta1 and beta2
         # replace it with dependency engine.
@@ -412,12 +465,18 @@ class DistributeTranspiler:
             if self._is_adam_connected_op(op):
                 global_ops.append(op)
 
-        def __append_optimize_op__(op, block, grad_to_block_id):
-            if self._is_opt_op(op):
+        def __append_optimize_op__(op, block, grad_to_block_id, merged_var):
+            if self._is_optimizer_op(op):
                 self._append_pserver_ops(block, op, endpoint, grad_to_block_id,
-                                         default_main_program())
+                                         self.origin_program, merged_var)
             else:
-                self._append_pserver_non_opt_ops(block, op)
+                self._append_pserver_non_opt_ops(block, op, endpoint)
+
+        def __op_have_grad_input__(op):
+            for varname in op.input_arg_names:
+                if varname.find("@GRAD") >= 0:
+                    return varname
+            return ""
 
         # append lr decay ops to the child block if exists
         lr_ops = self._get_lr_ops()
@@ -425,17 +484,26 @@ class DistributeTranspiler:
             lr_decay_block = pserver_program.create_block(
                 pserver_program.num_blocks - 1)
             for _, op in enumerate(lr_ops):
-                self._append_pserver_non_opt_ops(lr_decay_block, op)
+                self._append_pserver_non_opt_ops(lr_decay_block, op, endpoint)
 
         # append op to the current block
         grad_to_block_id = []
         pre_block_idx = pserver_program.num_blocks - 1
         for idx, opt_op in enumerate(opt_op_on_pserver):
             per_opt_block = pserver_program.create_block(pre_block_idx)
+            # append grad merging ops before clip and weight decay
+            for _, op in enumerate(self.optimize_ops):
+                # find the origin @GRAD var before clipping
+                grad_varname_for_block = __op_have_grad_input__(op)
+                if ufind.is_connected(op, opt_op) and grad_varname_for_block:
+                    merged_var = self._append_pserver_grad_merge_ops(
+                        per_opt_block, grad_varname_for_block, endpoint,
+                        grad_to_block_id, self.origin_program)
             for _, op in enumerate(self.optimize_ops):
                 # optimizer is connected to itself
                 if ufind.is_connected(op, opt_op) and op not in global_ops:
-                    __append_optimize_op__(op, per_opt_block, grad_to_block_id)
+                    __append_optimize_op__(op, per_opt_block, grad_to_block_id,
+                                           merged_var)
 
         # append global ops
         if global_ops:
@@ -443,22 +511,14 @@ class DistributeTranspiler:
                 pserver_program.num_blocks - 1)
             for glb_op in global_ops:
                 __append_optimize_op__(glb_op, opt_state_block,
-                                       grad_to_block_id)
-
-        # NOT USED: single block version:
-        #
-        # for _, op in enumerate(self.optimize_ops):
-        #     for _, opt_op in enumerate(opt_op_on_pserver):
-        #         if ufind.is_connected(op, opt_op):
-        #             __append_optimize_op__(glb_op, optimize_block)
-        #             break
+                                       grad_to_block_id, None)
 
         # process distributed lookup_table
         prefetch_block = None
         if self.has_distributed_lookup_table:
             pserver_index = self.pserver_endpoints.index(endpoint)
             table_opt_block = self._create_table_optimize_block(
-                pserver_index, pserver_program, pre_block_idx)
+                pserver_index, pserver_program, pre_block_idx, grad_to_block_id)
             prefetch_block = self._create_prefetch_block(
                 pserver_index, pserver_program, table_opt_block)
 
@@ -494,7 +554,7 @@ class DistributeTranspiler:
         were split to several blocks.
         """
         s_prog = Program()
-        orig_s_prog = framework.default_startup_program()
+        orig_s_prog = default_startup_program()
         params = self.param_grad_ep_mapping[endpoint]["params"]
 
         def _get_splited_name_and_shape(varname):
@@ -541,9 +601,11 @@ class DistributeTranspiler:
                     attrs=op.attrs)
         return s_prog
 
+    # ====================== private transpiler functions =====================
+
     # transpiler function for dis lookup_table
-    def _replace_lookup_table_op_with_prefetch(self, program, rpc_client_var,
-                                               eplist):
+    def _replace_lookup_table_op_with_prefetch(self, program,
+                                               pserver_endpoints):
         # 1. replace lookup_table_op with split_ids_op -> prefetch_op -> sum_op
         self.prefetch_input_vars = None
         self.prefetch_output_vars = None
@@ -590,11 +652,11 @@ class DistributeTranspiler:
                         index=op_index + 1,
                         type="prefetch",
                         inputs={'X': self.prefetch_input_vars},
-                        outputs={
-                            "Out": self.prefetch_output_vars,
-                            "RPCClient": rpc_client_var
-                        },
-                        attrs={"epmap": eplist})
+                        outputs={"Out": self.prefetch_output_vars},
+                        attrs={
+                            "epmap": pserver_endpoints,
+                            RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
+                        })
 
                     # insert concat_op
                     program.global_block().insert_op(
@@ -614,12 +676,11 @@ class DistributeTranspiler:
                     # break for loop
                     break
 
-    def _split_table_grad_and_add_send_vars(self, program, rpc_client_var,
-                                            pserver_endpoints):
+    def _split_table_grad_and_add_send_vars(self, program, pserver_endpoints):
         # 2. add split_ids_op and send_vars_op to send gradient to pservers
         # there should only be one table_name
         all_ops = program.global_block().ops
-        table_grad_name = framework.grad_var_name(self.table_name)
+        table_grad_name = grad_var_name(self.table_name)
         for op in all_ops:
             if table_grad_name in op.output_arg_names:
                 op_index = list(all_ops).index(op)
@@ -630,14 +691,17 @@ class DistributeTranspiler:
                     inputs={
                         'Ids': [program.global_block().vars[table_grad_name]]
                     },
-                    outputs={"Out": self.table_grad_list})
+                    outputs={"Out": self.trainer_side_table_grad_list})
                 program.global_block().insert_op(
                     index=op_index + 2,
                     type="send_vars",
-                    inputs={'X': self.table_grad_list},
-                    outputs={"RPCClient": rpc_client_var},
-                    attrs={"sync_send": True,
-                           "epmap": pserver_endpoints})
+                    inputs={'X': self.trainer_side_table_grad_list},
+                    outputs={},
+                    attrs={
+                        "sync_send": True,
+                        "epmap": pserver_endpoints,
+                        RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
+                    })
                 break
 
     def _create_prefetch_block(self, pserver_index, pserver_program,
@@ -670,16 +734,7 @@ class DistributeTranspiler:
         return prefetch_block
 
     def _create_table_optimize_block(self, pserver_index, pserver_program,
-                                     pre_block_idx):
-        def _clone_var(block, var, persistable=True):
-            assert isinstance(var, Variable)
-            return block.create_var(
-                name=var.name,
-                shape=var.shape,
-                dtype=var.dtype,
-                type=var.type,
-                persistable=persistable)
-
+                                     pre_block_idx, grad_to_block_id):
         # STEP: create table optimize block
         # create table param and grad var in pserver program
         origin_param_var = self.origin_program.global_block().vars[
@@ -690,11 +745,11 @@ class DistributeTranspiler:
             dtype=origin_param_var.dtype,
             type=core.VarDesc.VarType.SELECTED_ROWS,
             persistable=True)
-        grad_var = _clone_var(
-            pserver_program.global_block(),
-            self.origin_program.global_block().vars[framework.grad_var_name(
-                self.table_name)],
-            persistable=False)
+        # parameter must be selected rows
+        param_var.desc.set_type(core.VarDesc.VarType.SELECTED_ROWS)
+        grad_var = pserver_program.global_block().clone_variable(
+            self.origin_program.global_block().vars[grad_var_name(
+                self.table_name)])
 
         # create table optimize block in pserver program
         table_opt_op = [
@@ -708,7 +763,7 @@ class DistributeTranspiler:
         if self.sync_mode:
             # create grad vars in pserver program
             table_grad_var = self.table_param_grad[1]
-            table_grad_list = [
+            pserver_side_table_grad_list = [
                 pserver_program.global_block().create_var(
                     name="%s.trainer_%d.pserver_%d" %
                     (table_grad_var.name, index, pserver_index),
@@ -718,11 +773,21 @@ class DistributeTranspiler:
                 for index in range(self.trainer_num)
             ]
 
-            # append sum op for table_grad_list
+            # append sum op for pserver_side_table_grad_list
             table_opt_block.append_op(
                 type="sum",
-                inputs={"X": table_grad_list},
+                inputs={"X": pserver_side_table_grad_list},
                 outputs={"Out": [grad_var]})
+        else:
+            # in async_mode, for table gradient, it also need to be splited to each parameter server
+            origin_grad_name = grad_var.name
+            splited_grad_name = self.trainer_side_table_grad_list[
+                pserver_index].name
+            if not splited_grad_name.startswith(origin_grad_name):
+                raise ValueError("origin_grad_var: " + splited_grad_name +
+                                 " grad_var:" + grad_var.name)
+            grad_var = pserver_program.global_block().rename_var(
+                origin_grad_name, splited_grad_name)
 
         lr_var = pserver_program.global_block().vars[table_opt_op.input(
             "LearningRate")[0]]
@@ -738,9 +803,11 @@ class DistributeTranspiler:
             outputs=outputs,
             attrs=table_opt_op.attrs)
 
+        # add table parameter gradient and it's block id to grad_to_block_id
+        grad_to_block_id.append(grad_var.name + ":" + str(table_opt_block.idx))
+
         return table_opt_block
 
-    # ====================== private transpiler functions =====================
     def _create_vars_from_blocklist(self,
                                     program,
                                     block_list,
@@ -749,15 +816,27 @@ class DistributeTranspiler:
         Create vars for each split.
         NOTE: only grads need to be named for different trainers, use
               add_trainer_suffix to rename the grad vars.
-        :return: A dict mapping from original var name to each var split.
+        Args:
+            program (ProgramDesc): ProgramDesc which gradients blong.
+            block_list (list[(varname, block_id, block_size)]): List of gradient blocks.
+            add_trainer_suffix (Bool): Add trainer suffix to new variable's name if set True.
+        Returns:
+            var_mapping (dict(varname->[new_varname_variable])):A dict mapping
+                from original var name to each var split.
         """
+
+        # varname->[(block_id, current_block_size)]
         block_map = dict()
+
         var_mapping = dict()
         for block_str in block_list:
             varname, offset, size = block_str.split(":")
             if not block_map.has_key(varname):
                 block_map[varname] = []
             block_map[varname].append((long(offset), long(size)))
+        # Do not remove this important debug message:
+        print("block map: %s" % block_map)
+
         for varname, splited in block_map.iteritems():
             orig_var = program.global_block().var(varname)
             if len(splited) == 1:
@@ -821,41 +900,31 @@ class DistributeTranspiler:
             lod_level=var.lod_level,
             persistable=persistable)
 
-    def _append_split_op(self, program, gradblocks):
-        # Split variables that need to be split and append respective ops
-        add_suffix = False
-        if self.trainer_num > 1:
-            add_suffix = True
-        var_mapping = self._create_vars_from_blocklist(
-            program, gradblocks, add_trainer_suffix=add_suffix)
-        for varname, splited_vars in var_mapping.iteritems():
-            # variable that don't need to split have empty splited_vars
-            if len(splited_vars) <= 1:
-                continue
-            orig_var = program.global_block().vars[varname]
-            if orig_var.type == core.VarDesc.VarType.SELECTED_ROWS:
-                height_sections = []
-                for v in splited_vars:
-                    height_sections.append(v.shape[0])
-                program.global_block().append_op(
-                    type="split_selected_rows",
-                    inputs={"X": orig_var},
-                    outputs={"Out": splited_vars},
-                    attrs={"height_sections": height_sections})
-            elif orig_var.type == core.VarDesc.VarType.LOD_TENSOR:
-                sections = []
-                for v in splited_vars:
-                    sections.append(v.shape[0])
-                program.global_block().append_op(
-                    type="split_byref",
-                    inputs={"X": orig_var},
-                    outputs={"Out": splited_vars},
-                    attrs={"sections": sections}  # assume split evenly
-                )
-            else:
-                AssertionError("Variable type should be in set "
-                               "[LOD_TENSOR, SELECTED_ROWS]")
-        return var_mapping
+    def _insert_split_op(self, program, orig_var, index, splited_vars):
+        if orig_var.type == core.VarDesc.VarType.SELECTED_ROWS:
+            height_sections = []
+            for v in splited_vars:
+                height_sections.append(v.shape[0])
+            program.global_block().insert_op(
+                index=index + 1,
+                type="split_selected_rows",
+                inputs={"X": orig_var},
+                outputs={"Out": splited_vars},
+                attrs={"height_sections": height_sections})
+        elif orig_var.type == core.VarDesc.VarType.LOD_TENSOR:
+            sections = []
+            for v in splited_vars:
+                sections.append(v.shape[0])
+            program.global_block().insert_op(
+                index=index + 1,
+                type="split_byref",
+                inputs={"X": orig_var},
+                outputs={"Out": splited_vars},
+                attrs={"sections": sections}  # assume split evenly
+            )
+        else:
+            AssertionError("Variable type should be in set "
+                           "[LOD_TENSOR, SELECTED_ROWS]")
 
     def _get_optimizer_input_shape(self, op_type, varkey, orig_shape,
                                    param_shape):
@@ -884,17 +953,74 @@ class DistributeTranspiler:
             pass
         return orig_shape
 
-    def _orig_varname(self, varname):
-        suff_idx = varname.find(".trainer_")
+    def _get_varname_parts(self, varname):
+        # returns origin, blockid, trainerid
         orig_var_name = ""
-        if suff_idx >= 0:
-            orig_var_name = varname[:suff_idx]
+        trainer_part = ""
+        block_part = ""
+        trainer_idx = varname.find(".trainer_")
+        if trainer_idx >= 0:
+            trainer_part = varname[trainer_idx + 1:]
+        else:
+            trainer_idx = len(varname)
+        block_index = varname.find(".block")
+        if block_index >= 0:
+            block_part = varname[block_index + 1:trainer_idx]
+        else:
+            block_index = len(varname)
+        orig_var_name = varname[0:min(block_index, trainer_idx)]
+        return orig_var_name, block_part, trainer_part
+
+    def _orig_varname(self, varname):
+        orig, _, _ = self._get_varname_parts(varname)
+        return orig
+
+    def _append_pserver_grad_merge_ops(self, optimize_block,
+                                       grad_varname_for_block, endpoint,
+                                       grad_to_block_id, origin_program):
+        program = optimize_block.program
+        pserver_block = program.global_block()
+        grad_block = None
+        for g in self.param_grad_ep_mapping[endpoint]["grads"]:
+            if self._orig_varname(g.name) == \
+                    self._orig_varname(grad_varname_for_block):
+                grad_block = g
+                break
+        if not grad_block:
+            # do not append this op if current endpoint
+            # is not dealing with this grad block
+            return
+        orig_varname, block_name, trainer_name = self._get_varname_parts(
+            grad_block.name)
+        if block_name:
+            merged_var_name = '.'.join([orig_varname, block_name])
         else:
-            orig_var_name = varname
-        return orig_var_name
+            merged_var_name = orig_varname
+        merged_var = \
+            pserver_block.vars[merged_var_name]
+        grad_to_block_id.append(merged_var.name + ":" + str(optimize_block.idx))
+        if self.sync_mode and self.trainer_num > 1:
+            vars2merge = []
+            for i in xrange(self.trainer_num):
+                per_trainer_name = "%s.trainer_%d" % \
+                (merged_var_name, i)
+                vars2merge.append(pserver_block.vars[per_trainer_name])
+
+            optimize_block.append_op(
+                type="sum",
+                inputs={"X": vars2merge},
+                outputs={"Out": merged_var})
+            # TODO(panyx0718): What if it's SELECTED_ROWS.
+            if not merged_var.type == core.VarDesc.VarType.SELECTED_ROWS:
+                optimize_block.append_op(
+                    type="scale",
+                    inputs={"X": merged_var},
+                    outputs={"Out": merged_var},
+                    attrs={"scale": 1.0 / float(self.trainer_num)})
+        return merged_var
 
     def _append_pserver_ops(self, optimize_block, opt_op, endpoint,
-                            grad_to_block_id, origin_program):
+                            grad_to_block_id, origin_program, merged_var):
         program = optimize_block.program
         pserver_block = program.global_block()
         new_inputs = dict()
@@ -902,40 +1028,6 @@ class DistributeTranspiler:
         # moment can use the updated shape
         for key in opt_op.input_names:
             if key == "Grad":
-                grad_block = None
-                for g in self.param_grad_ep_mapping[endpoint]["grads"]:
-                    if same_or_split_var(
-                            self._orig_varname(g.name),
-                            self._orig_varname(opt_op.input(key)[0])):
-                        grad_block = g
-                        break
-                if not grad_block:
-                    # do not append this op if current endpoint
-                    # is not dealing with this grad block
-                    return
-                merged_var = \
-                    pserver_block.vars[self._orig_varname(grad_block.name)]
-                grad_to_block_id.append(merged_var.name + ":" + str(
-                    optimize_block.idx))
-                if self.sync_mode and self.trainer_num > 1:
-                    vars2merge = []
-                    for i in xrange(self.trainer_num):
-                        per_trainer_name = "%s.trainer_%d" % \
-                        (self._orig_varname(grad_block.name), i)
-                        vars2merge.append(pserver_block.vars[per_trainer_name])
-
-                    optimize_block.append_op(
-                        type="sum",
-                        inputs={"X": vars2merge},
-                        outputs={"Out": merged_var})
-                    # TODO(panyx0718): What if it's SELECTED_ROWS.
-                    if not merged_var.type == core.VarDesc.VarType.SELECTED_ROWS:
-                        optimize_block.append_op(
-                            type="scale",
-                            inputs={"X": merged_var},
-                            outputs={"Out": merged_var},
-                            attrs={"scale": 1.0 / float(self.trainer_num)})
-
                 new_inputs[key] = merged_var
             elif key == "Param":
                 # param is already created on global program
@@ -994,17 +1086,31 @@ class DistributeTranspiler:
             outputs=outputs,
             attrs=opt_op.attrs)
 
-    def _append_pserver_non_opt_ops(self, optimize_block, opt_op):
+    def _is_splited_grad_var(self, var, var_dict):
+        grad_block = None
+        for _, g in var_dict.iteritems():
+            if self._orig_varname(g.name) == self._orig_varname(var.name):
+                if g.name.find(".trainer_") == -1:
+                    grad_block = g
+                    break
+        return grad_block
+
+    def _append_pserver_non_opt_ops(self, optimize_block, opt_op, endpoint):
         program = optimize_block.program
         # Append the ops for parameters that do not need to be optimized/updated
         inputs = self._get_input_map_from_op(
             self.origin_program.global_block().vars, opt_op)
-        for varlist in inputs.itervalues():
+        for key, varlist in inputs.iteritems():
             if not isinstance(varlist, list):
                 varlist = [varlist]
-
             for var in varlist:
-                if not program.global_block().vars.has_key(var.name):
+                # for ops like clipping and weight decay, get the splited var
+                # for inputs/outputs
+                grad_block = self._is_splited_grad_var(
+                    var, program.global_block().vars)
+                if grad_block:
+                    inputs[key] = grad_block
+                elif not program.global_block().vars.has_key(var.name):
                     program.global_block().create_var(
                         name=var.name,
                         persistable=var.persistable,
@@ -1013,13 +1119,16 @@ class DistributeTranspiler:
 
         outputs = self._get_output_map_from_op(
             self.origin_program.global_block().vars, opt_op)
-
-        for varlist in outputs.itervalues():
+        for key, varlist in outputs.iteritems():
             if not isinstance(varlist, list):
                 varlist = [varlist]
-
             for var in varlist:
-                program.global_block().clone_variable(var)
+                grad_block = self._is_splited_grad_var(
+                    var, program.global_block().vars)
+                if grad_block:
+                    outputs[key] = grad_block
+                elif not program.global_block().vars.has_key(var.name):
+                    program.global_block().clone_variable(var)
 
         optimize_block.append_op(
             type=opt_op.type,
@@ -1065,9 +1174,17 @@ class DistributeTranspiler:
                     ufind.union(op1, op2)
         return ufind
 
-    def _is_opt_op(self, op):
-        # NOTE: It's a HACK implement.
-        # optimize op: SGDOptimize, MomentumOptimizer, AdamOptimizer and etc...
+    def _is_opt_role_op(self, op):
+        # NOTE: depend on oprole to find out whether this op is for
+        # optimize
+        op_maker = core.op_proto_and_checker_maker
+        optimize_role = core.op_proto_and_checker_maker.OpRole.Optimize
+        if op_maker.kOpRoleAttrName() in op.attrs and \
+            int(op.attrs[op_maker.kOpRoleAttrName()]) == int(optimize_role):
+            return True
+        return False
+
+    def _is_optimizer_op(self, op):
         if "Param" in op.input_names and \
             "LearningRate" in op.input_names:
             return True
@@ -1117,7 +1234,7 @@ class DistributeTranspiler:
         # find learning rate variables by optimize op
         lr_vars = set()
         for op in self.optimize_ops:
-            if self._is_opt_op(op):
+            if self._is_optimizer_op(op):
                 lr_vars.add(op.input("LearningRate")[0])
 
         find_ops = []
@@ -1134,7 +1251,7 @@ class DistributeTranspiler:
                 # NOTE: we need to skip all optimize ops, since it is connected
                 # with forward/backward ops and lr ops, we only need the lr ops.
                 if op1 != op2 and self._is_op_connected(op1, op2) and \
-                    not self._is_opt_op(op1) and not self._is_opt_op(op2):
+                    not self._is_optimizer_op(op1) and not self._is_optimizer_op(op2):
                     ufind.union(op1, op2)
         # find all ops which is related with lr var
         for op1 in block.ops:
@@ -1146,16 +1263,30 @@ class DistributeTranspiler:
         return lr_ops
 
     def _get_optimize_pass(self):
+        """
+        Get optimizer operators, paramters and gradients from origin_program
+        Returns:
+            opt_ops (list): optimize operators.
+            params_grads (dict): paramter->gradient.
+        """
         block = self.origin_program.global_block()
         opt_ops = []
         params_grads = []
+        origin_var_dict = self.origin_program.global_block().vars
         for op in block.ops:
-            if self._is_opt_op(op):
+            if self._is_opt_role_op(op):
                 opt_ops.append(op)
-                params_grads.append((self.origin_program.global_block().var(
-                    op.input("Param")[0]),
-                                     self.origin_program.global_block().var(
-                                         op.input("Grad")[0])))
+                # HACK(wuyi): if we find grad vars from input of optimize
+                # ops, we may get the output of clip op. Use syntax "@GRAD"
+                # and op_role_var to get the pair.
+                for input_name in op.input_arg_names:
+                    if input_name.find("@GRAD") != -1 and \
+                        op.attrs[RPC_OP_ROLE_ATTR_NAME]:
+                        param_name = op.attrs[OP_ROLE_VAR_ATTR_NAME][0]
+                        params_grads.append([
+                            origin_var_dict[param_name],
+                            origin_var_dict[input_name]
+                        ])
             elif self._is_adam_connected_op(op):
                 opt_ops.append(op)
             else:
diff --git a/python/paddle/fluid/transpiler/distribute_transpiler_simple.py b/python/paddle/fluid/transpiler/distribute_transpiler_simple.py
deleted file mode 100644
index ea8c27cdca885dbbf90349b35df9691951264061..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/transpiler/distribute_transpiler_simple.py
+++ /dev/null
@@ -1,254 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from ..framework import Program, default_main_program, Parameter, Variable
-from ..layer_helper import LayerHelper
-
-
-def hash_name_to_server(params_grads, pserver_endpoints):
-    """
-    :param param_grads:
-    :return: a map of pserver endpoint -> 
-                    params -> [param list]
-                    grads  -> [grad list]
-    """
-
-    def _hash_param(param_name, total):
-        return hash(param_name) % total
-
-    param_grad_map = dict()
-    for param, grad in params_grads:
-        if param.trainable is True and grad is not None:
-            server_id = _hash_param(param.name, len(pserver_endpoints))
-            server_for_param = pserver_endpoints[server_id]
-            if not param_grad_map.has_key(server_for_param):
-                param_grad_map[server_for_param] = {"params": [], "grads": []}
-            param_grad_map[server_for_param]["params"].append(param)
-            param_grad_map[server_for_param]["grads"].append(grad)
-
-    return param_grad_map
-
-
-def round_robin(params_grads, pserver_endpoints):
-    assert (len(params_grads) > len(pserver_endpoints))
-
-    param_grad_map = dict()
-    pserver_idx = 0
-    for param, grad in params_grads:
-        if param.trainable is True:
-            server_for_param = pserver_endpoints[pserver_idx]
-            if not param_grad_map.has_key(server_for_param):
-                param_grad_map[server_for_param] = {"params": [], "grads": []}
-
-            param_grad_map[server_for_param]["params"].append(param)
-            param_grad_map[server_for_param]["grads"].append(grad)
-
-            pserver_idx += 1
-            if pserver_idx >= len(pserver_endpoints):
-                pserver_idx = 0
-    return param_grad_map
-
-
-class SimpleDistributeTranspiler:
-    def transpile(self,
-                  optimize_ops,
-                  params_grads,
-                  program=None,
-                  pservers="127.0.0.1:6174",
-                  trainers=1,
-                  split_method=round_robin):
-        """
-            Transpile the program to a distributed data-parallelism programs.
-
-            The main_program will be transform to use a remote parameter server
-            to do parameter optimization. And the optimization graph will be put
-            in to a parameter server program.
-
-            Use different methods to split trainable varialbles to different
-            parameter servers.
-
-            Example to run:
-
-            exe = fluid.Executor(place)
-            t = fluid.DistributeTranspiler()
-            t.transpile(optimize_ops, params_grads, pservers="127.0.0.1:6174", trainers=1)
-
-            pserver_endpoint = os.getenv("PSERVER")
-            if pserver_endpoint:
-                pserver_prog = t.get_pserver_program(pserver_endpoint, optimize_ops)
-                exe.run(fluid.default_startup_program())
-                exe.run(pserver_prog)
-            else:
-                feeder = fluid.DataFeeder(feed_list=[images, label], place=place)
-                exe.run(fluid.default_startup_program())
-
-                for pass_id in range(PASS_NUM):
-                    ...
-
-            :param optimize_ops: op list of optimization, should be the
-                                 return value of Optimizer.minimize
-            :type optimize_ops: list
-            :param program: program to optimize, default default_main_program
-            :param pservers: parameter server endpoints like "m1:6174,m2:6174"
-            :type pservers: string
-
-            :return: return a list of programs
-        """
-        if program is None:
-            program = default_main_program()
-        self.program = program
-        self.trainers = trainers
-        self.optimize_ops = optimize_ops
-        self._optimize_distributed(
-            optimize_ops,
-            program,
-            params_grads,
-            pservers=pservers,
-            trainers=trainers,
-            split_method=split_method)
-
-    def _clone_param(self, block, v):
-        assert isinstance(v, Parameter)
-        new_p = Parameter(
-            block=block,
-            shape=v.shape,
-            dtype=v.dtype,
-            type=v.type,
-            lod_level=v.lod_level,
-            stop_gradient=v.stop_gradient,
-            trainable=v.trainable,
-            optimize_attr=v.optimize_attr,
-            regularizer=v.regularizer,
-            name=v.name)
-        block.vars[new_p.name] = new_p
-
-    def _clone_var(self, block, var):
-        assert isinstance(var, Variable)
-        return block.create_var(
-            name=var.name,
-            shape=var.shape,
-            dtype=var.dtype,
-            type=var.type,
-            lod_level=var.lod_level,
-            persistable=var.persistable)
-
-    def _optimize_distributed(self, optimize_ops, program, params_and_grads,
-                              **kwargs):
-        if kwargs.has_key("split_method"):
-            split_method = kwargs["split_method"]
-        else:
-            split_method = round_robin
-
-        assert (callable(split_method))
-        pserver_endpoints = kwargs["pservers"].split(",")
-        self.param_grad_map = split_method(params_and_grads, pserver_endpoints)
-
-        send_op_ordered_inputs = []
-        send_op_ordered_outputs = []
-        epmap = []
-        for ep, v in self.param_grad_map.iteritems():
-            send_op_ordered_inputs.extend(v["grads"])
-            send_op_ordered_outputs.extend(v["params"])
-            for i in v["grads"]:
-                epmap.append(ep)
-        send_op = program.global_block().append_op(
-            type="send",
-            inputs={"X": send_op_ordered_inputs
-                    },  # inputs is a list of tensors to be send
-            outputs={"Out": send_op_ordered_outputs},
-            attrs={"endpoints": pserver_endpoints,
-                   "epmap": epmap})
-
-    def get_trainer_program(self):
-        # remove optimize ops and add a send op to main_program
-        self.program.global_block().delete_ops(self.optimize_ops)
-        return self.program
-
-    def _create_var_for_trainers(self, block, var, trainers):
-        var_list = []
-        for i in xrange(trainers):
-            var_each = block.create_var(
-                name="%s.trainer_%d" % (var.name, i),
-                psersistable=var.persistable,
-                dtype=var.dtype,
-                shape=var.shape)
-            var_list.append(var_each)
-        return var_list
-
-    def get_pserver_program(self, endpoint, optimize_ops):
-        pserver_program = Program()
-        for v in self.param_grad_map[endpoint]["params"]:
-            self._clone_param(pserver_program.global_block(), v)
-
-        optimize_sub_program = Program()
-        grad_var_names = [
-            var.name for var in self.param_grad_map[endpoint]["grads"]
-        ]
-        for opt_op in optimize_ops:
-            for _, var in opt_op.inputs.iteritems():
-                # NOTE: append operators to merge gradients from multiple
-                # trainers. If trainers == 1, this is not needed.
-                if self.trainers > 1 and var.name in grad_var_names:
-                    vars2merge = self._create_var_for_trainers(
-                        optimize_sub_program.global_block(), var, self.trainers)
-                    merged_var = optimize_sub_program.global_block().create_var(
-                        name=var.name,
-                        persistable=var.persistable,
-                        dtype=var.dtype,
-                        shape=var.shape)
-                    optimize_sub_program.global_block().append_op(
-                        type="sum",
-                        inputs={"X": vars2merge},
-                        outputs={"Out": merged_var})
-                    optimize_sub_program.global_block().append_op(
-                        type="scale",
-                        inputs={"X": merged_var},
-                        outputs={"Out": merged_var},
-                        attrs={"scale": 1.0 / float(self.trainers)})
-                else:
-                    optimize_sub_program.global_block().create_var(
-                        name=var.name,
-                        persistable=var.persistable,
-                        dtype=var.dtype,
-                        shape=var.shape)
-
-            if opt_op.inputs.has_key("Grad"):
-                if opt_op.inputs["Grad"].name in grad_var_names:
-                    optimize_sub_program.global_block().append_op(
-                        type=opt_op.type,
-                        inputs=opt_op.inputs,
-                        outputs=opt_op.outputs,
-                        attrs=opt_op.attrs)
-            else:
-                optimize_sub_program.global_block().append_op(
-                    type=opt_op.type,
-                    inputs=opt_op.inputs,
-                    outputs=opt_op.outputs,
-                    attrs=opt_op.attrs)
-        pserver_program.global_block().append_op(
-            type="recv",
-            inputs={"RX":
-                    self.param_grad_map[endpoint]["grads"]},  # grads to recv
-            outputs={},
-            attrs={
-                "OptimizeBlock": optimize_sub_program.global_block(),
-                "endpoint": endpoint,
-                "ParamList":
-                [p.name for p in self.param_grad_map[endpoint]["params"]],
-                "GradList":
-                [p.name for p in self.param_grad_map[endpoint]["grads"]],
-                "Trainers": self.trainers
-            })
-        pserver_program.sync_with_cpp()
-        return pserver_program
diff --git a/python/paddle/fluid/transpiler/distributed_splitter.py b/python/paddle/fluid/transpiler/distributed_splitter.py
deleted file mode 100644
index 060c1df8ad2badc5132f45ff0f44d136d828faa1..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/transpiler/distributed_splitter.py
+++ /dev/null
@@ -1,57 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-def hash_name(varlist, pserver_endpoints):
-    """
-    hash variable names to several endpoints.
-
-    Args:
-        varlist(list): a list of Variables
-
-    Returns(dict): a map of pserver endpoint -> varname
-    """
-
-    def _hash_block(block_str, total):
-        return hash(block_str) % total
-
-    eplist = []
-    for var in varlist:
-        server_id = _hash_block(var.name(), len(pserver_endpoints))
-        server_for_param = pserver_endpoints[server_id]
-        eplist.append(server_for_param)
-    return eplist
-
-
-def round_robin(varlist, pserver_endpoints):
-    """
-    Distribute variables to several endpoints.
-    Args:
-        varlist(list): a list of variables
-        pserver_endpoints(list): a list of pserver endpoints
-
-    Returns(list[int]): the endpoint for each variable
-    """
-    assert (len(varlist) >= len(pserver_endpoints))
-
-    eplist = []
-    pserver_idx = 0
-    for var in varlist:
-        server_for_param = pserver_endpoints[pserver_idx]
-        eplist.append(server_for_param)
-
-        pserver_idx += 1
-        if pserver_idx >= len(pserver_endpoints):
-            pserver_idx = 0
-    return eplist
diff --git a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
index 49034b47b2d184e4027bcebc29413a163340fdaa..9ff0ae6fca27d4681891b2033e2f8f95bd825942 100644
--- a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
+++ b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
@@ -24,7 +24,8 @@ dtype_to_size = {
     core.VarDesc.VarType.INT16: 2,
     core.VarDesc.VarType.INT32: 4,
     core.VarDesc.VarType.INT64: 8,
-    core.VarDesc.VarType.BOOL: 1
+    core.VarDesc.VarType.BOOL: 1,
+    core.VarDesc.VarType.UINT8: 1,
 }
 
 SUB_BLOCK_OPS = [
@@ -106,7 +107,7 @@ class ControlFlowGraph(object):
         # Repeatedly apply liveness updates until the algorithm stablize
         # on a complete set live input vars and live output vars.
         while True:
-            for i in range(self.op_size, 0, -1):
+            for i in reversed(range(self.op_size)):
                 live_in[i] = set(self._live_in[i])
                 live_out[i] = set(self._live_out[i])
                 for s in self._successors[i]:
diff --git a/python/paddle/fluid/transpiler/ps_dispatcher.py b/python/paddle/fluid/transpiler/ps_dispatcher.py
new file mode 100644
index 0000000000000000000000000000000000000000..d6a68677527deb09ace0e3a23cbc093d6d7b4349
--- /dev/null
+++ b/python/paddle/fluid/transpiler/ps_dispatcher.py
@@ -0,0 +1,78 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+class PSDispatcher(object):
+    """
+    PSDispatcher is the base class for dispatching vars
+    into different pserver instance.
+    You need to implement the `dispatch` inferface.
+    """
+
+    def __init__(self, pserver_endpoints):
+        self._eps = pserver_endpoints
+        self._step = 0
+
+    @property
+    def eps(self):
+        return self._eps
+
+    def reset(self):
+        self._step = 0
+
+    def dispatch(self, varlist):
+        """
+        :param varlist: a list of Variables
+        :return: a map of pserver endpoint -> varname 
+        """
+        AssertionError("Interface has not been implemented.")
+
+
+class HashName(PSDispatcher):
+    """
+      Hash variable names to several endpoints
+    """
+
+    def __init__(self, pserver_endpoints):
+        super(self.__class__, self).__init__(pserver_endpoints)
+
+    def _hash_block(self, block_str, total):
+        return hash(block_str) % total
+
+    def dispatch(self, varlist):
+        eplist = []
+        for var in varlist:
+            server_id = self._hash_block(var.name(), len(self._eps))
+            server_for_param = self._eps[server_id]
+            eplist.append(server_for_param)
+        return eplist
+
+
+class RoundRobin(PSDispatcher):
+    """
+    Distribute variables to serveral endpoints.
+    """
+
+    def __init__(self, pserver_endpoints):
+        super(self.__class__, self).__init__(pserver_endpoints)
+
+    def dispatch(self, varlist):
+        eplist = []
+        for var in varlist:
+            server_for_param = self._eps[self._step]
+            eplist.append(server_for_param)
+            self._step += 1
+            if self._step >= len(self._eps):
+                self._step = 0
+        return eplist
diff --git a/python/paddle/v2/minibatch.py b/python/paddle/v2/minibatch.py
index 317cf037c69f8639e3760fbfce20565127794fcb..d48c54fcbb66487617b1946bc69724870c8f879c 100644
--- a/python/paddle/v2/minibatch.py
+++ b/python/paddle/v2/minibatch.py
@@ -15,7 +15,7 @@
 __all__ = ['batch']
 
 
-def batch(reader, batch_size):
+def batch(reader, batch_size, drop_last=False):
     """
     Create a batched reader.
 
@@ -23,6 +23,8 @@ def batch(reader, batch_size):
     :type reader: callable
     :param batch_size: size of each mini-batch
     :type batch_size: int
+    :param drop_last: drop the last batch, if the size of last batch is not equal to batch_size.
+    :type drop_last: bool
     :return: the batched reader.
     :rtype: callable
     """
@@ -35,7 +37,7 @@ def batch(reader, batch_size):
             if len(b) == batch_size:
                 yield b
                 b = []
-        if b:
+        if drop_last == False and len(b) != 0:
             yield b
 
     return batch_reader
diff --git a/python/setup.py.in b/python/setup.py.in
index c42601d335f01491156dc3591341c1a3213aecfe..8257f1d5e212a84188a4c51bc2d0f4d4c7af91fb 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -69,7 +69,8 @@ packages=['paddle',
           'paddle.fluid.proto',
           'paddle.fluid.proto.profiler',
           'paddle.fluid.layers',
-          'paddle.fluid.transpiler']
+          'paddle.fluid.transpiler',
+          'paddle.fluid.transpiler.details']
 
 if '${WITH_FLUID_ONLY}'== 'OFF':
     packages+=['paddle.proto',
diff --git a/tools/aws_benchmarking/server/cluster_master.py b/tools/aws_benchmarking/server/cluster_master.py
index 1333a942bf013a8182585b56e5843803c56945b1..a9b24846544d8aca5e4c7bd5709e70564c088431 100644
--- a/tools/aws_benchmarking/server/cluster_master.py
+++ b/tools/aws_benchmarking/server/cluster_master.py
@@ -20,6 +20,7 @@ import time
 import threading
 import logging
 import copy
+import csv
 
 import netaddr
 import boto3
@@ -136,6 +137,12 @@ parser.add_argument(
 parser.add_argument(
     '--master_server_ip', type=str, default="", help="master server private ip")
 
+parser.add_argument(
+    '--metric_data_identifier',
+    type=str,
+    default="**metrics_data: ",
+    help="key string to identify metrics data")
+
 parser.add_argument(
     '--no_clean_up',
     type=str2bool,
@@ -155,6 +162,11 @@ logging.basicConfig(
 
 log_files = ["master.log"]
 
+metrics = {}
+
+metrics_csv_file_name = "metrics.csv"
+is_metrics_file_created = False
+
 
 def create_subnet():
     # if no vpc id provided, list vpcs
@@ -329,12 +341,42 @@ def create_pservers():
         cleanup(args.task_name)
 
 
+def save_metrics_data(str_msg):
+    #parse msg
+    logging.info("found metrics data, saving it to csv file")
+    global is_metrics_file_created
+    metrics_raw = str_msg.split(",")
+    with open(args.log_path + metrics_csv_file_name, 'a') as csvfile:
+        csv_fieldnames = []
+        csv_write_data = {}
+        for metric in metrics_raw:
+            metric_data = metric.split("=")
+            metric_key = metric_data[0].strip()
+            metric_val = float(metric_data[1].strip())
+            if not metric_key in metrics:
+                metrics[metric_key] = []
+            metric_repo = metrics[metric_key]
+            metric_repo.append(metric_val)
+            csv_fieldnames.append(metric_key)
+            csv_write_data[metric_key] = metric_val
+        writer = csv.DictWriter(csvfile, fieldnames=csv_fieldnames)
+        if not is_metrics_file_created:
+            writer.writeheader()
+            is_metrics_file_created = True
+        writer.writerow(csv_write_data)
+        logging.info("csv file appended")
+
+
 def log_to_file(source, filename):
     if not filename in log_files:
         log_files.append(filename)
     with open(args.log_path + filename, "a") as log_file:
         for line in iter(source.readline, ""):
             log_file.write(line)
+            if (line.startswith(args.metric_data_identifier)):
+                #found key data, trying to add to csv
+                line = line.replace(args.metric_data_identifier, "")
+                save_metrics_data(line)
 
 
 def parse_command(command_raw, defaults={}):
diff --git a/tools/codestyle/.gitignore b/tools/codestyle/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..0d20b6487c61e7d1bde93acf4a14b7a89083a16d
--- /dev/null
+++ b/tools/codestyle/.gitignore
@@ -0,0 +1 @@
+*.pyc
diff --git a/tools/codestyle/cpplint_pre_commit.hook b/tools/codestyle/cpplint_pre_commit.hook
index 94d1e23ce716f7f1d723bad5f1f4c60030f19eb7..b194af76dc529fd52b0aedfab9c41d625fe64c0d 100755
--- a/tools/codestyle/cpplint_pre_commit.hook
+++ b/tools/codestyle/cpplint_pre_commit.hook
@@ -4,8 +4,12 @@ TOTAL_ERRORS=0
 
 # The trick to remove deleted files: https://stackoverflow.com/a/2413151
 for file in $(git diff --cached --name-status | awk '$1 != "D" {print $2}'); do
-    cpplint $file;
-    TOTAL_ERRORS=$(expr $TOTAL_ERRORS + $?);
+    if [[ $file =~ ^(paddle/api/.*|paddle/capi/.*|paddle/contrib/.*|paddle/cuda/.*|paddle/function/.*|paddle/gserver/.*|paddle/math/.*|paddle/optimizer/.*|paddle/parameter/.*|paddle/pserver/.*|paddle/trainer/.*|paddle/utils/.*) ]]; then
+        continue;
+    else
+        cpplint $file;
+        TOTAL_ERRORS=$(expr $TOTAL_ERRORS + $?);
+    fi
 done
 
 exit $TOTAL_ERRORS
diff --git a/tools/codestyle/docstring_checker.py b/tools/codestyle/docstring_checker.py
new file mode 100644
index 0000000000000000000000000000000000000000..48100e5bf989520043b5ca372b02883faea8a9fd
--- /dev/null
+++ b/tools/codestyle/docstring_checker.py
@@ -0,0 +1,334 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""DocstringChecker is used to check python doc string's style."""
+
+import six
+import astroid
+
+from pylint.checkers import BaseChecker, utils
+from pylint.interfaces import IAstroidChecker
+
+from collections import defaultdict
+import re
+
+
+def register(linter):
+    """Register checkers."""
+    linter.register_checker(DocstringChecker(linter))
+
+
+class Docstring(object):
+    """Docstring class holds the parsed doc string elements.
+    """
+
+    def __init__(self):
+        self.d = defaultdict(list)  #name->[]
+        self.clear()
+
+    def clear(self):
+        self.d['Args'] = []
+        self.d['Examples'] = []
+        self.d['Returns'] = []
+        self.d['Raises'] = []
+        self.args = {}  #arg_name->arg_type
+
+    def get_level(self, string, indent='    '):
+        level = 0
+        unit_size = len(indent)
+        while string[:unit_size] == indent:
+            string = string[unit_size:]
+            level += 1
+
+        return level
+
+    def parse(self, doc):
+        """parse gets sections from doc
+        Such as Args, Returns, Raises, Examples s
+        Args:
+            doc (string): is the astroid node doc string.
+        Returns:
+            True if doc is parsed successfully.
+        """
+        self.clear()
+
+        lines = doc.splitlines()
+        state = ("others", -1)
+        for l in lines:
+            c = l.strip()
+            if len(c) <= 0:
+                continue
+
+            level = self.get_level(l)
+            if c.startswith("Args:"):
+                state = ("Args", level)
+            elif c.startswith("Returns:"):
+                state = ("Returns", level)
+            elif c.startswith("Raises:"):
+                state = ("Raises", level)
+            elif c.startswith("Examples:"):
+                state = ("Examples", level)
+            else:
+                if level > state[1]:
+                    self.d[state[0]].append(c)
+                    continue
+
+                state = ("others", -1)
+                self.d[state[0]].append(c)
+
+        self._arg_with_type()
+        return True
+
+    def get_returns(self):
+        return self.d['Returns']
+
+    def get_raises(self):
+        return self.d['Raises']
+
+    def get_examples(self):
+        return self.d['Examples']
+
+    def _arg_with_type(self):
+
+        for t in self.d['Args']:
+            m = re.search('([A-Za-z0-9_-]+)\s{0,4}(\(.+\))\s{0,4}:', t)
+            if m:
+                self.args[m.group(1)] = m.group(2)
+
+        return self.args
+
+
+class DocstringChecker(BaseChecker):
+    """DosstringChecker is pylint checker to
+    check docstring style.
+    """
+    __implements__ = (IAstroidChecker, )
+
+    POSITIONAL_MESSAGE_ID = 'str-used-on-positional-format-argument'
+    KEYWORD_MESSAGE_ID = 'str-used-on-keyword-format-argument'
+
+    name = 'doc-string-checker'
+    symbol = "doc-string"
+    priority = -1
+    msgs = {
+        'W9001': ('One line doc string on > 1 lines', symbol + "-one-line",
+                  'Used when a short doc string is on multiple lines'),
+        'W9002':
+        ('Doc string does not end with "." period', symbol + "-end-with",
+         'Used when a doc string does not end with a period'),
+        'W9003': ('All args with their types must be mentioned in doc string',
+                  symbol + "-with-all-args",
+                  'Used when not all arguments are in the doc string '),
+        'W9005': ('Missing docstring or docstring is too short',
+                  symbol + "-missing", 'Add docstring longer >=10'),
+        'W9006': ('Docstring indent error, use 4 space for indent',
+                  symbol + "-indent-error", 'Use 4 space for indent'),
+        'W9007': ('You should add `Returns` in comments',
+                  symbol + "-with-returns",
+                  'There should be a `Returns` section in comments'),
+        'W9008': ('You should add `Raises` section in comments',
+                  symbol + "-with-raises",
+                  'There should be a `Raises` section in comments'),
+    }
+    options = ()
+
+    def visit_functiondef(self, node):
+        """visit_functiondef checks Function node docstring style.
+        Args:
+            node (astroid.node): The visiting node.
+        Returns:
+            True if successful other wise False.
+        """
+
+        self.check_doc_string(node)
+
+        if node.tolineno - node.fromlineno <= 10:
+            return True
+
+        if not node.doc:
+            return True
+
+        doc = Docstring()
+        doc.parse(node.doc)
+
+        self.all_args_in_doc(node, doc)
+        self.with_returns(node, doc)
+        self.with_raises(node, doc)
+
+    def visit_module(self, node):
+        self.check_doc_string(node)
+
+    def visit_classdef(self, node):
+        self.check_doc_string(node)
+
+    def check_doc_string(self, node):
+        self.missing_doc_string(node)
+        self.one_line(node)
+        self.has_period(node)
+        self.indent_style(node)
+
+    def missing_doc_string(self, node):
+        if node.tolineno - node.fromlineno <= 10:
+            return True
+
+        if node.doc is None or len(node.doc) < 10:
+            self.add_message('W9005', node=node, line=node.fromlineno)
+        return False
+
+    # FIXME(gongwb): give the docstring line-no
+    def indent_style(self, node, indent=4):
+        """indent_style checks docstring's indent style
+        Args:
+            node (astroid.node): The visiting node.
+            indent (int): The default indent of style
+        Returns:
+            True if successful other wise False.
+        """
+        if node.doc is None:
+            return True
+
+        doc = node.doc
+        lines = doc.splitlines()
+
+        for l in lines:
+            cur_indent = len(l) - len(l.lstrip())
+            if cur_indent % indent != 0:
+                self.add_message('W9006', node=node, line=node.fromlineno)
+                return False
+
+        return True
+
+    def one_line(self, node):
+        """one_line checks if docstring (len < 40) is on one line.
+        Args:
+            node (astroid.node): The node visiting.
+        Returns:
+            True if successful otherwise False.
+        """
+
+        doc = node.doc
+        if doc is None:
+            return True
+
+        if len(doc) > 40:
+            return True
+        elif sum(doc.find(nl) for nl in ('\n', '\r', '\n\r')) == -3:
+            return True
+        else:
+            self.add_message('W9001', node=node, line=node.fromlineno)
+            return False
+
+        return True
+
+    def has_period(self, node):
+        """has_period checks if one line doc end-with '.' .
+        Args:
+            node (astroid.node): the node is visiting.
+        Returns:
+            True if successful otherwise False.
+        """
+        if node.doc is None:
+            return True
+
+        if len(node.doc.splitlines()) > 1:
+            return True
+
+        if not node.doc.strip().endswith('.'):
+            self.add_message('W9002', node=node, line=node.fromlineno)
+            return False
+
+        return True
+
+    def with_raises(self, node, doc):
+        """with_raises checks if one line doc end-with '.' .
+        Args:
+            node (astroid.node): the node is visiting.
+            doc (Docstring): Docstring object.
+        Returns:
+            True if successful otherwise False.
+        """
+
+        find = False
+        for t in node.body:
+            if not isinstance(t, astroid.Raise):
+                continue
+
+            find = True
+            break
+
+        if not find:
+            return True
+
+        if len(doc.get_raises()) == 0:
+            self.add_message('W9008', node=node, line=node.fromlineno)
+            return False
+
+        return True
+
+    def with_returns(self, node, doc):
+        """with_returns checks if docstring comments what are returned .
+        Args:
+            node (astroid.node): the node is visiting.
+            doc (Docstring): Docstring object.
+        Returns:
+            True if successful otherwise False.
+        """
+
+        find = False
+        for t in node.body:
+            if not isinstance(t, astroid.Return):
+                continue
+
+            find = True
+            break
+
+        if not find:
+            return True
+
+        if len(doc.get_returns()) == 0:
+            self.add_message('W9007', node=node, line=node.fromlineno)
+            return False
+
+        return True
+
+    def all_args_in_doc(self, node, doc):
+        """all_args_in_doc checks if arguments are mentioned in doc
+        Args:
+            node (astroid.node): the node is visiting.
+            doc (Docstring): Docstring object
+        Returns:
+            True if successful otherwise False.
+        """
+        args = []
+        for arg in node.args.get_children():
+            if (not isinstance(arg, astroid.AssignName)) \
+                or arg.name == "self":
+                continue
+            args.append(arg.name)
+
+        if len(args) <= 0:
+            return True
+
+        parsed_args = doc.args
+        if len(args) > 0 and len(parsed_args) <= 0:
+            print "debug:parsed args: ", parsed_args
+            self.add_message('W9003', node=node, line=node.fromlineno)
+            return False
+
+        for t in args:
+            if t not in parsed_args:
+                print t, " with (type) not in ", parsed_args
+                self.add_message('W9003', node=node, line=node.fromlineno)
+                return False
+
+        return True
diff --git a/tools/codestyle/pylint_pre_commit.hook b/tools/codestyle/pylint_pre_commit.hook
new file mode 100755
index 0000000000000000000000000000000000000000..e7c92ba671e0eb778b2ab5447bea7c4b14fe761b
--- /dev/null
+++ b/tools/codestyle/pylint_pre_commit.hook
@@ -0,0 +1,19 @@
+#!/bin/bash
+
+TOTAL_ERRORS=0
+
+
+DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+export PYTHONPATH=$DIR:$PYTHONPATH
+
+# The trick to remove deleted files: https://stackoverflow.com/a/2413151
+for file in $(git diff --cached --name-status | awk '$1 != "D" {print $2}'); do
+    pylint --disable=all --load-plugins=docstring_checker \
+    --enable=doc-string-one-line,doc-string-end-with,doc-string-with-all-args,doc-string-triple-quotes,doc-string-missing,doc-string-indent-error,doc-string-with-returns,doc-string-with-raises $file;
+    TOTAL_ERRORS=$(expr $TOTAL_ERRORS + $?);
+done
+
+#exit $TOTAL_ERRORS
+#For now, just warning:
+exit 0
+
diff --git a/tools/codestyle/test_docstring_checker.py b/tools/codestyle/test_docstring_checker.py
new file mode 100644
index 0000000000000000000000000000000000000000..0547f7d1610c64b0ca6efa9384e97d658c8276fe
--- /dev/null
+++ b/tools/codestyle/test_docstring_checker.py
@@ -0,0 +1,232 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import docstring_checker
+import pylint.testutils
+import astroid
+import pytest
+import sys
+
+
+class TestDocstring(pylint.testutils.CheckerTestCase):
+    CHECKER_CLASS = docstring_checker.DocstringChecker
+
+    def test_one_line(self):
+        func_node = astroid.extract_node('''
+        def test(): 
+            """get 
+            news.
+            """
+            if True:
+                return 5
+            return 5
+        ''')
+
+        self.checker.visit_functiondef(func_node)
+        got = self.linter.release_messages()
+        assert len(got) == 1
+        assert 'W9001' == got[0][0]
+
+    def test_one_line(self):
+        func_node = astroid.extract_node('''
+        def test(): 
+            """get news"""
+            if True:
+                return 5
+            return 5
+        ''')
+
+        self.checker.visit_functiondef(func_node)
+        got = self.linter.release_messages()
+        assert len(got) == 1
+        assert 'W9002' == got[0][0]
+
+    def test_args(self):
+        func_node = astroid.extract_node('''
+        def test(scale, mean): 
+            """get news.
+            Args:
+                scale (int): scale is the number.
+            """
+            mean=scale
+            mean=scale
+            mean=scale
+            mean=scale
+            mean=scale
+            mean=scale
+            mean=scale
+        ''')
+
+        self.checker.visit_functiondef(func_node)
+        got = self.linter.release_messages()
+        assert len(got) == 1
+        assert 'W9003' == got[0][0]
+
+    def test_missing(self):
+        func_node = astroid.extract_node('''
+        def test(): 
+            mean=scale
+            mean=scale
+            mean=scale
+            mean=scale
+            mean=scale
+            mean=scale
+            mean=scale
+            mean=scale
+            mean=scale
+            mean=scale
+            mean=scale
+        ''')
+
+        self.checker.visit_functiondef(func_node)
+        got = self.linter.release_messages()
+        assert len(got) == 1
+        assert 'W9005' == got[0][0]
+
+    def test_indent(self):
+        func_node = astroid.extract_node('''
+        def test(): 
+            """ get get get get get get get get
+              get get get get get get get get.
+            """
+            pass 
+        ''')
+
+        self.checker.visit_functiondef(func_node)
+        got = self.linter.release_messages()
+        assert len(got) == 1
+        assert 'W9006' == got[0][0]
+
+    def test_with_resturns(self):
+        func_node = astroid.extract_node('''
+        def test(): 
+            """get news.
+            Args:
+                scale (int): scale is the number.
+            """
+            mean=scale
+            mean=scale
+            mean=scale
+            mean=scale
+            mean=scale
+            mean=scale
+            mean=scale
+            mean=scale
+            mean=scale
+            mean=scale
+            mean=scale
+            return mean
+        ''')
+
+        self.checker.visit_functiondef(func_node)
+        got = self.linter.release_messages()
+        assert len(got) == 1
+        assert 'W9007' == got[0][0]
+
+    def test_with_raises(self):
+        func_node = astroid.extract_node('''
+        def test(): 
+            """get news.
+            Args:
+                scale (int): scale is the number.
+            """
+            mean=scale
+            mean=scale
+            mean=scale
+            mean=scale
+            mean=scale
+            mean=scale
+            mean=scale
+            mean=scale
+            mean=scale
+            mean=scale
+            mean=scale
+            raise ValueError('A very specific bad thing happened.')
+        ''')
+
+        self.checker.visit_functiondef(func_node)
+        got = self.linter.release_messages()
+        assert len(got) == 1
+        assert 'W9008' == got[0][0]
+
+    def test_no_message(self):
+        p = '''
+def fc(input,
+       size,
+       num_flatten_dims=1,
+       param_attr=None,
+       bias_attr=None,
+       act=None,
+       name=None):
+    """
+    **Fully Connected Layer**
+    The fully connected layer can take multiple tensors as its inputs. It
+    creates a variable called weights for each input tensor, which represents
+    a fully connected weight matrix from each input unit to each output unit.
+    The fully connected layer multiplies each input tensor with its coresponding
+    weight to produce an output Tensor. If multiple input tensors are given,
+    the results of multiple multiplications will be sumed up. If bias_attr is
+    not None, a bias variable will be created and added to the output. Finally,
+    if activation is not None, it will be applied to the output as well.
+    This process can be formulated as follows:
+
+    Args:
+        input (Variable|list of Variable): The input tensor(s) of this layer, and the dimension of
+            the input tensor(s) is at least 2.
+        size(int): The number of output units in this layer.
+        num_flatten_dims (int, default 1): The fc layer can accept an input tensor with more than
+            two dimensions. If this happens, the multidimensional tensor will first be flattened
+            into a 2-dimensional matrix. The parameter `num_flatten_dims` determines how the input
+            tensor is flattened: the first `num_flatten_dims` (inclusive, index starts from 1)
+            dimensions will be flatten to form the first dimension of the final matrix (height of
+            the matrix), and the rest `rank(X) - num_flatten_dims` dimensions are flattened to
+            form the second dimension of the final matrix (width of the matrix). For example, suppose
+            `X` is a 6-dimensional tensor with a shape [2, 3, 4, 5, 6], and `num_flatten_dims` = 3.
+            Then, the flattened matrix will have a shape [2 x 3 x 4, 5 x 6] = [24, 30].
+        param_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for learnable
+            parameters/weights of this layer.
+        bias_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for the bias
+            of this layer. If it is set to None, no bias will be added to the output units.
+        act (str, default None): Activation to be applied to the output of this layer.
+        name (str, default None): The name of this layer.
+    Returns:
+        A tensor variable storing the transformation result.
+    Raises:
+        ValueError: If rank of the input tensor is less than 2.
+    Examples:
+        .. code-block:: python
+            data = fluid.layers.data(name="data", shape=[32, 32], dtype="float32")
+            fc = fluid.layers.fc(input=data, size=1000, act="tanh")
+    """
+    raise ValueError('A very specific bad thing happened.')
+    size = 1
+    size = 1
+    size = 1
+    size = 1
+    size = 1
+    size = 1
+    size = 1
+    size = 1
+    size = 1
+    size = 1
+    size = 1
+    size = 1
+    size = 1
+    return size
+    '''
+
+        func_node = astroid.extract_node(p)
+        self.checker.visit_functiondef(func_node)
+        got = self.linter.release_messages()
+        assert len(got) == 0
diff --git a/tools/manylinux1/README.md b/tools/manylinux1/README.md
index 898e00bd37c7b7bcbcb4a56476ff10c87381e47a..0e5905040175047f5b79939d97a3efcf38992944 100644
--- a/tools/manylinux1/README.md
+++ b/tools/manylinux1/README.md
@@ -28,3 +28,38 @@ git clone https://github.com/paddlepaddle/paddle
 cd paddle/tools/manylinux1
 REPO=[yourrepo] ./build_all.sh
 ```
+
+## Build PaddlePaddle for the different Python ABIs
+
+Choose one of the following Python ABI and set the correct environment variables.
+
+- cp27-cp27m
+
+  ```bash
+  export LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs2/lib:${LD_LIBRARY_PATH#/opt/_internal/cpython-2.7.11-ucs4/lib:}
+  export PATH=/opt/python/cp27-cp27m/bin/:${PATH}
+  export PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/python/cp27-cp27m/bin/python
+        -DPYTHON_INCLUDE_DIR:PATH=/opt/python/cp27-cp27m/include/python2.7
+        -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-2.7.11-ucs2/lib/libpython2.7.so"
+  ```
+
+- cp27-cp27mu
+
+  ```bash
+  export LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs4/lib:${LD_LIBRARY_PATH#/opt/_internal/cpython-2.7.11-ucs2/lib:}
+  export PATH=/opt/python/cp27-cp27mu/bin/:${PATH}
+  export PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/python/cp27-cp27mu/bin/python
+        -DPYTHON_INCLUDE_DIR:PATH=/opt/python/cp27-cp27mu/include/python2.7
+        -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-2.7.11-ucs4/lib/libpython2.7.so"
+  ```
+
+And then add the `PYTHON_FLAGS` as your cmake flags:
+
+```bash
+cmake ..
+  ${PYTHON_FLAGS} \
+  -DWITH_GPU=OFF \
+  ...
+```
+
+You can find more details about cmake flags at [here](http://www.paddlepaddle.org/docs/develop/documentation/fluid/en/build_and_install/build_from_source_en.html#appendix-build-options)
diff --git a/tools/test_runner.py b/tools/test_runner.py
new file mode 100644
index 0000000000000000000000000000000000000000..9dc750b89058cd73355a2f7984d577252c03526d
--- /dev/null
+++ b/tools/test_runner.py
@@ -0,0 +1,48 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import os
+import sys
+import paddle.fluid as fluid
+import importlib
+import cStringIO
+
+
+def main():
+    sys.path.append(os.getcwd())
+    some_test_failed = False
+    for module_name in sys.argv[1:]:
+        buffer = cStringIO.StringIO()
+        main = fluid.Program()
+        startup = fluid.Program()
+        scope = fluid.core.Scope()
+        with fluid.program_guard(main, startup):
+            with fluid.scope_guard(scope):
+                with fluid.unique_name.guard():
+                    test_loader = unittest.TestLoader()
+                    module = importlib.import_module(module_name)
+                    tests = test_loader.loadTestsFromModule(module)
+                    res = unittest.TextTestRunner(stream=buffer).run(tests)
+                    if not res.wasSuccessful():
+                        some_test_failed = True
+                        print >> sys.stderr, module_name, 'failed\n', buffer.getvalue(
+                        )
+
+    if some_test_failed:
+        exit(1)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/tools/timeline.py b/tools/timeline.py
index 8cd6353d46f496831cb61c1cdbbd156ca0579fb4..b413bb6fe0505df8fb09fa0759fefb6509b95bc9 100644
--- a/tools/timeline.py
+++ b/tools/timeline.py
@@ -171,7 +171,7 @@ if args.timeline_path:
 
 profile_paths = profile_path.split(',')
 profile_dict = dict()
-if len(profile_path) == 1:
+if len(profile_paths) == 1:
     with open(profile_path, 'r') as f:
         profile_s = f.read()
         profile_pb = profiler_pb2.Profile()