diff --git a/.clang-format b/.clang-format
index aff93435f58c522f5ed1090aef2005f76e91cf31..8b5830627348c6bff12260b7d9adbd357f074718 100644
--- a/.clang-format
+++ b/.clang-format
@@ -19,7 +19,7 @@ BasedOnStyle: Google
IndentWidth: 2
TabWidth: 2
ContinuationIndentWidth: 4
-AccessModifierOffset: -2 # The private/protected/public has no indent in class
+AccessModifierOffset: -1 # The private/protected/public is indented by 1 space in class
Standard: Cpp11
AllowAllParametersOfDeclarationOnNextLine: true
BinPackParameters: false
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 6140340890c0e5025eb08209e8ea78df918b4dc0..eeda759ff18ccb86ce6a585fe41cb972ea3ae295 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -34,6 +34,14 @@ repos:
entry: bash ./tools/codestyle/cpplint_pre_commit.hook
language: system
files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx)$
+- repo: local
+ hooks:
+ - id: pylint-doc-string
+ name: pylint
+ description: Check python docstring style using docstring_checker.
+ entry: bash ./tools/codestyle/pylint_pre_commit.hook
+ language: system
+ files: \.(py)$
- repo: https://github.com/PaddlePaddle/pre-commit-golang
sha: 8337620115c25ff8333f1b1a493bd031049bd7c0
hooks:
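For context, the hook added above runs `tools/codestyle/pylint_pre_commit.hook` over every `*.py` file. As a hypothetical illustration (the checker's actual rules live in `tools/codestyle` and are not part of this diff), a Google-style docstring of the kind such pylint-based checkers typically enforce looks like this:

```python
def scale(value, factor=1.0):
    """Scale a value by a constant factor.

    Args:
        value (float): The number to scale.
        factor (float): The multiplier applied to value. Default: 1.0.

    Returns:
        float: The scaled result.
    """
    return value * factor
```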
diff --git a/.travis.yml b/.travis.yml
index 3391e2c3cab9938c9dc5705b51367c707d3bbe9d..8c772030925dcad3909f142b08e4d8057a3f89b7 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -18,6 +18,8 @@ env:
addons:
ssh_known_hosts: 13.229.163.131
before_install:
+  # For pylint docstring checker
+ - sudo pip install pylint pytest astroid isort
- |
function timeout() { perl -e 'alarm shift; exec @ARGV' "$@"; }
script:
diff --git a/AUTHORS.md b/AUTHORS.md
index 4ee05420982d13f686cf13e8957ce41dfcdd2cb8..11f227be7148d8d6e055538347a8c31679406c84 100644
--- a/AUTHORS.md
+++ b/AUTHORS.md
@@ -4,6 +4,7 @@
| backyes | Yan-Fei Wang |
| baiyfbupt | Yi-Fan Bai |
| beckett1124 | Bin Qi |
+| ChengduoZH | Cheng-Duo Zhao |
| chengxiaohua1105 | Xiao-Hua Cheng |
| cxwangyi, yiwangbaidu, wangkuiyi | Yi Wang |
| cxysteven | Xing-Yi Cheng |
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 710b4774ca021c2e916460e7253d4fbf979a38cc..cfaab206e1f321a55119d4a8d65c4a99d3819fff 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -57,7 +57,10 @@ option(GLIDE_INSTALL "Download and install go dependencies " ON)
option(USE_NNPACK "Compile PaddlePaddle with NNPACK library" OFF)
option(WITH_DISTRIBUTE "Compile with grpc distributed support" OFF)
option(USE_EIGEN_FOR_BLAS "Use matrix multiplication in Eigen" OFF)
+option(EIGEN_USE_THREADS "Compile with multi-threaded Eigen" OFF)
option(WITH_ARM_FP16 "Use half precision support on armv8.2-a cpu" OFF)
+option(WITH_FAST_BUNDLE_TEST "Bundle tests that can be run in a single process together to reduce launch overhead" OFF)
+option(WITH_CONTRIB "Compile third-party contributions" OFF)
# CMAKE_BUILD_TYPE
if(NOT CMAKE_BUILD_TYPE)
@@ -202,7 +205,7 @@ endif(USE_NNPACK)
add_subdirectory(proto)
-if(NOT MOBILE_INFERENCE)
+if(NOT MOBILE_INFERENCE AND NOT WITH_FLUID_ONLY)
# "add_subdirectory(go)" should be placed after the following loine,
# because it depends on paddle/optimizer.
add_subdirectory(paddle/optimizer)
@@ -230,3 +233,7 @@ if(WITH_DOC)
find_python_module(recommonmark REQUIRED)
add_subdirectory(doc)
endif()
+
+if (WITH_CONTRIB)
+ add_subdirectory(paddle/contrib)
+endif()
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 3c36cffcb4eeaaf7f8cff5167777628dd2697e7d..b1b02bcc2f4fd14297715bcf5bfd1617e3d5f0c9 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -58,6 +58,8 @@ PaddlePaddle uses this [Git branching model](http://nvie.com/posts/a-successful-
create mode 100644 233
```
+ NOTE: The `yapf` versions installed by `pip install pre-commit` and by `conda install -c conda-forge pre-commit` differ slightly. Paddle developers use `pip install pre-commit`.
+
1. Build and test
Users can build PaddlePaddle natively on Linux and Mac OS X. But to unify the building environment and to make it easy for debugging, the recommended way is [using Docker](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/build_en.md).
diff --git a/Dockerfile b/Dockerfile
index ea39efd00bb5c0a7deb3f6d57083d83a673b883c..4d6165b79a1d94b8f27d7f3ee1b6e2cee5992d31 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -29,7 +29,7 @@ RUN apt-get update && \
wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \
curl sed grep graphviz libjpeg-dev zlib1g-dev \
python-matplotlib gcc-4.8 g++-4.8 \
- automake locales clang-format swig doxygen cmake \
+ automake locales clang-format swig cmake \
liblapack-dev liblapacke-dev \
clang-3.8 llvm-3.8 libclang-3.8-dev \
net-tools libtool ccache && \
@@ -79,6 +79,9 @@ RUN pip install pre-commit 'ipython==5.3.0' && \
pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
pip install opencv-python
+# For docstring checker
+RUN pip install pylint pytest astroid isort
+
COPY ./python/requirements.txt /root/
RUN pip install -r /root/requirements.txt
@@ -101,6 +104,3 @@ RUN echo 'root:root' | chpasswd
RUN sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config
RUN sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config
EXPOSE 22
-
-# development image default do build work
-CMD ["bash", "/paddle/paddle/scripts/docker/build.sh"]
diff --git a/Dockerfile.android b/Dockerfile.android
index 848a7eba6f1421432addae8acff407b611adb4ae..48db2efea21a648657e3f490c95429b9a29ede52 100644
--- a/Dockerfile.android
+++ b/Dockerfile.android
@@ -40,5 +40,3 @@ RUN mkdir -p ${ANDROID_TOOLCHAINS_DIR} && \
unzip -q android-ndk-r14b-linux-x86_64.zip && \
mv android-ndk-r14b ${ANDROID_NDK_HOME} && \
rm -rf /opt/android-ndk-tmp
-
-CMD ["bash", "/paddle/paddle/scripts/docker/build_android.sh"]
diff --git a/benchmark/cluster/README.md b/benchmark/cluster/README.md
deleted file mode 100644
index 64816098a524f064ec12474a736cd4c721227a70..0000000000000000000000000000000000000000
--- a/benchmark/cluster/README.md
+++ /dev/null
@@ -1,196 +0,0 @@
-# Cluster Training Benchmark
-
-## Setup
-
-- Platform
- - Kubernetes: v1.6.2
- - Linux Kernel: v3.10.0
-
-- Resource
- - CPU: 10 Cores per Pod
- - Memory: 5GB per Pod
-
-- Docker Image
-
-  We use different base Docker images to run the benchmark on Kubernetes:
- - PaddlePaddle v2: paddlepaddle/paddle:0.11.0
- - PaddlePaddle Fluid: paddlepaddle/paddle:[commit-id]
- - TensorFlow: tensorflow/tensorflow:1.5.0-rc0
-
-- Model
- vgg16 is used in this benchmark.
-
-## Cases
-
-- Variable
- - Batch Size of training data.
- - PServer count of the training job.
- - The number of trainers.
-
-- Invariant
- - The resource of trainer/pserver Pod.
-
-### Measure the Performance for Different Batch Size
-
-- PServer Count: 40
-- Trainer Count: 100
-- Metrics: mini-batch / sec
-
-| Batch Size | 32 | 64 | 128 | 256 |
-| --- | --- | --- | --- | --- |
-| PaddlePaddle Fluid | - | - | - | - |
-| PaddlePaddle v2 | - | - | - | - |
-| TensorFlow | - | - | - | - |
-
-### Measure the Performance for Different PServer Count
-
-- Trainer Count: 100
-- Batch Size: 64
-- Metrics: mini-batch / sec
-
-| PServer Count | 10 | 20 | 40 | 60 |
-| --- | --- | --- | --- | --- |
-| PaddlePaddle Fluid | - | - | - | - |
-| PaddlePaddle v2 | - | - | - | - |
-| TensorFlow | - | - | - | - |
-
-### Measure Parallel Efficiency By Increasing Trainer Count
-
-- PServer Count: 20
-- Batch Size: 64
-- Metrics:
-
-$S = \frac{T_1}{T_N}$
-
-where S is the ratio of T1 to TN, the training times with 1 trainer and with N trainers respectively.
-The parallel efficiency is:
-
-$E = \frac{S}{N}$
-
-| Trainer Count | 1 | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 |
-| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
-| PaddlePaddle Fluid | - | - | - | - | - | - | - | - | - | - | - |
-| PaddlePaddle v2 | - | - | - | - | - | - | - | - | - | - | - |
-| TensorFlow | - | - | - | - | - | - | - | - | - | - | - |
-
-## Reproduce the benchmark
-
-TODO
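The removed README defines speedup as $S = \frac{T_1}{T_N}$ and efficiency as $E = \frac{S}{N}$ but never filled in measurements. As a worked example with made-up timings (assumptions, not benchmark results):

```python
# Worked example of S = T1 / TN and E = S / N, using hypothetical timings.
t1 = 1000.0  # seconds per pass with 1 trainer (assumed)
tn = 125.0   # seconds per pass with N trainers (assumed)
n = 10       # trainer count

speedup = t1 / tn          # S = 8.0
efficiency = speedup / n   # E = 0.8, i.e. 80% of ideal linear scaling
print("S = %.1f, E = %.2f" % (speedup, efficiency))
```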
diff --git a/benchmark/cluster/vgg16/Dockerfile b/benchmark/cluster/vgg16/Dockerfile
deleted file mode 100644
index 13ad8e1b6237e6f41a076c4fb54311728832ae33..0000000000000000000000000000000000000000
--- a/benchmark/cluster/vgg16/Dockerfile
+++ /dev/null
@@ -1,35 +0,0 @@
-FROM nvidia/cuda:8.0-cudnn5-runtime-ubuntu16.04
-
-# you can get mirror list here:
-# https://launchpad.net/ubuntu/+archivemirrors
-ARG UBUNTU_MIRROR
-RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com/ubuntu#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi'
-
-RUN apt-get update && apt-get install -y python python-dev python-pip iputils-ping libgtk2.0-dev
-RUN pip install -U kubernetes opencv-python
-
-RUN pip install paddlepaddle
-# if the network is slow, you may need to add a proxy here.
-# ENV https_proxy=
-RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.cifar.train10()" | python'
-RUN pip uninstall -y paddlepaddle
-# unset the proxy if it is set.
-# ENV https_proxy=""
-
-# NOTE: By default, CI-built wheel packages are built with WITH_DISTRIBUTE=OFF,
-# so we must build one with distribute support to install in this image.
-ADD *.whl /
-RUN pip install /*.whl && rm -f /*.whl
-ENV LD_LIBRARY_PATH=/usr/local/lib
-
-# tf k8s
-RUN pip install tensorflow==1.4.0
-ADD tf_k8s /usr/bin
-RUN chmod +x /usr/bin/tf_k8s
-ADD vgg16_tf.py /workspace/
-
-# below lines may change a lot for debugging
-ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin
-ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/k8s_tools.py /root
-RUN chmod +x /usr/bin/paddle_k8s
-ADD vgg16_fluid.py vgg16_v2.py /workspace/
diff --git a/benchmark/cluster/vgg16/README.md b/benchmark/cluster/vgg16/README.md
deleted file mode 100644
index d56a912b9b03986e32693363f82df05a34b779e9..0000000000000000000000000000000000000000
--- a/benchmark/cluster/vgg16/README.md
+++ /dev/null
@@ -1,195 +0,0 @@
-# Performance for Distributed vgg16
-
-## Test Result
-
-### Hardware Information
-
-- CPU: Intel(R) Xeon(R) CPU E5-2620 v4 @ 2.10GHz
-- cpu MHz : 2101.000
-- cache size : 20480 KB
-
-### Blas settings
-
-Setting environment variable: `MKL_NUM_THREADS=1`.
-
-### Single Node Single Thread
-
-- Metrics: samples / sec
-
-| Batch Size | 32 | 64 | 128 | 256 |
-| --- | --- | --- | --- | --- |
-| PaddlePaddle Fluid | 15.44 | 16.32 | 16.74 | 16.79 |
-| PaddlePaddle v2 | 15.97 | 17.04 | 17.60 | 17.83 |
-| TensorFlow | 9.09 | 9.10 | 9.24 | 8.66 |
-
-### Different Batch Size
-
-- PServer Count: 10
-- Trainer Count: 20
-- Metrics: samples / sec
-
-| Batch Size | 32 | 64 | 128 | 256 |
-| --- | --- | --- | --- | --- |
-| PaddlePaddle Fluid | 190.20 | 222.15 | 247.40 | 258.18 |
-| PaddlePaddle v2 | 170.96 | 233.71 | 256.14 | 329.23 |
-| TensorFlow | - | - | - | - |
-
-### Acceleration Rate
-
-- Pserver Count: 20
-- Batch Size: 128
-- Metrics: samples / sec
-
-| Trainer Count | 20 | 40 | 80 | 100 |
-| --- | --- | --- | --- | --- |
-| PaddlePaddle Fluid | 263.29 (78.64%) | 518.80 (77.47%) | 836.26 (62.44%) | 1019.29 (60.89%) |
-| PaddlePaddle v2 (need more tests) | 326.85 (92.85%) | 534.58 (75.93%) | 853.30 (60.60%) | 1041.99 (59.20%) |
-| TensorFlow | - | - | - | - |
-
-### Different Pserver Count
-
-- Trainer Count: 60
-- Batch Size: 128
-- Metrics: samples / sec
-
-| PServer Count | 3 | 6 | 10 | 20 |
-| --- | --- | --- | --- | --- |
-| PaddlePaddle Fluid (should fix in next PR) | 589.1 | 592.6 | 656.4 | 655.8 |
-| PaddlePaddle v2 (need more tests) | 593.4 | 791.3 | 729.7 | 821.7 |
-| TensorFlow | - | - | - | - |
-
-*The performance gap between Fluid and v2 comes from network interference.*
-
-
-## Steps to Run the Performance Test
-
-1. You must re-compile PaddlePaddle and enable `-DWITH_DISTRIBUTE` to build PaddlePaddle with distributed support.
-1. When the build finishes, copy the output `whl` package located under `build/python/dist` to current directory.
-1. Run `docker build -t [image:tag] .` to build the docker image and run `docker push [image:tag]` to push the image to a repository so Kubernetes can find it.
-1. Run `kubectl create -f pserver.yaml && kubectl create -f trainer.yaml` to start the job on your kubernetes cluster (you must configure the `kubectl` client before this step).
-1. Run `kubectl get po` to get running pods, and run `kubectl logs [podID]` to fetch the pod log of pservers and trainers.
-
-Check the logs for the distributed training progress and analyze the performance.
-
-## Enable Verbose Logs
-
-Edit `pserver.yaml` and `trainer.yaml` and add the environment variables `GLOG_v=3` and `GLOG_logtostderr=1` to see what happened in detail.
diff --git a/benchmark/cluster/vgg16/fluid_pserver.yaml b/benchmark/cluster/vgg16/fluid_pserver.yaml
deleted file mode 100644
index ee8b0763b62fc011f40f6197e929a68b48a93e47..0000000000000000000000000000000000000000
--- a/benchmark/cluster/vgg16/fluid_pserver.yaml
+++ /dev/null
@@ -1,72 +0,0 @@
-apiVersion: extensions/v1beta1
-kind: ReplicaSet
-metadata:
- name: vgg16job-pserver
-spec:
- replicas: 10
- template:
- metadata:
- labels:
- paddle-job-pserver: vgg16job
- spec:
- hostNetwork: true
- imagePullSecrets:
- - name: job-registry-secret
- containers:
- - name: pserver
- image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
- imagePullPolicy: Always
- ports:
- - name: jobport-30236
- containerPort: 30236
- env:
- - name: PADDLE_JOB_NAME
- value: vgg16job
- - name: MKL_NUM_THREADS
- value: "1"
- - name: TRAINING_ROLE
- value: "PSERVER"
- - name: TRAINERS
- value: "20"
- - name: PSERVERS
- value: "10"
- - name: TOPOLOGY
- value: ""
- - name: ENTRY
- value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0"
- - name: TRAINER_PACKAGE
- value: "/workspace"
- - name: PADDLE_INIT_PORT
- value: "30236"
- - name: PADDLE_INIT_NICS
- value: "xgbe0"
- - name: PADDLE_INIT_TRAINER_COUNT
- value: "1"
- - name: PADDLE_INIT_PORTS_NUM
- value: "1"
- - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
- value: "1"
- - name: PADDLE_INIT_NUM_GRADIENT_SERVERS
- value: "20"
- - name: PADDLE_INIT_NUM_PASSES
- value: "1"
- - name: PADDLE_INIT_USE_GPU
- value: "0"
- - name: LD_LIBRARY_PATH
- value: "/usr/local/lib:/usr/local/nvidia/lib64"
- - name: NAMESPACE
- valueFrom:
- fieldRef:
- fieldPath: "metadata.namespace"
- - name: POD_IP
- valueFrom:
- fieldRef:
- fieldPath: "status.podIP"
- command: ["paddle_k8s", "start_fluid"]
- resources:
- requests:
- memory: 10Gi
- cpu: 4
- limits:
- memory: 10Gi
- cpu: 4
diff --git a/benchmark/cluster/vgg16/fluid_trainer.yaml b/benchmark/cluster/vgg16/fluid_trainer.yaml
deleted file mode 100644
index 3d56caac009464d1073423bb63abff1f8b0cf28f..0000000000000000000000000000000000000000
--- a/benchmark/cluster/vgg16/fluid_trainer.yaml
+++ /dev/null
@@ -1,69 +0,0 @@
-apiVersion: batch/v1
-kind: Job
-metadata:
- name: vgg16job-trainer
-spec:
- parallelism: 20
- completions: 20
- template:
- metadata:
- labels:
- paddle-job: vgg16job
- spec:
- imagePullSecrets:
- - name: job-registry-secret
- hostNetwork: true
- containers:
- - name: trainer
- image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
- imagePullPolicy: Always
- command: ["paddle_k8s", "start_fluid"]
- env:
- - name: PADDLE_JOB_NAME
- value: vgg16job
- - name: TRAINING_ROLE
- value: "TRAINER"
- - name: TRAINERS
- value: "20"
- - name: PSERVERS
- value: "10"
- - name: TOPOLOGY
- value: ""
- - name: ENTRY
- value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0 --batch_size 128"
- - name: TRAINER_PACKAGE
- value: "/workspace"
- - name: PADDLE_INIT_PORT
- value: "30236"
- - name: PADDLE_INIT_NICS
- value: "xgbe0"
- - name: PADDLE_INIT_TRAINER_COUNT
- value: "1"
- - name: PADDLE_INIT_PORTS_NUM
- value: "1"
- - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
- value: "1"
- - name: PADDLE_INIT_NUM_GRADIENT_SERVERS
- value: "20"
- - name: PADDLE_INIT_NUM_PASSES
- value: "1"
- - name: PADDLE_INIT_USE_GPU
- value: "0"
- - name: LD_LIBRARY_PATH
- value: "/usr/local/lib:/usr/local/nvidia/lib64"
- - name: NAMESPACE
- valueFrom:
- fieldRef:
- fieldPath: "metadata.namespace"
- - name: POD_IP
- valueFrom:
- fieldRef:
- fieldPath: "status.podIP"
- resources:
- requests:
- memory: 40Gi
- cpu: 2
- limits:
- memory: 40Gi
- cpu: 2
- restartPolicy: Never
diff --git a/benchmark/cluster/vgg16/run_vgg_dist.sh b/benchmark/cluster/vgg16/run_vgg_dist.sh
deleted file mode 100644
index 8c0501439e9d5fa175f5aa9b62d286e690a10904..0000000000000000000000000000000000000000
--- a/benchmark/cluster/vgg16/run_vgg_dist.sh
+++ /dev/null
@@ -1,21 +0,0 @@
-#!/bin/bash
-
-# Update to point to the source file.
-VGG_SRC="vgg16_fluid.py"
-
-export TRAINING_ROLE=PSERVER
-export TRAINERS=2
-export POD_IP=127.0.0.1
-export PADDLE_INIT_PORT=6174
-MKL_NUM_THREADS=1 python -u ${VGG_SRC} --local 0 --ps_host=127.0.0.1:6174 --trainer_hosts=127.0.0.1:6174 &
-
-# Need to wait for the ps to start first.
-sleep 10
-echo "done start ps"
-
-export TRAINING_ROLE=TRAINER
-export TRAINERS=2
-export POD_IP=127.0.0.1
-export PADDLE_INIT_PORT=6174
-CUDA_VISIBLE_DEVICES=4 MKL_NUM_THREADS=1 python -u ${VGG_SRC} --local 0 --ps_host=127.0.0.1:6174 --trainer_hosts=127.0.0.1:6174 --device=GPU --task_index=0 &
-CUDA_VISIBLE_DEVICES=5 MKL_NUM_THREADS=1 python -u ${VGG_SRC} --local 0 --ps_host=127.0.0.1:6174 --trainer_hosts=127.0.0.1:6174 --device=GPU --task_index=1 &
diff --git a/benchmark/cluster/vgg16/tf_k8s b/benchmark/cluster/vgg16/tf_k8s
deleted file mode 100644
index 4fc263d5f681aeabfa71f1758714d269d987b272..0000000000000000000000000000000000000000
--- a/benchmark/cluster/vgg16/tf_k8s
+++ /dev/null
@@ -1,82 +0,0 @@
-#!/bin/bash
-check_trainer_ret() {
- ret=$1
- stdbuf -oL echo "job returned $ret...setting pod return message..."
- stdbuf -oL echo "==============================="
-
- if [ $ret -eq 136 ] ; then
- echo "Error Arithmetic Operation(Floating Point Exception)" > /dev/termination-log
- elif [ $ret -eq 139 ] ; then
- echo "Segmentation Fault" > /dev/termination-log
- elif [ $ret -eq 1 ] ; then
- echo "General Error" > /dev/termination-log
- elif [ $ret -eq 134 ] ; then
- echo "Program Abort" > /dev/termination-log
- fi
-  stdbuf -oL echo "termination log written..."
- exit $ret
-}
-
-g_pservers=""
-g_trainers=""
-
-wait_running_pods(){
- pserver_label="tf-job-pserver=${JOB_NAME}"
- trainer_label="tf-job-trainer=${JOB_NAME}"
-
- stdbuf -oL python /root/k8s_tools.py wait_pods_running ${pserver_label} ${PSERVERS_NUM}
- stdbuf -oL python /root/k8s_tools.py wait_pods_running ${trainer_label} ${TRAINERS_NUM}
-
- g_pservers=$(python /root/k8s_tools.py fetch_endpoints ${pserver_label} ${PORT})
- g_trainers=$(python /root/k8s_tools.py fetch_endpoints ${trainer_label} ${PORT})
-}
-
-start_tf_pserver(){
- wait_running_pods
-
- label="tf-job-pserver=${JOB_NAME}"
- pserver_id=$(python /root/k8s_tools.py fetch_id ${label})
-
- cmd="${ENTRY} --ps_hosts=${g_pservers} --worker_hosts=${g_trainers} \
- --job_name=${TF_JOB_NAME} --task_index=${pserver_id}"
-
- stdbuf -oL sh -c "cd ${TRAINER_PACKAGE} && ${cmd}"
-}
-
-start_tf_trainer(){
- wait_running_pods
-
- label="tf-job-trainer=${JOB_NAME}"
- trainer_id=$(python /root/k8s_tools.py fetch_id ${label})
-
- cmd="${ENTRY} --ps_hosts=${g_pservers} --worker_hosts=${g_trainers} \
- --job_name=${TF_JOB_NAME} --task_index=${trainer_id} --batch_size=${BATCH_SIZE}"
-
- stdbuf -oL sh -c "cd ${TRAINER_PACKAGE} && ${cmd}"
- check_trainer_ret $?
-}
-
-start_tf(){
- if [[ "${TF_JOB_NAME}" == "worker" ]]; then
- start_tf_trainer
- else
- start_tf_pserver
- fi
-}
-
-usage() {
- echo "usage: tf_k8s []:"
- echo " start_tf Start tensorflow jobs"
-}
-
-case "$1" in
- start_tf)
- start_tf
- ;;
- --help)
- usage
- ;;
- *)
- usage
- ;;
-esac
diff --git a/benchmark/cluster/vgg16/tf_pserver.yaml b/benchmark/cluster/vgg16/tf_pserver.yaml
deleted file mode 100644
index 5e37c700819119c8af05c40fe4b8d13911efc3e1..0000000000000000000000000000000000000000
--- a/benchmark/cluster/vgg16/tf_pserver.yaml
+++ /dev/null
@@ -1,56 +0,0 @@
-apiVersion: extensions/v1beta1
-kind: ReplicaSet
-metadata:
- name: vgg16job-tf-pserver
-spec:
- replicas: 10
- template:
- metadata:
- labels:
- tf-job-pserver: vgg16job-tf
- spec:
- hostNetwork: true
- imagePullSecrets:
- - name: job-registry-secret
- containers:
- - name: pserver
- image: "registry.baidu.com/paddlepaddle/fluid_benchmark_tf:vgg16"
- imagePullPolicy: Always
- command: ["tf_k8s", "start_tf"]
- ports:
- - name: jobport-30236
- containerPort: 30236
- env:
- - name: PORT
- value: "32036"
- - name: ENTRY
- value: "python vgg16_tf.py"
- - name: JOB_NAME
- value: vgg16job-tf
- - name: PSERVERS_NUM
- value: "10"
- - name: TF_JOB_NAME
- value: "ps"
- - name: TRAINERS_NUM
- value: "20"
- - name: BATCH_SIZE
- value: "128"
- - name: TRAINER_PACKAGE
- value: "/workspace"
- - name: NUM_PASSES
- value: "1"
- - name: NAMESPACE
- valueFrom:
- fieldRef:
- fieldPath: "metadata.namespace"
- - name: POD_IP
- valueFrom:
- fieldRef:
- fieldPath: "status.podIP"
- resources:
- requests:
- memory: 10Gi
- cpu: 4
- limits:
- memory: 10Gi
- cpu: 4
diff --git a/benchmark/cluster/vgg16/tf_trainer.yaml b/benchmark/cluster/vgg16/tf_trainer.yaml
deleted file mode 100644
index 08795df3addfa7b618db24a65e57be190e268f06..0000000000000000000000000000000000000000
--- a/benchmark/cluster/vgg16/tf_trainer.yaml
+++ /dev/null
@@ -1,58 +0,0 @@
-apiVersion: batch/v1
-kind: Job
-metadata:
- name: vgg16job-tf-trainer
-spec:
- parallelism: 20
- completions: 20
- template:
- metadata:
- labels:
- tf-job-trainer: vgg16job-tf
- spec:
- imagePullSecrets:
- - name: job-registry-secret
- hostNetwork: true
- containers:
- - name: trainer
- image: "registry.baidu.com/paddlepaddle/fluid_benchmark_tf:vgg16"
- imagePullPolicy: Always
- command: ["tf_k8s", "start_tf"]
- ports:
- - name: jobport-30236
- containerPort: 30236
- env:
- - name: PORT
- value: "32036"
- - name: JOB_NAME
- value: vgg16job-tf
- - name: TF_JOB_NAME
- value: "worker"
- - name: ENTRY
- value: "python vgg16_tf.py"
- - name: PSERVERS_NUM
- value: "10"
- - name: BATCH_SIZE
- value: "128"
- - name: TRAINERS_NUM
- value: "20"
- - name: TRAINER_PACKAGE
- value: "/workspace"
- - name: NUM_PASSES
- value: "1"
- - name: NAMESPACE
- valueFrom:
- fieldRef:
- fieldPath: "metadata.namespace"
- - name: POD_IP
- valueFrom:
- fieldRef:
- fieldPath: "status.podIP"
- resources:
- requests:
- memory: 40Gi
- cpu: 2
- limits:
- memory: 40Gi
- cpu: 2
- restartPolicy: Never
diff --git a/benchmark/cluster/vgg16/v2_pserver.yaml b/benchmark/cluster/vgg16/v2_pserver.yaml
deleted file mode 100644
index dd1271e0cf399184134c06b3200ee1202c65cef0..0000000000000000000000000000000000000000
--- a/benchmark/cluster/vgg16/v2_pserver.yaml
+++ /dev/null
@@ -1,64 +0,0 @@
-apiVersion: extensions/v1beta1
-kind: ReplicaSet
-metadata:
- name: vgg16v2job-pserver
-spec:
- replicas: 10
- template:
- metadata:
- labels:
- paddle-job-pserver: vgg16v2job
- spec:
- hostNetwork: true
- imagePullSecrets:
- - name: job-registry-secret
- containers:
- - name: pserver
- image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
- imagePullPolicy: Always
- ports:
- - name: jobport-30236
- containerPort: 30236
- env:
- - name: PADDLE_JOB_NAME
- value: vgg16v2job
- - name: TRAINERS
- value: "20"
- - name: PSERVERS
- value: "10"
- - name: TOPOLOGY
- value: ""
- - name: ENTRY
- value: "python train.py"
- - name: TRAINER_PACKAGE
- value: "/workspace"
- - name: PADDLE_INIT_PORT
- value: "30236"
- - name: PADDLE_INIT_NICS
- value: "xgbe0"
- - name: PADDLE_INIT_TRAINER_COUNT
- value: "1"
- - name: PADDLE_INIT_PORTS_NUM
- value: "1"
- - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
- value: "1"
- - name: PADDLE_INIT_NUM_GRADIENT_SERVERS
- value: "20"
- - name: PADDLE_INIT_NUM_PASSES
- value: "1"
- - name: PADDLE_INIT_USE_GPU
- value: "0"
- - name: LD_LIBRARY_PATH
- value: "/usr/local/lib:/usr/local/nvidia/lib64"
- - name: NAMESPACE
- valueFrom:
- fieldRef:
- fieldPath: "metadata.namespace"
- command: ["paddle_k8s", "start_pserver"]
- resources:
- requests:
- memory: 10Gi
- cpu: 4
- limits:
- memory: 10Gi
- cpu: 4
diff --git a/benchmark/cluster/vgg16/v2_trainer.yaml b/benchmark/cluster/vgg16/v2_trainer.yaml
deleted file mode 100644
index 12c8964066cbcfe8d2a44de2f51a3d12ea422fe2..0000000000000000000000000000000000000000
--- a/benchmark/cluster/vgg16/v2_trainer.yaml
+++ /dev/null
@@ -1,65 +0,0 @@
-apiVersion: batch/v1
-kind: Job
-metadata:
- name: vgg16v2job-trainer
-spec:
- parallelism: 20
- completions: 20
- template:
- metadata:
- labels:
- paddle-job: vgg16v2job
- spec:
- imagePullSecrets:
- - name: job-registry-secret
- hostNetwork: true
- containers:
- - name: trainer
- image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
- imagePullPolicy: Always
- command: ["paddle_k8s", "start_trainer", "v2"]
- env:
- - name: PADDLE_JOB_NAME
- value: vgg16v2job
- - name: BATCH_SIZE
- value: "256"
- - name: TRAINERS
- value: "20"
- - name: PSERVERS
- value: "10"
- - name: TOPOLOGY
- value: ""
- - name: ENTRY
- value: "cd /workspace && MKL_NUM_THREADS=1 python /workspace/vgg16_v2.py"
- - name: TRAINER_PACKAGE
- value: "/workspace"
- - name: PADDLE_INIT_PORT
- value: "30236"
- - name: PADDLE_INIT_NICS
- value: "xgbe0"
- - name: PADDLE_INIT_TRAINER_COUNT
- value: "1"
- - name: PADDLE_INIT_PORTS_NUM
- value: "1"
- - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
- value: "1"
- - name: PADDLE_INIT_NUM_GRADIENT_SERVERS
- value: "20"
- - name: PADDLE_INIT_NUM_PASSES
- value: "2"
- - name: PADDLE_INIT_USE_GPU
- value: "0"
- - name: LD_LIBRARY_PATH
- value: "/usr/local/lib:/usr/local/nvidia/lib64"
- - name: NAMESPACE
- valueFrom:
- fieldRef:
- fieldPath: "metadata.namespace"
- resources:
- requests:
- memory: 40Gi
- cpu: 2
- limits:
- memory: 40Gi
- cpu: 2
- restartPolicy: Never
diff --git a/benchmark/cluster/vgg16/vgg16_fluid.py b/benchmark/cluster/vgg16/vgg16_fluid.py
deleted file mode 100644
index e9360ab4c79d23bdf9f84d0c0d407af6d39bde3e..0000000000000000000000000000000000000000
--- a/benchmark/cluster/vgg16/vgg16_fluid.py
+++ /dev/null
@@ -1,312 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""VGG16 benchmark in Fluid"""
-from __future__ import print_function
-
-import sys
-import time
-import numpy as np
-import paddle.v2 as paddle
-import paddle.fluid as fluid
-import paddle.fluid.core as core
-import paddle.fluid.profiler as profiler
-import argparse
-import functools
-import os
-from paddle.fluid import debuger
-
-
-def str2bool(v):
- if v.lower() in ('yes', 'true', 't', 'y', '1'):
- return True
- elif v.lower() in ('no', 'false', 'f', 'n', '0'):
- return False
- else:
- raise argparse.ArgumentTypeError('Boolean value expected.')
-
-
-parser = argparse.ArgumentParser(description=__doc__)
-parser.add_argument(
- '--batch_size', type=int, default=16, help="Batch size for training.")
-parser.add_argument(
- '--learning_rate',
- type=float,
- default=1e-3,
- help="Learning rate for training.")
-parser.add_argument('--num_passes', type=int, default=50, help="No. of passes.")
-parser.add_argument(
- '--device',
- type=str,
- default='CPU',
- choices=['CPU', 'GPU'],
- help="The device type.")
-parser.add_argument('--device_id', type=int, default=0, help="The device id.")
-parser.add_argument(
- '--data_format',
- type=str,
- default='NCHW',
- choices=['NCHW', 'NHWC'],
-    help='The data order, now only supports NCHW.')
-parser.add_argument(
- '--data_set',
- type=str,
- default='flowers',
- choices=['cifar10', 'flowers'],
- help='Optional dataset for benchmark.')
-parser.add_argument(
- '--local',
- type=str2bool,
- default=True,
- help='Whether to run as local mode.')
-
-parser.add_argument(
- "--ps_hosts",
- type=str,
- default="",
- help="Comma-separated list of hostname:port pairs")
-parser.add_argument(
- "--trainer_hosts",
- type=str,
- default="",
- help="Comma-separated list of hostname:port pairs")
-parser.add_argument(
- "--profile", action='store_true', help="If set, profile a few steps.")
-
-# Flags for defining the tf.train.Server
-parser.add_argument(
- "--task_index", type=int, default=0, help="Index of task within the job")
-args = parser.parse_args()
-
-
-def vgg16_bn_drop(input):
- def conv_block(input, num_filter, groups, dropouts):
- return fluid.nets.img_conv_group(
- input=input,
- pool_size=2,
- pool_stride=2,
- conv_num_filter=[num_filter] * groups,
- conv_filter_size=3,
- conv_act='relu',
- conv_with_batchnorm=True,
- conv_batchnorm_drop_rate=dropouts,
- pool_type='max')
-
- conv1 = conv_block(input, 64, 2, [0.3, 0])
- conv2 = conv_block(conv1, 128, 2, [0.4, 0])
- conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0])
- conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0])
- conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0])
-
- drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5)
- fc1 = fluid.layers.fc(input=drop, size=4096, act=None)
- bn = fluid.layers.batch_norm(input=fc1, act='relu')
- drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5)
- fc2 = fluid.layers.fc(input=drop2, size=4096, act=None)
- return fc2
-
-
-def main():
- if args.data_set == "cifar10":
- classdim = 10
- if args.data_format == 'NCHW':
- data_shape = [3, 32, 32]
- else:
- data_shape = [32, 32, 3]
- else:
- classdim = 102
- if args.data_format == 'NCHW':
- data_shape = [3, 224, 224]
- else:
- data_shape = [224, 224, 3]
-
- # Input data
- images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
- label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-
- # Train program
- net = vgg16_bn_drop(images)
- predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
- cost = fluid.layers.cross_entropy(input=predict, label=label)
- avg_cost = fluid.layers.mean(x=cost)
-
- # Evaluator
- batch_size = fluid.layers.create_tensor(dtype='int64')
- batch_acc = fluid.layers.accuracy(
- input=predict, label=label, total=batch_size)
-
- # inference program
- inference_program = fluid.default_main_program().clone()
- with fluid.program_guard(inference_program):
- inference_program = fluid.io.get_inference_program(batch_acc)
-
- # Optimization
- optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
- optimize_ops, params_grads = optimizer.minimize(avg_cost)
-
- # Initialize executor
- place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(
- args.device_id)
- exe = fluid.Executor(place)
-
- # test
- def test(exe):
- test_pass_acc = fluid.average.WeightedAverage()
- for batch_id, data in enumerate(test_reader()):
- img_data = np.array(map(lambda x: x[0].reshape(data_shape),
- data)).astype("float32")
- y_data = np.array(map(lambda x: x[1], data)).astype("int64")
- y_data = y_data.reshape([-1, 1])
-
- outs = exe.run(inference_program,
- feed={"pixel": img_data,
- "label": y_data},
- fetch_list=[batch_acc, batch_size])
- test_pass_acc.add(value=np.array(outs[0]), weight=np.array(outs[1]))
-
- return test_pass_acc.eval()
-
- def train_loop(exe, trainer_prog):
- iters = 0
- ts = time.time()
- train_pass_acc = fluid.average.WeightedAverage()
- for pass_id in range(args.num_passes):
- # train
- start_time = time.time()
- num_samples = 0
- train_pass_acc.reset()
-
- def run_step(batch_id, data):
- img_data = np.array(
- map(lambda x: x[0].reshape(data_shape), data)).astype(
- "float32")
- y_data = np.array(map(lambda x: x[1], data)).astype("int64")
- y_data = y_data.reshape([-1, 1])
-
- loss, acc, b_size = exe.run(
- trainer_prog,
- feed={"pixel": img_data,
- "label": y_data},
- fetch_list=[avg_cost, batch_acc, batch_size])
- return loss, acc, b_size
-
- if args.profile:
- with profiler.profiler('All', 'total',
- '/tmp/profile_vgg_%d' % args.task_index):
- for batch_id, data in enumerate(train_reader()):
- if batch_id > 5: break
- run_step(batch_id, data)
-
- total_time = 0.0
- count = 0
- for batch_id, data in enumerate(train_reader()):
- ts = time.time()
- loss, acc, b_size = run_step(batch_id, data)
- iters += 1
- num_samples += len(data)
- train_pass_acc.add(value=acc, weight=b_size)
-
- duration = time.time() - ts
- total_time += duration
- count += len(data)
- print(
- "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, "
- "Speed = %.2f (%.2f) img/s" % (pass_id, iters, loss, acc,
- len(data) / duration,
- count / total_time)
-                    ) # The accuracy is accumulated over batches, not just the current batch.
-
- pass_elapsed = time.time() - start_time
- pass_train_acc = train_pass_acc.eval()
- pass_test_acc = test(exe)
- print("Task:%d Pass = %d, Training performance = %f imgs/s, "
- "Train accuracy = %f, Test accuracy = %f\n" %
- (args.task_index, pass_id, num_samples / pass_elapsed,
- pass_train_acc, pass_test_acc))
-
- if args.local:
- # Parameter initialization
- exe.run(fluid.default_startup_program())
-
- # data reader
- train_reader = paddle.batch(
- paddle.reader.shuffle(
- paddle.dataset.cifar.train10() if args.data_set == 'cifar10'
- else paddle.dataset.flowers.train(),
- buf_size=5120),
- batch_size=args.batch_size)
- test_reader = paddle.batch(
- paddle.dataset.cifar.test10()
- if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
- batch_size=args.batch_size)
- train_loop(exe, fluid.default_main_program())
- else:
- trainers = int(os.getenv("TRAINERS")) # total trainer count
- print("trainers total: ", trainers)
-
- training_role = os.getenv(
- "TRAINING_ROLE",
- "TRAINER") # get the training role: trainer/pserver
-
- t = fluid.DistributeTranspiler()
- t.transpile(
- trainer_id=args.task_index,
- pservers=args.ps_hosts,
- trainers=trainers)
-
- if training_role == "PSERVER":
- current_endpoint = os.getenv("POD_IP") + ":" + os.getenv(
- "PADDLE_INIT_PORT")
- if not current_endpoint:
- print("need env SERVER_ENDPOINT")
- exit(1)
- pserver_prog = t.get_pserver_program(current_endpoint)
- pserver_startup = t.get_startup_program(current_endpoint,
- pserver_prog)
- exe.run(pserver_startup)
- exe.run(pserver_prog)
- elif training_role == "TRAINER":
- # Parameter initialization
- exe.run(fluid.default_startup_program())
-
- # data reader
- train_reader = paddle.batch(
- paddle.reader.shuffle(
- paddle.dataset.cifar.train10() if args.data_set == 'cifar10'
- else paddle.dataset.flowers.train(),
- buf_size=5120),
- batch_size=args.batch_size)
- test_reader = paddle.batch(
- paddle.dataset.cifar.test10() if args.data_set == 'cifar10' else
- paddle.dataset.flowers.test(),
- batch_size=args.batch_size)
-
- trainer_prog = t.get_trainer_program()
- feeder = fluid.DataFeeder(feed_list=[images, label], place=place)
- # TODO(typhoonzero): change trainer startup program to fetch parameters from pserver
- exe.run(fluid.default_startup_program())
- train_loop(exe, trainer_prog)
- else:
-        print("environment var TRAINING_ROLE should be TRAINER or PSERVER")
-
-
-def print_arguments():
- print('----------- Configuration Arguments -----------')
- for arg, value in sorted(vars(args).iteritems()):
- print('%s: %s' % (arg, value))
- print('------------------------------------------------')
-
-
-if __name__ == "__main__":
- print_arguments()
- main()
diff --git a/benchmark/cluster/vgg16/vgg16_tf.py b/benchmark/cluster/vgg16/vgg16_tf.py
deleted file mode 100644
index 2d220478acae46566760209dbc012cff316946aa..0000000000000000000000000000000000000000
--- a/benchmark/cluster/vgg16/vgg16_tf.py
+++ /dev/null
@@ -1,366 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""VGG16 benchmark in TensorFlow
-You can find example templates for distributed TensorFlow here:
-https://medium.com/clusterone/how-to-write-distributed-tensorflow-code-with-an-example-on-tensorport-70bf3306adcb
-https://www.tensorflow.org/deploy/distributed
-"""
-
-import tensorflow as tf
-import paddle.v2 as paddle
-import numpy as np
-import argparse
-import time
-
-parser = argparse.ArgumentParser(description=__doc__)
-parser.add_argument(
- '--batch_size', type=int, default=128, help="Batch size for training.")
-parser.add_argument(
- '--learning_rate',
- type=float,
- default=1e-3,
- help="Learning rate for training.")
-parser.add_argument('--num_passes', type=int, default=50, help="No. of passes.")
-parser.add_argument(
- '--device',
- type=str,
- default='CPU',
- choices=['CPU', 'GPU'],
- help="The device type.")
-parser.add_argument(
- '--data_format',
- type=str,
- default='NHWC',
- choices=['NCHW', 'NHWC'],
- help='The data order, NCHW=[batch, channels, height, width].'
-    'Only supports NHWC right now.')
-parser.add_argument(
- '--data_set',
- type=str,
- default='cifar10',
- choices=['cifar10', 'flowers'],
- help='Optional dataset for benchmark.')
-
-parser.add_argument(
- "--ps_hosts",
- type=str,
- default="",
- help="Comma-separated list of hostname:port pairs")
-parser.add_argument(
- "--worker_hosts",
- type=str,
- default="",
- help="Comma-separated list of hostname:port pairs")
-parser.add_argument(
- "--job_name", type=str, default="", help="One of 'worker', 'ps'")
-# Flags for defining the tf.train.Server
-parser.add_argument(
- "--task_index", type=int, default=0, help="Index of task within the job")
-
-args = parser.parse_args()
-
-
-class VGG16Model(object):
- def __init__(self):
- self.parameters = []
-
- def batch_norm_relu(self, inputs, is_training):
- """Performs a batch normalization followed by a ReLU."""
- # We set fused=True for a significant speed boost. See
- # https://www.tensorflow.org/speed/speed_guide#common_fused_ops
- inputs = tf.layers.batch_normalization(
- inputs=inputs,
- axis=1 if args.data_format == 'NCHW' else -1,
- momentum=0.9,
- epsilon=1e-05,
- center=True,
- scale=True,
- training=is_training,
- fused=True)
- inputs = tf.nn.relu(inputs)
- return inputs
-
- def conv_bn_layer(self,
- name,
- images,
- kernel_shape,
- is_training,
- drop_rate=0.0):
- with tf.name_scope(name) as scope:
- kernel = tf.Variable(
- tf.truncated_normal(
- kernel_shape, dtype=tf.float32, stddev=1e-1),
- name='weights')
- conv = tf.nn.conv2d(
- images,
- kernel, [1, 1, 1, 1],
- data_format=args.data_format,
- padding='SAME')
- biases = tf.Variable(
- tf.constant(
- 0.0, shape=[kernel_shape[-1]], dtype=tf.float32),
- trainable=True,
- name='biases')
- out = tf.nn.bias_add(conv, biases)
- out = self.batch_norm_relu(out, is_training)
- out = tf.layers.dropout(out, rate=drop_rate, training=is_training)
- return out
-
- def fc_layer(self, name, inputs, shape):
- with tf.name_scope(name) as scope:
- fc_w = tf.Variable(
- tf.truncated_normal(
- shape, dtype=tf.float32, stddev=1e-1),
- name='weights')
- fc_b = tf.Variable(
- tf.constant(
- 0.0, shape=[shape[-1]], dtype=tf.float32),
- trainable=True,
- name='biases')
- out = tf.nn.bias_add(tf.matmul(inputs, fc_w), fc_b)
- return out
-
- def network(self, images, class_dim, is_training):
- """ VGG16 model structure.
-
- TODO(kuke): enable this network to support the 'NCHW' data format
- """
-
- # conv1
- conv1_1 = self.conv_bn_layer(
- 'conv1_1', images, [3, 3, 3, 64], is_training, drop_rate=0.3)
- conv1_2 = self.conv_bn_layer(
- 'conv1_2', conv1_1, [3, 3, 64, 64], is_training, drop_rate=0.0)
- # pool1
- pool1 = tf.nn.max_pool(
- conv1_2,
- ksize=[1, 2, 2, 1],
- strides=[1, 2, 2, 1],
- padding='SAME',
- name='pool1')
- # conv2
- conv2_1 = self.conv_bn_layer(
- 'conv2_1', pool1, [3, 3, 64, 128], is_training, drop_rate=0.4)
- conv2_2 = self.conv_bn_layer(
- 'conv2_2', conv2_1, [3, 3, 128, 128], is_training, drop_rate=0.0)
- # pool2
- pool2 = tf.nn.max_pool(
- conv2_2,
- ksize=[1, 2, 2, 1],
- strides=[1, 2, 2, 1],
- padding='SAME',
- name='pool2')
- # conv3
- conv3_1 = self.conv_bn_layer(
- 'conv3_1', pool2, [3, 3, 128, 256], is_training, drop_rate=0.4)
- conv3_2 = self.conv_bn_layer(
- 'conv3_2', conv3_1, [3, 3, 256, 256], is_training, drop_rate=0.4)
- conv3_3 = self.conv_bn_layer(
- 'conv3_3', conv3_2, [3, 3, 256, 256], is_training, drop_rate=0.0)
- # pool3
- pool3 = tf.nn.max_pool(
- conv3_3,
- ksize=[1, 2, 2, 1],
- strides=[1, 2, 2, 1],
- padding='SAME',
- name='pool3')
- # conv4
- conv4_1 = self.conv_bn_layer(
- 'conv4_1', pool3, [3, 3, 256, 512], is_training, drop_rate=0.4)
- conv4_2 = self.conv_bn_layer(
- 'conv4_2', conv4_1, [3, 3, 512, 512], is_training, drop_rate=0.4)
- conv4_3 = self.conv_bn_layer(
- 'conv4_3', conv4_2, [3, 3, 512, 512], is_training, drop_rate=0.0)
- # pool4
- pool4 = tf.nn.max_pool(
- conv4_3,
- ksize=[1, 2, 2, 1],
- strides=[1, 2, 2, 1],
- padding='SAME',
- name='pool4')
- # conv5
- conv5_1 = self.conv_bn_layer(
- 'conv5_1', pool4, [3, 3, 512, 512], is_training, drop_rate=0.4)
- conv5_2 = self.conv_bn_layer(
- 'conv5_2', conv5_1, [3, 3, 512, 512], is_training, drop_rate=0.4)
- conv5_3 = self.conv_bn_layer(
- 'conv5_3', conv5_2, [3, 3, 512, 512], is_training, drop_rate=0.0)
- # pool5
- pool5 = tf.nn.max_pool(
- conv5_3,
- ksize=[1, 2, 2, 1],
- strides=[1, 2, 2, 1],
- padding='SAME',
-        name='pool5')
- # flatten
- shape = int(np.prod(pool5.get_shape()[1:]))
- pool5_flat = tf.reshape(pool5, [-1, shape])
- # fc1
- drop = tf.layers.dropout(pool5_flat, rate=0.5, training=is_training)
- fc1 = self.fc_layer('fc1', drop, [shape, 512])
- # fc2
- bn = self.batch_norm_relu(fc1, is_training)
- drop = tf.layers.dropout(bn, rate=0.5, training=is_training)
- fc2 = self.fc_layer('fc2', drop, [512, 512])
-
- fc3 = self.fc_layer('fc3', fc2, [512, class_dim])
-
- return fc3
-
-
-def run_benchmark(cluster_spec, server):
- """Run benchmark on cifar10 or flowers."""
-
- if args.data_set == "cifar10":
- class_dim = 10
- raw_shape = (3, 32, 32)
- dat_shape = (None, 32, 32, 3) if args.data_format == 'NHWC' else (
- None, 3, 32, 32)
- else:
- class_dim = 102
- raw_shape = (3, 224, 224)
- dat_shape = (None, 224, 224, 3) if args.data_format == 'NHWC' else (
- None, 3, 224, 224)
-
- device = tf.train.replica_device_setter(
- worker_device="/job:worker/task:{}".format(args.task_index),
- cluster=cluster_spec)
-
- with tf.device(device):
- images = tf.placeholder(tf.float32, shape=dat_shape)
- labels = tf.placeholder(tf.int64, shape=(None, ))
- is_training = tf.placeholder('bool')
- onehot_labels = tf.one_hot(labels, depth=class_dim)
-
- vgg16 = VGG16Model()
- logits = vgg16.network(images, class_dim, is_training)
- loss = tf.losses.softmax_cross_entropy(
- onehot_labels=onehot_labels, logits=logits)
- avg_loss = tf.reduce_mean(loss)
-
- correct = tf.equal(tf.argmax(logits, 1), labels)
- accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
-
- optimizer = tf.train.AdamOptimizer(learning_rate=args.learning_rate)
- update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
- global_step = tf.Variable(0, name='global_step', trainable=False)
- with tf.control_dependencies(update_ops):
- train_op = optimizer.minimize(avg_loss, global_step=global_step)
-
- summary_op = tf.summary.merge_all()
- init_op = tf.global_variables_initializer()
-
- # data reader
- train_reader = paddle.batch(
- paddle.reader.shuffle(
- paddle.dataset.cifar.train10()
- if args.data_set == 'cifar10' else paddle.dataset.flowers.train(),
- buf_size=5120),
- batch_size=args.batch_size)
- test_reader = paddle.batch(
- paddle.reader.shuffle(
- paddle.dataset.cifar.test10()
- if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
- buf_size=5120),
- batch_size=args.batch_size)
-
- # test
- def test():
- test_accs = []
- for batch_id, data in enumerate(test_reader()):
- test_images = np.array(
- map(lambda x: np.transpose(x[0].reshape(raw_shape),
- axes=[1, 2, 0]) if args.data_format == 'NHWC' else x[0], data)).astype("float32")
- test_labels = np.array(map(lambda x: x[1], data)).astype('int64')
- test_accs.append(
- accuracy.eval(feed_dict={
- images: test_images,
- labels: test_labels,
- is_training: False
- }))
- return np.mean(test_accs)
-
- config = tf.ConfigProto(
- intra_op_parallelism_threads=1,
- inter_op_parallelism_threads=1,
- log_device_placement=True)
- config.gpu_options.allow_growth = True
-
- hooks = [tf.train.StopAtStepHook(last_step=1000000)]
-
- with tf.train.MonitoredTrainingSession(
- master=server.target,
- is_chief=(args.task_index == 0),
- hooks=hooks,
- config=config) as sess:
- iters, num_samples, start_time = 0, 0, 0.0
- for pass_id in range(args.num_passes):
- # train
- num_samples = 0
- start_time = time.time()
- for batch_id, data in enumerate(train_reader()):
- train_images = np.array(
- map(lambda x: np.transpose(x[0].reshape(raw_shape),
- axes=[1, 2, 0]) if args.data_format == 'NHWC' else x[0], data)).astype("float32")
- train_labels = np.array(map(lambda x: x[1], data)).astype(
- 'int64')
- iter_begin_time = time.time()
- _, loss, acc = sess.run([train_op, avg_loss, accuracy],
- feed_dict={
- images: train_images,
- labels: train_labels,
- is_training: True
- })
- iters += 1
- print(
- "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, Speed=%.2f imgs/sec"
- % (pass_id, iters, loss, acc,
- len(data) / (time.time() - iter_begin_time)))
- num_samples += len(data)
- train_elapsed = time.time() - start_time
- # test
- pass_test_acc = test()
- print("Pass = %d, Train speed = %f imgs/s, Test accuracy = %f\n" %
- (pass_id, num_samples / train_elapsed, pass_test_acc))
-
-
-def print_arguments():
- print('----------- Configuration Arguments -----------')
- for arg, value in sorted(vars(args).iteritems()):
- print('%s: %s' % (arg, value))
- print('------------------------------------------------')
-
-
-if __name__ == '__main__':
- print_arguments()
-
- ps_hosts = args.ps_hosts.split(",")
- worker_hosts = args.worker_hosts.split(",")
-
- # Create a cluster from the parameter server and worker hosts.
- cluster_spec = tf.train.ClusterSpec({
- "ps": ps_hosts,
- "worker": worker_hosts
- })
-
- # Create and start a server for the local task.
- server = tf.train.Server(
- cluster_spec, job_name=args.job_name, task_index=args.task_index)
-
- if args.job_name == "ps":
- print("start pserver")
- server.join()
- elif args.job_name == "worker":
- print("start worker")
- run_benchmark(cluster_spec, server)
diff --git a/benchmark/cluster/vgg16/vgg16_v2.py b/benchmark/cluster/vgg16/vgg16_v2.py
deleted file mode 100644
index 1a66af32d7131997c63bd3c3042875f33a467084..0000000000000000000000000000000000000000
--- a/benchmark/cluster/vgg16/vgg16_v2.py
+++ /dev/null
@@ -1,154 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-#Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-#Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
-#limitations under the License.
-
-import gzip
-
-import paddle.v2.dataset.cifar as cifar
-import paddle.v2 as paddle
-import time
-import os
-
-DATA_DIM = 3 * 32 * 32
-CLASS_DIM = 10
-BATCH_SIZE = os.getenv("BATCH_SIZE")
-if BATCH_SIZE:
- BATCH_SIZE = int(BATCH_SIZE)
-else:
- BATCH_SIZE = 128
-print "batch_size", BATCH_SIZE
-NODE_COUNT = int(os.getenv("TRAINERS"))
-ts = 0
-
-
-def vgg(input, nums, class_dim):
- def conv_block(input, num_filter, groups, num_channels=None):
- return paddle.networks.img_conv_group(
- input=input,
- num_channels=num_channels,
- pool_size=2,
- pool_stride=2,
- conv_num_filter=[num_filter] * groups,
- conv_filter_size=3,
- conv_act=paddle.activation.Relu(),
- pool_type=paddle.pooling.Max())
-
- assert len(nums) == 5
- # the channel of input feature is 3
- conv1 = conv_block(input, 64, nums[0], 3)
- conv2 = conv_block(conv1, 128, nums[1])
- conv3 = conv_block(conv2, 256, nums[2])
- conv4 = conv_block(conv3, 512, nums[3])
- conv5 = conv_block(conv4, 512, nums[4])
-
- fc_dim = 512
- fc1 = paddle.layer.fc(input=conv5,
- size=fc_dim,
- act=paddle.activation.Relu(),
- layer_attr=paddle.attr.Extra(drop_rate=0.5))
- fc2 = paddle.layer.fc(input=fc1,
- size=fc_dim,
- act=paddle.activation.Relu(),
- layer_attr=paddle.attr.Extra(drop_rate=0.5))
- out = paddle.layer.fc(input=fc2,
- size=class_dim,
- act=paddle.activation.Softmax())
- return out
-
-
-def vgg13(input, class_dim):
- nums = [2, 2, 2, 2, 2]
- return vgg(input, nums, class_dim)
-
-
-def vgg16(input, class_dim):
- nums = [2, 2, 3, 3, 3]
- return vgg(input, nums, class_dim)
-
-
-def vgg19(input, class_dim):
- nums = [2, 2, 4, 4, 4]
- return vgg(input, nums, class_dim)
-
-
-def main():
- global ts
- paddle.init(use_gpu=False)
- image = paddle.layer.data(
- name="image", type=paddle.data_type.dense_vector(DATA_DIM))
- lbl = paddle.layer.data(
- name="label", type=paddle.data_type.integer_value(CLASS_DIM))
-
- extra_layers = None
-    # NOTE: v2 distributed training needs to average the updates.
- learning_rate = 1e-3 / NODE_COUNT
- out = vgg16(image, class_dim=CLASS_DIM)
- cost = paddle.layer.classification_cost(input=out, label=lbl)
-
- # Create parameters
- parameters = paddle.parameters.create(cost)
-
- # Create optimizer
- optimizer = paddle.optimizer.Momentum(
- momentum=0.9,
- regularization=paddle.optimizer.L2Regularization(rate=0.0005 *
- BATCH_SIZE),
- learning_rate=learning_rate / BATCH_SIZE,
- learning_rate_decay_a=0.1,
- learning_rate_decay_b=128000 * 35,
- learning_rate_schedule="discexp", )
-
- train_reader = paddle.batch(
- paddle.reader.shuffle(
- cifar.train10(),
- # To use other data, replace the above line with:
- # reader.train_reader('train.list'),
- buf_size=1000),
- batch_size=BATCH_SIZE)
- test_reader = paddle.batch(
- cifar.test10(),
- # To use other data, replace the above line with:
- # reader.test_reader('val.list'),
- batch_size=BATCH_SIZE)
-
- # Create trainer
- trainer = paddle.trainer.SGD(cost=cost,
- parameters=parameters,
- update_equation=optimizer,
- extra_layers=extra_layers,
- is_local=False)
-
- # End batch and end pass event handler
- def event_handler(event):
- global ts, ts_pass
- if isinstance(event, paddle.event.BeginPass):
- ts_pass = time.time()
- if isinstance(event, paddle.event.BeginIteration):
- ts = time.time()
- if isinstance(event, paddle.event.EndIteration):
- if event.batch_id % 1 == 0:
- print "\nPass %d, Batch %d, Cost %f, %s, spent: %f" % (
- event.pass_id, event.batch_id, event.cost, event.metrics,
- time.time() - ts)
- if isinstance(event, paddle.event.EndPass):
- print "Pass %d end, spent: %f" % (event.pass_id,
- time.time() - ts_pass)
- result = trainer.test(reader=test_reader)
- print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)
-
- trainer.train(
- reader=train_reader, num_passes=200, event_handler=event_handler)
-
-
-if __name__ == '__main__':
- main()
diff --git a/benchmark/fluid/Dockerfile b/benchmark/fluid/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..46140a9d1be01a50cd74dab2799e3731e8d3debd
--- /dev/null
+++ b/benchmark/fluid/Dockerfile
@@ -0,0 +1,22 @@
+FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04
+RUN apt-get update && apt-get install -y python python-pip iputils-ping libgtk2.0-dev wget vim net-tools iftop
+RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.7 /usr/lib/libcudnn.so && ln -s /usr/lib/x86_64-linux-gnu/libnccl.so.2 /usr/lib/libnccl.so
+RUN pip install -U pip
+RUN pip install -U kubernetes opencv-python paddlepaddle
+
+# IMPORTANT:
+# Add "ENV http_proxy=http://ip:port" if your download is slow, and don't forget to unset it at runtime.
+
+RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.cifar.train10()\npaddle.dataset.flowers.fetch()" | python'
+RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.mnist.train()\npaddle.dataset.mnist.test()\npaddle.dataset.imdb.fetch()" | python'
+RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.imikolov.fetch()" | python'
+RUN pip uninstall -y paddlepaddle && mkdir /workspace
+
+ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin
+ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/k8s_tools.py /root
+
+ADD *.whl /
+RUN pip install /*.whl && rm -f /*.whl && chmod +x /usr/bin/paddle_k8s
+
+ENV LD_LIBRARY_PATH=/usr/local/lib
+ADD fluid_benchmark.py dataset.py models/ /workspace/
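The `echo ... | python` RUN lines above pre-download the benchmark datasets at image build time so containers never fetch them at run time. Written out as a plain script, the equivalent is roughly:

```python
# Rough script equivalent of the dataset pre-fetch RUN lines above;
# each call downloads the corresponding dataset into the local cache.
import paddle.v2 as paddle

paddle.dataset.cifar.train10()
paddle.dataset.flowers.fetch()
paddle.dataset.mnist.train()
paddle.dataset.mnist.test()
paddle.dataset.imdb.fetch()
paddle.dataset.imikolov.fetch()
```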
diff --git a/benchmark/fluid/README.md b/benchmark/fluid/README.md
index 0fc02b704362f79f2219252538b4b3195e665b2c..1b0c7dce8bd6faab0c4c59caa1cbe337483cbd16 100644
--- a/benchmark/fluid/README.md
+++ b/benchmark/fluid/README.md
@@ -24,31 +24,45 @@ Currently supported `--model` argument include:
* Run the following command to start a benchmark job locally:
```bash
- python fluid_benchmark.py --model mnist --parallel 1 --device GPU --with_test
+ python fluid_benchmark.py --model mnist --device GPU
```
You can choose to use GPU/CPU training. With GPU training, you can specify
- `--parallel 1` to run multi GPU training.
+  `--gpus <gpu_num>` to run multi GPU training.
* Run distributed training with parameter servers (the `PADDLE_*` variables in these commands are explained in the sketch after this list):
* start parameter servers:
```bash
- PADDLE_TRAINING_ROLE=PSERVER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=1 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist --parallel 0 --device GPU --update_method pserver
+ PADDLE_TRAINING_ROLE=PSERVER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=1 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist --device GPU --update_method pserver
```
* start trainers:
```bash
- PADDLE_TRAINING_ROLE=PSERVER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=1 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist --parallel 0 --device GPU --update_method pserver
+ PADDLE_TRAINING_ROLE=TRAINER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=1 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist --device GPU --update_method pserver
```
* Run distributed training using NCCL2
```bash
- PADDLE_PSERVER_PORT=7164 PADDLE_TRAINER_IPS=192.168.0.2,192.168.0.3 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist --parallel 0 --device GPU --update_method nccl2
+ PADDLE_PSERVER_PORT=7164 PADDLE_TRAINER_IPS=192.168.0.2,192.168.0.3 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist --device GPU --update_method nccl2
```
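The `PADDLE_*` environment variables in the commands above are how `fluid_benchmark.py` decides each process's role. A minimal sketch of that dispatch, assuming the variable names shown above (the real logic lives in `dist_transpile`, shown further down):

```python
import os

# Sketch: derive the process role and pserver endpoints from the PADDLE_*
# variables used in the commands above; defaults here are assumptions.
training_role = os.getenv("PADDLE_TRAINING_ROLE")       # "PSERVER" or "TRAINER"
trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "-1"))  # unique id, from 0
port = os.getenv("PADDLE_PSERVER_PORT", "7164")
pserver_ips = os.getenv("PADDLE_PSERVER_IPS", "")       # comma-separated IPs

# e.g. "127.0.0.1:7164" or "192.168.0.2:7164,192.168.0.3:7164"
pserver_endpoints = ",".join(
    "%s:%s" % (ip, port) for ip in pserver_ips.split(",") if ip)

if training_role == "PSERVER":
    print("start pserver on %s" % pserver_endpoints)
elif training_role == "TRAINER":
    print("start trainer %d against %s" % (trainer_id, pserver_endpoints))
```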
## Run Distributed Benchmark on Kubernetes Cluster
+You may need to build a Docker image before submitting a cluster job to Kubernetes; otherwise you will
+have to start all those processes manually on each node, which is not recommended.
+
+To build the Docker image, you need to choose a paddle "whl" package to run with. You may either
+download one from
+http://www.paddlepaddle.org/docs/develop/documentation/zh/build_and_install/pip_install_en.html or
+build it yourself. Once you have the "whl" package, put it under the current directory and run:
+
+```bash
+docker build -t [your docker image name]:[your docker image tag] .
+```
+
+Then push the image to a Docker registry that your Kubernetes cluster can reach.
+
We provide a script `kube_gen_job.py` to generate Kubernetes yaml files to submit
distributed benchmark jobs to your cluster. To generate a job yaml, just run:
```bash
-python kube_gen_job.py --jobname myjob --pscpu 4 --cpu 8 --gpu 8 --psmemory 20 --memory 40 --pservers 4 --trainers 4 --entry "python fluid_benchmark.py --model mnist --parallel 1 --device GPU --update_method pserver --with_test" --disttype pserver
+python kube_gen_job.py --jobname myjob --pscpu 4 --cpu 8 --gpu 8 --psmemory 20 --memory 40 --pservers 4 --trainers 4 --entry "python fluid_benchmark.py --model mnist --gpus 8 --device GPU --update_method pserver " --disttype pserver
```
Then the yaml files are generated under the directory `myjob`; you can run:
@@ -58,3 +72,14 @@ kubectl create -f myjob/
```
The job shall start.
+
+
+## Notes on Running Fluid Distributed Training with NCCL2 and RDMA
+
+Before running NCCL2 distributed jobs, check whether your node has multiple network interfaces. If it
+does, set the environment variable `NCCL_SOCKET_IFNAME` (e.g. `export NCCL_SOCKET_IFNAME=eth0`) so that
+NCCL uses the intended network device.
+
+To run high-performance distributed training, your hardware environment must support
+RDMA-enabled network communication; see [this note](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/howto/cluster/nccl2_rdma_training.md)
+for details.
diff --git a/benchmark/fluid/fluid_benchmark.py b/benchmark/fluid/fluid_benchmark.py
index 1d8f27440d0f1438e0520684ee3e90e8a5891a17..c1d458970a58bfac2a3369e8964eb100568b28f2 100644
--- a/benchmark/fluid/fluid_benchmark.py
+++ b/benchmark/fluid/fluid_benchmark.py
@@ -94,6 +94,12 @@ def parse_args():
'--memory_optimize',
action='store_true',
help='If set, optimize runtime memory before start.')
+ parser.add_argument(
+ '--use_fake_data',
+ action='store_true',
+        help='If set, omit the actual read data operators.')
+ parser.add_argument(
+ '--profile', action='store_true', help='If set, profile a few steps.')
parser.add_argument(
'--update_method',
type=str,
@@ -104,8 +110,8 @@ def parse_args():
return args
-def append_nccl2_prepare():
- if os.getenv("PADDLE_TRAINER_ID", None) != None:
+def append_nccl2_prepare(trainer_id):
+ if trainer_id >= 0:
# append gen_nccl_id at the end of startup program
trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
port = os.getenv("PADDLE_PSERVER_PORT")
@@ -132,12 +138,12 @@ def append_nccl2_prepare():
})
return nccl_id_var, num_trainers, trainer_id
else:
- raise Exception(
- "must set PADDLE_TRAINER_ID env variables for dist train.")
+        raise Exception("must set a positive PADDLE_TRAINER_ID env variable for "
+                        "nccl-based dist train.")
-def dist_transpile():
- if "PADDLE_TRAINING_ROLE" not in os.environ:
+def dist_transpile(trainer_id):
+ if trainer_id < 0:
return None, None
# the port of all pservers, needed by both trainer and pserver
@@ -154,9 +160,6 @@ def dist_transpile():
trainers = int(os.getenv("PADDLE_TRAINERS"))
# the IP of the local machine, needed by pserver only
current_endpoint = os.getenv("PADDLE_CURRENT_IP", "") + ":" + port
- # the unique trainer id, starting from 0, needed by trainer
- # only
- trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
# the role, should be either PSERVER or TRAINER
training_role = os.getenv("PADDLE_TRAINING_ROLE")
@@ -198,6 +201,10 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc,
exe.run(train_prog)
return
+ if args.use_fake_data:
+ raise Exception(
+ "fake data is not supported in single GPU test for now.")
+
place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
exe = fluid.Executor(place)
exe.run(startup_prog)
@@ -244,7 +251,31 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc,
def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
batch_acc, args, train_prog, startup_prog, nccl_id_var,
num_trainers, trainer_id):
+ feed_var_list = [
+ var for var in train_prog.global_block().vars.itervalues()
+ if var.is_data
+ ]
+ # generate fake:
+ if args.use_fake_data:
+ for var in feed_var_list:
+ v = startup_prog.global_block().clone_variable(var)
+ var.persistable = True
+ v.persistable = True
+
+ real_shape = list(var.shape)
+ real_shape[0] = args.batch_size / args.gpus
+ startup_prog.global_block().append_op(
+ outputs={"Out": v},
+ type="fill_constant",
+ attrs={"shape": real_shape,
+ "value": 1.0,
+ "dtype": var.dtype})
+
place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
+ if nccl_id_var and trainer_id == 0:
+ #FIXME(wuyi): wait other trainer to start listening
+        #FIXME(wuyi): wait for other trainers to start listening
+
startup_exe = fluid.Executor(place)
startup_exe.run(startup_prog)
strategy = fluid.ExecutionStrategy()
@@ -256,22 +287,27 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
exec_strategy=strategy,
num_trainers=num_trainers,
trainer_id=trainer_id)
- feed_var_list = [
- var for var in train_prog.global_block().vars.itervalues()
- if var.is_data
- ]
+
feeder = fluid.DataFeeder(feed_var_list, place)
for pass_id in range(args.pass_num):
num_samples = 0
iters = 0
start_time = time.time()
for batch_id, data in enumerate(train_reader()):
+ if args.profile and pass_id == 0 and batch_id == 5:
+ profiler.start_profiler("All")
+ elif args.profile and pass_id == 0 and batch_id == 10:
+ profiler.stop_profiler("total", "/tmp/profile_%d" % trainer_id)
+
if iters == args.skip_batch_num:
start_time = time.time()
num_samples = 0
if iters == args.iterations:
break
- loss, = exe.run([avg_loss.name], feed=feeder.feed(data))
+ if args.use_fake_data:
+ loss, = exe.run([avg_loss.name])
+ else:
+ loss, = exe.run([avg_loss.name], feed=feeder.feed(data))
if args.update_method == "pserver":
exe.bcast_params()
num_samples += len(data)
@@ -302,7 +338,11 @@ def print_arguments(args):
def main():
args = parse_args()
print_arguments(args)
- nccl_id_var, num_trainers, trainer_id = None, 1, 0
+
+ # the unique trainer id, starting from 0, needed by trainer
+ # only
+ nccl_id_var, num_trainers, trainer_id = (
+ None, 1, int(os.getenv("PADDLE_TRAINER_ID", "-1")))
if args.use_cprof:
pr = cProfile.Profile()
@@ -316,7 +356,7 @@ def main():
fluid.memory_optimize(fluid.default_main_program())
if args.update_method == "pserver":
- train_prog, startup_prog = dist_transpile()
+ train_prog, startup_prog = dist_transpile(trainer_id)
if not train_prog:
raise Exception(
"Must configure correct environments to run dist train.")
@@ -332,7 +372,7 @@ def main():
train_args.append(fluid.default_startup_program())
if args.update_method == "nccl2":
- nccl_id_var, num_trainers, trainer_id = append_nccl2_prepare()
+ nccl_id_var, num_trainers, trainer_id = append_nccl2_prepare(trainer_id)
if args.gpus == 1:
# NOTE: parallel executor use profiler interanlly
if args.use_nvprof and args.device == 'GPU':
diff --git a/benchmark/fluid/kube_gen_job.py b/benchmark/fluid/kube_gen_job.py
index 3dbb4b8c5dd13657f8d1853003b321ad047e1349..9da8a69af1d7b671b2648b1b3702776c1c0650b0 100644
--- a/benchmark/fluid/kube_gen_job.py
+++ b/benchmark/fluid/kube_gen_job.py
@@ -49,7 +49,7 @@ def parse_args():
parser.add_argument(
'--fluid', default=1, type=int, help='whether is fluid job')
parser.add_argument(
- '--rdma', action='store_ture', help='whether mount rdma libs')
+ '--rdma', action='store_true', help='whether mount rdma libs')
parser.add_argument(
'--disttype',
default="pserver",
@@ -112,6 +112,7 @@ def gen_job():
envs.append({"name": "PSERVERS", "value": str(args.pservers)})
envs.append({"name": "ENTRY", "value": args.entry})
envs.append({"name": "PADDLE_INIT_PORT", "value": str(args.port)})
+ envs.append({"name": "PADDLE_PSERVER_PORT", "value": str(args.port)})
# NOTE: these directories below are cluster specific, please modify
# this settings before you run on your own cluster.
envs.append({
diff --git a/benchmark/fluid/kube_templates/__init__.py b/benchmark/fluid/kube_templates/__init__.py
index b64a7f78ff10d03987ea4a8c13a0e34bb433f64c..2d09d940a5ee638e4b55405d05924e2d76006cfc 100644
--- a/benchmark/fluid/kube_templates/__init__.py
+++ b/benchmark/fluid/kube_templates/__init__.py
@@ -54,5 +54,13 @@ envs = [
"fieldPath": "status.podIP"
}
}
+ },
+ {
+ "name": "PADDLE_CURRENT_IP",
+ "valueFrom": {
+ "fieldRef": {
+ "fieldPath": "status.podIP"
+ }
+ }
}
]
diff --git a/benchmark/fluid/run.sh b/benchmark/fluid/run.sh
index f6dfd20bf2ee0b668b6d4238d4511253b2233035..afaab5f4de43fa7e94feeed4a1de991351c04b76 100644
--- a/benchmark/fluid/run.sh
+++ b/benchmark/fluid/run.sh
@@ -37,7 +37,8 @@ nohup stdbuf -oL nvidia-smi \
-l 1 &
# mnist
# mnist gpu mnist 128
-FLAGS_benchmark=true stdbuf -oL python fluid/mnist.py \
+FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
+ --model=mnist \
--device=GPU \
--batch_size=128 \
--skip_batch_num=5 \
@@ -46,7 +47,8 @@ FLAGS_benchmark=true stdbuf -oL python fluid/mnist.py \
# vgg16
# gpu cifar10 128
-FLAGS_benchmark=true stdbuf -oL python fluid/vgg16.py \
+FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
+ --model=vgg16 \
--device=GPU \
--batch_size=128 \
--skip_batch_num=5 \
@@ -54,7 +56,8 @@ FLAGS_benchmark=true stdbuf -oL python fluid/vgg16.py \
2>&1 | tee -a vgg16_gpu_128.log
# flowers gpu 128
-FLAGS_benchmark=true stdbuf -oL python fluid/vgg16.py \
+FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
+ --model=vgg16 \
--device=GPU \
--batch_size=32 \
--data_set=flowers \
@@ -64,40 +67,39 @@ FLAGS_benchmark=true stdbuf -oL python fluid/vgg16.py \
# resnet50
# resnet50 gpu cifar10 128
-FLAGS_benchmark=true stdbuf -oL python fluid/resnet50.py \
+FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
+ --model=resnet50 \
--device=GPU \
--batch_size=128 \
--data_set=cifar10 \
- --model=resnet_cifar10 \
--skip_batch_num=5 \
--iterations=30 \
2>&1 | tee -a resnet50_gpu_128.log
# resnet50 gpu flowers 64
-FLAGS_benchmark=true stdbuf -oL python fluid/resnet50.py \
+FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
+ --model=resnet50 \
--device=GPU \
--batch_size=64 \
--data_set=flowers \
- --model=resnet_imagenet \
--skip_batch_num=5 \
--iterations=30 \
2>&1 | tee -a resnet50_gpu_flowers_64.log
# lstm
# lstm gpu imdb 32 # tensorflow only support batch=32
-FLAGS_benchmark=true stdbuf -oL python fluid/stacked_dynamic_lstm.py \
+FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
+ --model=stacked_dynamic_lstm \
--device=GPU \
--batch_size=32 \
--skip_batch_num=5 \
--iterations=30 \
- --hidden_dim=512 \
- --emb_dim=512 \
- --crop_size=1500 \
2>&1 | tee -a lstm_gpu_32.log
# seq2seq
# seq2seq gpu wmb 128
-FLAGS_benchmark=true stdbuf -oL python fluid/machine_translation.py \
+FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
+ --model=machine_translation \
--device=GPU \
--batch_size=128 \
--skip_batch_num=5 \
diff --git a/cmake/configure.cmake b/cmake/configure.cmake
index e490397cc0624c310949a4b571bd00cac6e8953b..682614742cf1bd3130c638020a2545e16226d4d6 100644
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -41,6 +41,10 @@ if(USE_EIGEN_FOR_BLAS)
add_definitions(-DPADDLE_USE_EIGEN_FOR_BLAS)
endif(USE_EIGEN_FOR_BLAS)
+if(EIGEN_USE_THREADS)
+ add_definitions(-DEIGEN_USE_THREADS)
+endif(EIGEN_USE_THREADS)
+
if(NOT WITH_PROFILER)
add_definitions(-DPADDLE_DISABLE_PROFILER)
endif(NOT WITH_PROFILER)
diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake
index 0fde4373a4be58e71ff1a305bd4991cc554d7a34..2665996432b1f6681927320a85d6835094abe4cd 100644
--- a/cmake/external/protobuf.cmake
+++ b/cmake/external/protobuf.cmake
@@ -212,6 +212,7 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
${CMAKE_COMMAND} ${PROTOBUF_SOURCES_DIR}/src/${TARGET_NAME}/cmake
${OPTIONAL_ARGS}
-Dprotobuf_BUILD_TESTS=OFF
+ -DCMAKE_SKIP_RPATH=ON
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
-DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
-DCMAKE_INSTALL_PREFIX=${PROTOBUF_INSTALL_DIR}
diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake
index 3b13b2150514bd615667241272d287c7e55d4e74..236a55d332a91c88d1c5515e7aca4142930a079f 100644
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -56,24 +56,28 @@ set(dst_dir "${FLUID_INSTALL_DIR}/third_party/eigen3")
copy(eigen3_lib
SRCS ${EIGEN_INCLUDE_DIR}/Eigen/Core ${EIGEN_INCLUDE_DIR}/Eigen/src ${EIGEN_INCLUDE_DIR}/unsupported/Eigen
DSTS ${dst_dir}/Eigen ${dst_dir}/Eigen ${dst_dir}/unsupported
+ DEPS eigen3
)
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/gflags")
copy(gflags_lib
SRCS ${GFLAGS_INCLUDE_DIR} ${GFLAGS_LIBRARIES}
DSTS ${dst_dir} ${dst_dir}/lib
+ DEPS gflags
)
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/glog")
copy(glog_lib
SRCS ${GLOG_INCLUDE_DIR} ${GLOG_LIBRARIES}
DSTS ${dst_dir} ${dst_dir}/lib
+ DEPS glog
)
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/boost/")
copy(boost_lib
SRCS ${BOOST_INCLUDE_DIR}/boost
DSTS ${dst_dir}
+ DEPS boost
)
if(NOT PROTOBUF_FOUND)
@@ -81,6 +85,7 @@ if(NOT PROTOBUF_FOUND)
copy(protobuf_lib
SRCS ${PROTOBUF_INCLUDE_DIR} ${PROTOBUF_LIBRARY}
DSTS ${dst_dir} ${dst_dir}/lib
+ DEPS extern_protobuf
)
endif()
@@ -89,12 +94,14 @@ if(NOT CBLAS_FOUND)
copy(openblas_lib
SRCS ${CBLAS_INSTALL_DIR}/lib ${CBLAS_INSTALL_DIR}/include
DSTS ${dst_dir} ${dst_dir}
+ DEPS extern_openblas
)
elseif (WITH_MKLML)
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/mklml")
copy(mklml_lib
SRCS ${MKLML_LIB} ${MKLML_IOMP_LIB} ${MKLML_INC_DIR}
DSTS ${dst_dir}/lib ${dst_dir}/lib ${dst_dir}
+ DEPS mklml
)
endif()
@@ -103,6 +110,7 @@ if(WITH_MKLDNN)
copy(mkldnn_lib
SRCS ${MKLDNN_INC_DIR} ${MKLDNN_SHARED_LIB}
DSTS ${dst_dir} ${dst_dir}/lib
+ DEPS mkldnn
)
endif()
@@ -110,17 +118,20 @@ if(NOT MOBILE_INFERENCE AND NOT RPI)
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappy")
copy(snappy_lib
SRCS ${SNAPPY_INCLUDE_DIR} ${SNAPPY_LIBRARIES}
- DSTS ${dst_dir} ${dst_dir}/lib)
+ DSTS ${dst_dir} ${dst_dir}/lib
+ DEPS snappy)
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappystream")
copy(snappystream_lib
SRCS ${SNAPPYSTREAM_INCLUDE_DIR} ${SNAPPYSTREAM_LIBRARIES}
- DSTS ${dst_dir} ${dst_dir}/lib)
+ DSTS ${dst_dir} ${dst_dir}/lib
+ DEPS snappystream)
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/zlib")
copy(zlib_lib
SRCS ${ZLIB_INCLUDE_DIR} ${ZLIB_LIBRARIES}
- DSTS ${dst_dir} ${dst_dir}/lib)
+ DSTS ${dst_dir} ${dst_dir}/lib
+ DEPS zlib)
endif()
# paddle fluid module
diff --git a/doc/fluid/api/layers.rst b/doc/fluid/api/layers.rst
index 91449042fcdfd48c95f3dd3babf958c5d572e747..dbb99d3c03f39f650b2cb0dbe8ee49cd413db6e3 100644
--- a/doc/fluid/api/layers.rst
+++ b/doc/fluid/api/layers.rst
@@ -1003,9 +1003,15 @@ dice_loss
.. autofunction:: paddle.fluid.layers.dice_loss
:noindex:
-bilinear_interp
+upsampling_bilinear2d
____
-.. autofunction:: paddle.fluid.layers.bilinear_interp
+.. autofunction:: paddle.fluid.layers.upsampling_bilinear2d
+ :noindex:
+
+gather
+____
+
+.. autofunction:: paddle.fluid.layers.gather
:noindex:
diff --git a/doc/fluid/getstarted/Developer's_Guide_to_Paddle_Fluid.md b/doc/fluid/getstarted/Developer's_Guide_to_Paddle_Fluid.md
new file mode 100644
index 0000000000000000000000000000000000000000..79df6c59578e2acf495a3453ab61f069c3f09a49
--- /dev/null
+++ b/doc/fluid/getstarted/Developer's_Guide_to_Paddle_Fluid.md
@@ -0,0 +1,1819 @@
+
+# Developer's Guide to Paddle Fluid
+
+---
+
+### ==1==. Why do we need PaddlePaddle Fluid?
+
+---
+
+### Two fundamental questions
+
+1. How do we describe a machine learning model and its optimization process?
+   - The description must be complete and self-consistent, expressive enough for any computation that may arise
+1. How do we make full use of hardware resources for efficient computation?
+   - Support asynchronous devices, multi-GPU, and distributed computation
+   - Reduce the development cost of computation and of computation optimization
+   - ……
+
+
+
+---
+
+### How do we describe the model and the optimization process?
+
+|      |A sequence of layers executed in order |A computation graph of variables and operators |No explicit "model" concept |
+|------|---------------------------------------|-----------------------------------------------|----------------------------|
+|2013  |Caffe, Theano, Torch, PaddlePaddle     |                                               |                            |
+|2015  |                                       |TensorFlow, MxNet, Caffe2, ONNX, n-graph       |                            |
+|2016  |                                       |                                               |PyTorch, TensorFlow Eager Execution, **==PaddlePaddle Fluid==** |
+
+
+
+
+
+---
+
+
+### Goals
+
+- Improve the ability to describe all kinds of machine learning tasks: be able to describe any model that may arise.
+- Keep the code structure clear and the modules fully decoupled: internal and external contributors can focus on the modules they need and build on the framework.
+- By design, leave room and potential for technical optimization.
+- With the code decoupled, lower the development cost of multi-device support, computation optimization, and the like.
+- Under a unified design philosophy, achieve automatically scalable, automatically fault-tolerant distributed computation.
+
+
+
+---
+
+## ==2.== Design Overview
+
+---
+
+# Fluid: the shape of the system
+
+- [A compiler-like execution flow that separates compile time from runtime](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/motivation/fluid_compiler.md)
+
+
+
+
+
+
+---
+
+#### Let's separate compile time from runtime in a concrete Fluid program
+
+---
+### Fluid at compile time
+
+
+
+- ==**Define the forward computation**==
+
+ ```python
+ x = fluid.layers.data(name='x',shape=[13], dtype='float32')
+ y_predict = fluid.layers.fc(input=x, size=1, act=None)
+ y = fluid.layers.data(name='y', shape=[1], dtype='float32')
+ cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+ avg_cost = fluid.layers.mean(x=cost)
+ ```
+
+- ==**Add backward, regularization, and optimization**==
+ ```python
+ learning_rate = 0.01
+ sgd_optimizer = fluid.optimizer.SGD(learning_rate)
+ sgd_optimizer.minimize(avg_cost)
+ ```
+
+
+---
+
+### `Program` vs. computation graph
+
+- In scientific computing, a computation graph is a classic way to describe computation. The figure below shows how the full computation graph is built from the forward graph (blue) by adding backward (red) and optimizer-related (green) operations:
+
+- Fluid ==uses a `Program`, not a computation graph,== to describe the model and the optimization process. A `Program` consists of `Block`s, `Operator`s, and `Variable`s; these concepts are covered in detail later.
+- At compile time, Fluid takes the forward computation (for now, think of it as an ordered stream of computation) as a `Program`, and completes it by appending the necessary `Operator`s and `Variable`s in the order: forward -> backward -> gradient clip -> regularization -> optimization.
+
+
+
+---
+
+### Fluid at runtime
+
+
+
+- ==**Read in data**==
+
+ ```python
+ train_reader = paddle.batch(
+ paddle.reader.shuffle(paddle.dataset.uci_housing.train(), buf_size=500),
+ batch_size=20)
+ feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
+ ```
+- ==**Choose the device to run on**==
+ ```python
+ place = fluid.CPUPlace()
+ feeder = fluid.DataFeeder(place=place,feed_list=[x, y])
+ ```
+
+- ==Create an Executor, then run the startup `Program` and the training `Program`==
+
+ ```python
+ exe = fluid.Executor(place)
+ exe.run(fluid.default_startup_program())
+ PASS_NUM = 100
+ for pass_id in range(PASS_NUM):
+ for data in train_reader():
+ avg_loss_value, = exe.run(fluid.default_main_program(),
+ feed=feeder.feed(data),
+ fetch_list=[avg_cost])
+ print(avg_loss_value)
+ ```
+
+
+---
+
+### Summary: what does the framework do, and what does the user do?
+
+|Build the training program |Run the training program |
+|---|---|
+|User: describe the forward computation<br>Framework: add the backward computation<br>Framework: add the optimization computation<br>Framework: add memory optimization<br>Framework: add parallel / multi-device / distributed computation units |Framework: create Operators (computation) + Variables (data)<br>Framework: create `Block`s<br>Framework: memory management / device management<br>Framework: execute the computation |
+
+
+
+
+
+---
+
+### Summary: compile time
+
+**The user writes a Python program describing the model's forward computation**
+1. Create the variable descriptions (`VarDesc`)
+1. Create the operator descriptions (`OpDesc`)
+1. Create the operator attributes
+1. Infer variable types and shapes and run static checks: `inferShape`
+1. Plan memory reuse among variables
+1. Create the backward computation
+1. Add optimization-related Operators
+1. (Optional) add multi-GPU/multi-node Operators, producing a program that runs on multiple devices or machines
+
+
+
+---
+
+### Summary: runtime
+
+**Execute the planned computation**
+1. Create an `Executor`
+1. For the piece of computation about to run, create a `Scope` within the hierarchical `Scope` space
+1. Create the `Block`s and execute them one by one
+
+ Figure. An overview of compile time and runtime
+
+
+
+
+---
+
+## ==3==. How does the user describe computation?
+---
+
+### Fluid: define computation ==as if writing a program==
+
+
+- Sequential execution
+ ```python
+ x = fluid.layers.data(name='x',shape=[13], dtype='float32')
+ y_predict = fluid.layers.fc(input=x, size=1, act=None)
+ y = fluid.layers.data(name='y', shape=[1], dtype='float32')
+ cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+ ```
+
+- Conditional branches: [switch](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/execution/switch.md), [ifelse](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/execution/if_else_op.md)
+
+ ```python
+ a = fluid.Var(10)
+ b = fluid.Var(0)
+
+ switch = fluid.switch()
+ with switch.block():
+ with switch.case(fluid.less_equal(a, 10)):
+ fluid.print("Case 1")
+ with switch.case(fluid.larger(a, 0)):
+ fluid.print("Case 2")
+ with switch.default():
+ fluid.print("Case 3")
+ ```
+
+>[A Lisp cond form may be compared to a continued if-then-else as found in many algebraic programming languages](https://www.cs.cmu.edu/Groups/AI/html/cltl/clm/node84.html).
+
+
+
+---
+
+### Fluid: define computation ==as if writing a program==
+
+
+
+- Loops: [while](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_machine_translation.py#L105)
+
+  ```python
+  import paddle.fluid.layers as layers
+
+  i = layers.zeros(shape=[1], dtype='int64')  # the loop counter
+  d0 = layers.data("d0", shape=[10], dtype='float32')
+  data_array = layers.array_write(x=d0, i=i)
+  array_len = layers.fill_constant(shape=[1], dtype='int64', value=3)
+
+  cond = layers.less_than(x=i, y=array_len)
+  while_op = layers.While(cond=cond)
+  with while_op.block():
+      d = layers.array_read(array=data_array, i=i)
+      result = d  # stands in for the user's per-step computation on d
+      i = layers.increment(x=i, in_place=True)
+      layers.array_write(result, i=i, array=data_array)
+      layers.less_than(x=i, y=array_len, cond=cond)
+  ```
+
+- For a complete runnable example, see [->](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/unittests/test_while_op.py#L36-L44)
+- beam search [->]( https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_machine_translation.py#L105)
+
+
+
+---
+
+#### Summary
+
+1. The description syntax exposed to users is complete and self-consistent, able to describe complex computation processes
+1. Usage and core concepts map directly onto programming-language concepts, so intuition transfers
+1. It supports defining a problem and solving it step by step
+
+
+
+---
+
+## ==3.== Core concepts
+
+---
+### Compile-time concepts: ==describing variables and computation==
+
+
+
+- `VarDesc` + `TensorDesc` + `OpDesc` -> `BlockDesc` -> `ProgramDesc`
+ - https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/framework.proto
+
+- What is a Fluid Program?
+
+  - In Fluid, a neural-network task (training or inference) is described as a `Program`
+  - A `Program` contains descriptions of `Variable`s (the data) and `Operator`s (the operations on the data)
+  - `Variable`s and `Operator`s are organized into nestable `Block`s, which together form a complete Fluid `Program`
+
+>At the end of the compile phase, after the Transpiler's planning and transformations, a `protobuf`-serialized `ProgramDesc` is produced. It can be sent to multiple GPUs, or to other compute nodes on the network, for execution
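+
+As a small illustration, the sketch below (assuming the `paddle.fluid` Python API of this era; the exact operator list is indicative only) builds a tiny `Program` and inspects its blocks and operators:
+
+```python
+import paddle.fluid as fluid
+
+prog = fluid.Program()
+with fluid.program_guard(prog):
+    x = fluid.layers.data(name='x', shape=[4], dtype='float32')
+    y = fluid.layers.fc(input=x, size=2)
+
+block = prog.current_block()           # block 0, the root Block
+print len(prog.blocks)                 # how many Blocks this Program has
+print [op.type for op in block.ops]    # operator types, e.g. ['mul', 'elementwise_add']
+```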
+
+
+
+---
+
+### Compile-time concepts: ==**[Transpiler](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/motivation/fluid_compiler.md)**==
+
+1. Takes a `ProgramDesc` as input and produces a new `ProgramDesc`
+
+   - *Memory optimization transpiler*: inserts `FreeMemoryOps` into the original `ProgramDesc` so that memory is released before an iteration finishes, keeping the memory footprint small
+
+   - *Distributed training transpiler*: transforms the original `ProgramDesc` into its distributed counterpart, producing two new `ProgramDesc`s:
+      1. the `ProgramDesc` executed by the trainer processes
+      1. the `ProgramDesc` executed by the parameter servers
+
+1. ==**WIP**==: take a `ProgramDesc` and generate code that `gcc`, `nvcc`, `icc`, etc. can compile directly into an executable
+
+
+
+---
+### Transpiler
+
+
+
+
+
+---
+
+### Printing the `ProgramDesc`
+
+
+
+
+
+
+
+- `default_startup_program`: creates the learnable parameters and initializes them
+- `default_main_program`: the user-defined model, including the forward, backward, and optimization passes and all other necessary computation
+
+- Print a human-readable `Program`:
+ ```python
+ from paddle.v2.fluid import debuger
+ print debuger.pprint_program_codes(framework.default_main_program().desc)
+ ```
+
+
+---
+### Example output
+
+|variable in block 0 |variable in block 0 |
+|---|---|
+| (figure) | (figure) |
+
+
+
+
+
+---
+
+### Runtime concepts
+
+- Data related
+  - `Tensor` / `LoDTensor` / `Variable`
+  - `Scope`
+
+- Computation related
+  - `Block`
+  - `Kernel`, `OpWithKernel`, `OpWithoutKernel`
+
+|      |protobuf messages |C++ class objects |
+|---|---|---|
+|Data |[VarDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/framework.proto#L107) |[Variable](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/variable.h#L24) |
+|Operation |[OpDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/framework.proto#L35) |[Operator](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/operator.h#L64) |
+|Block |BlockDesc |Block |
+
+- Execution related: `Executor`
+
+
+
+---
+#### Tensor and LoD (Level-of-Detail) Tensor
+
+- A Tensor is a generalization of an $n$-dimensional array; a LoDTensor is a Tensor with sequence information attached
+- In Fluid, inputs, outputs, and the network's learnable parameters are all uniformly represented as LoDTensors (n-dimensional arrays)
+- One mini-batch of input data is one LoDTensor
+  - Thanks to the `LoDTensor` representation, RNNs in Fluid process variable-length sequences without padding
+  - LoD can be understood simply as `std::vector<std::vector<size_t>>`
+  - For non-sequence data, the LoD information is empty
+
+|      |TensorFlow |PaddlePaddle |
+|---|---|---|
+|RNN |Support |Support |
+|recursive RNN |Support |Support |
+|padding zeros |Must |No need |
+|blob data type |Tensor |LoDTensor |
+
+
+
+
+
+
+
+---
+#### An example of LoD information
+
+- the LoD information of figure (a)
+  ```cpp
+  [0, 5, 8, 10, 14]
+  ```
+- the LoD information of figure (b)
+  ```cpp
+  [[0, 5, 8, 10, 14] /*level=1*/, [0, 2, 3, 5, 7, 8, 10, 13, 14] /*level=2*/]
+  ```
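+
+As a hands-on illustration, the sketch below (assuming the `paddle.fluid.core` bindings of this era) builds the level-1 LoDTensor of figure (a): 14 rows of data split into 4 sequences of lengths 5, 3, 2, and 4:
+
+```python
+import numpy as np
+import paddle.fluid.core as core
+
+place = core.CPUPlace()
+tensor = core.LoDTensor()
+tensor.set(np.arange(14).astype("float32").reshape(14, 1), place)
+tensor.set_lod([[0, 5, 8, 10, 14]])   # offset-based LoD, as in figure (a)
+```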
+
+
+---
+#### The relationship among Tensor, Variable, and Scope
+
+1. A `Block` is an implementation-level concept that is not exposed to users at the application layer. Users currently cannot create or use `Block`s directly; the only concept they see is the `Program`.
+1. Logically, a `Block` is like a pair of braces in a programming language: it defines a scope in which a piece of code runs
+1. The `Executor` creates one `Scope` per `Block`; `Block`s can nest, so `Scope`s can nest too
+
+
+
+---
+### Executor
+
+|Interface |Description |
+|---|---|
+| (figure) |Inputs: 1. a `ProgramDesc` 2. a `Scope` 3. a `block_id`<br>Interpreted execution: 1. create all Variables 2. create and run the Operators one by one |
+
+
+
+
+---
+### Operator/OpWithKernel/Kernel
+
+- Operators are stateless; the core of an Operator is its ==Run== method
+- One operator can register multiple kernels
+- An operator may have no kernel at all: while_op, ifelse_op
+
+
+
+---
+#### Fluid Operator vs. PaddlePaddle layers
+
+|Layer |Operator |
+|---|---|
+| (figure) | (figure) |
+|1. keeps internal state<br>2. has forward and backward methods |1. stateless<br>2. only has a Run method |
+
+
+
+
+
+
+---
+
+### ==4.== Memory management
+
+---
+### Goals
+
+- Provide a unified memory allocation/deallocation interface for heterogeneous devices
+- Minimize the time needed to manage memory, and the management overhead
+- Reduce memory fragmentation
+- Decouple memory management completely from computation (Operators/Kernels)
+- Unified memory management is the foundation of memory optimization
+
+---
+
+
+
+### The Memory interface
+
+- The memory-management module exposes three basic interfaces to the upper layers:
+  ```cpp
+  template <typename Place>
+  void* Alloc(Place place, size_t size);
+
+  template <typename Place>
+  void Free(Place place, void* ptr);
+
+  template <typename Place>
+  size_t Used(Place place);
+
+  struct Usage : public boost::static_visitor<size_t> {
+    size_t operator()(const platform::CPUPlace& cpu) const;
+    size_t operator()(const platform::CUDAPlace& gpu) const;
+  };
+  ```
+- The template parameter `Place` indicates the device on which the allocation happens
+- Each supported `Place` specializes the template and implements the three interfaces
+
+
+
+---
+### Code structure
+
+The memory-management module consists of two parts:
+
+1. SystemAllocator: the interface that actually allocates and frees memory on the physical device
+1. BuddyAllocator: the memory-management algorithm
+
+
+
+---
+### System Allocator
+
+- SystemAllocator is the base class for physical-memory allocation and deallocation
+  - Allocation and deallocation on every device ultimately reduce to calls on this standard interface
+  - For each device, a MemoryAllocator derived from SystemAllocator is implemented
+
+ ```cpp
+ class SystemAllocator {
+ public:
+ virtual ~SystemAllocator() {}
+ virtual void* Alloc(size_t& index, size_t size) = 0;
+ virtual void Free(void* p, size_t size, size_t index) = 0;
+ virtual bool UseGpu() const = 0;
+ };
+ ```
+
+
+---
+
+### CPU/GPU Allocator
+
+
+
+```cpp
+class CPUAllocator : public SystemAllocator {
+ public:
+ virtual void* Alloc(size_t& index, size_t size);
+ virtual void Free(void* p, size_t size, size_t index);
+ virtual bool UseGpu() const;
+};
+
+#ifdef PADDLE_WITH_CUDA
+class GPUAllocator : public SystemAllocator {
+ public:
+ virtual void* Alloc(size_t& index, size_t size);
+ virtual void Free(void* p, size_t size, size_t index);
+ virtual bool UseGpu() const;
+ private:
+ size_t gpu_alloc_size_ = 0;
+ size_t fallback_alloc_size_ = 0;
+};
+#endif
+```
+- CPUAllocator and GPUAllocator derive from SystemAllocator and call the corresponding standard library functions to allocate and free physical memory.
+- Once a large, contiguous chunk of physical memory has been allocated, the memory-management algorithm takes over: allocating, freeing, and reusing memory block by block.
+
+
+
+---
+### CPU Allocator
+
+- CPU memory allocation offers two options:
+  1. non-pinned memory: pageable memory
+  2. pinned memory: page-locked memory
+  - Allocating too much page-locked memory can reduce the pageable memory available to the OS and hurt system performance; by default, CPU allocations are pageable
+
+- gflags control how much memory is allocated at once and whether page-locked memory is used:
+
+ ```cpp
+ DEFINE_bool(use_pinned_memory, true, "If set, allocate cpu pinned memory.");
+ DEFINE_double(fraction_of_cpu_memory_to_use, 1,
+ "Default use 100% of CPU memory for PaddlePaddle,"
+ "reserve the rest for page tables, etc");
+ ```
+
+
+
+---
+### GPU Allocator
+
+- GPU memory is allocated with cudaMalloc
+- GPUAllocator::Alloc first computes the available memory on the given GPU device
+  - If the requested size fits within the available memory, cudaMalloc is called to allocate it
+  - If the available memory is insufficient, the current behavior is to report an error and exit.
+- A gflag controls how much GPU memory is allocated at once:
+
+ ```cpp
+ DEFINE_double(fraction_of_gpu_memory_to_use, 0.92,
+ "Default use 92% of GPU memory for PaddlePaddle,"
+ "reserve the rest for page tables, etc");
+ ```
+
+
+
+---
+#### The memory-management algorithm: [Buddy Memory Allocation](https://en.wikipedia.org/wiki/Buddy_memory_allocation)
+
+- Memory arena: a large contiguous chunk of memory is allocated up front, and all further management (dynamic allocation, deallocation, reuse) happens within it.
+- Buddy memory allocation:
+  - Memory is divided into partitions whose sizes are powers of two, and requests are served with a best-fit strategy.
+  - On deallocation, the buddy block is checked: if the adjacent block is free too, the two are merged to minimize fragmentation.
+  - Allocated memory is aligned to natural physical-memory boundaries, which improves access efficiency.
+  - The algorithm is time-efficient, but because of the best-fit strategy it wastes some memory (a toy sketch of the buddy arithmetic follows)
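+
+As a toy illustration of that arithmetic (not Fluid's actual implementation): requests are rounded up to powers of two, and the buddy of a block is found by XOR-ing its offset with its size:
+
+```python
+def next_pow2(n):
+    """Round a request up to the next power of two, as buddy systems do."""
+    p = 1
+    while p < n:
+        p <<= 1
+    return p
+
+def buddy_of(offset, size):
+    """Offset of the buddy of the power-of-two block starting at `offset`."""
+    return offset ^ size
+
+print next_pow2(300)       # 512
+print buddy_of(0, 512)     # 512: blocks [0, 512) and [512, 1024) are buddies
+print buddy_of(512, 512)   # 0
+```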
+
+
+
+---
+
+### Buddy Allocator
+
+- The BuddyAllocator is a singleton; each device (e.g. GPU/CPU(0)/GPU(1)) owns one BuddyAllocator
+- A BuddyAllocator holds a SystemAllocator as a private member
+- When a request exceeds the free memory the BuddyAllocator manages, the SystemAllocator is called to allocate physical memory on the given device
+
+
+
+---
+### Example: the CPU implementation of the memory interface
+
+- Upper layers allocate, free, and query memory uniformly through the BuddyAllocator:
+ ```cpp
+ template <>
+ void* Alloc(platform::CPUPlace place, size_t size) {
+ VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
+ void* p = GetCPUBuddyAllocator()->Alloc(size);
+ VLOG(10) << " pointer=" << p;
+ return p;
+ }
+
+ template <>
+ void Free(platform::CPUPlace place, void* p) {
+ VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
+ GetCPUBuddyAllocator()->Free(p);
+ }
+
+ template <>
+ size_t Used(platform::CPUPlace place) {
+ return GetCPUBuddyAllocator()->Used();
+ }
+ ```
+
+
+---
+### ==5.== Multi-device support
+
+---
+### Multi-device support (1)
+
+- Step 1: add a Place type; implemented by the user and registered with the framework
+  - A Place can be understood as an integer plus an enum: device id + device type
+
+- DeviceContext
+  - Each Place has a corresponding DeviceContext that organizes and manages device-related state
+  - For example, the GpuDeviceContext manages the CUDA stream
+  - In the current implementation, some special libraries also have their own DeviceContext, e.g.:
+    ```cpp
+    class MKLDNNDeviceContext : public CPUDeviceContext {……}
+    ```
+  - What each device's DeviceContext needs to manage differs; implement it according to the concrete requirements
+
+
+
+---
+
+### Multi-device support (2)
+
+- Step 2: add a KernelType and register Kernel objects for it; implemented by the user and registered with the framework. Kernels for the same operator can be distinguished by:
+  1. Place: the device to execute on
+  1. DataType: the data type to compute with, FP32/FP64/INT32/INT64
+  1. Memory layout: how the Tensor is laid out in memory at runtime, NCHW or NHWC
+  1. the library being used
+
+  ```cpp
+  struct OpKernelType {
+    proto::DataType data_type_;
+    DataLayout data_layout_;
+    platform::Place place_;
+    LibraryType library_type_;
+  }
+  ```
+
+
+
+---
+
+### Multi-device support (3)
+
+Step 3: KernelType inference and Kernel switching at runtime; modify the inference and switching rules as needed
+- Expected Kernel: the Kernel we would like to call, determined by (1) the `Place` and the computation precision, or (2) a library the user explicitly requests in the configuration, such as `cudnn` or `mkldnn`
+- Actual Kernel: the `KernelType` actually required, inferred at runtime from the `Operator`'s inputs (`Variable`s)
+- When the Expected Kernel and the Actual Kernel disagree, the framework inserts `data_transformer` or `data_layout_transform` steps so the Expected Kernel can run, including:
+  - CPUPlace -> GPUPlace: cross-device memory copy
+  - NCHW -> nChw8c: layout transformation
+  - FP32 -> FP16: precision conversion _**not supported yet**_
+  - ……
+- This logic is implemented in the Run method of the OperatorWithKernel class [->](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/operator.cc#L497)
+
+
+
+---
+## ==6.== while_op
+
+---
+### while_op
+
+- Executes a `Program` in a loop until the condition operator decides the loop condition no longer holds
+- What makes while_op special:
+  1. while_op has no kernel
+  1. while_op owns its own `Block`, which forms a nested `Block`
+  1. ==while_op creates an Executor internally to execute its `Block` in a loop==
+
+- while_op input/output: LoDTensorArray
+  ```cpp
+  namespace paddle {
+  namespace framework {
+  using LoDTensorArray = std::vector<LoDTensor>;
+  }
+  }
+  ```
+  - On every iteration, a slice is "cut out" of the original input
+  - LoDTensorArray is exposed on the Python side; it is one of Fluid's basic data structures, and users can create and use it directly, as the sketch below shows
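+
+A minimal sketch (assuming the `paddle.fluid` layers API of this era) that writes into and reads back from a tensor array:
+
+```python
+import paddle.fluid as fluid
+
+arr = fluid.layers.create_array(dtype='float32')   # a LoDTensorArray variable
+i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=0)
+x = fluid.layers.fill_constant(shape=[2, 3], dtype='float32', value=1.0)
+fluid.layers.array_write(x, i=i, array=arr)        # arr[0] = x
+y = fluid.layers.array_read(arr, i=i)              # read it back
+```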
+
+
+
+---
+### An overview of while_op's [Run](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/while_op.cc#L42) method
+
+```cpp
+
+void Run(const framework::Scope &scope,
+         const platform::Place &dev_place) const override {
+  PADDLE_ENFORCE_NOT_NULL(scope.FindVar(Input(kCondition)));
+  auto &cond = scope.FindVar(Input(kCondition))->Get<LoDTensor>();
+  PADDLE_ENFORCE_EQ(cond.dims(), paddle::framework::make_ddim({1}));
+
+  framework::Executor executor(dev_place);
+  auto *block = Attr<framework::BlockDesc *>(kStepBlock);
+
+  auto *program = block->Program();
+  auto step_scopes =
+      scope.FindVar(Output(kStepScopes))->GetMutable<StepScopeVar>();
+
+  while (cond.data<bool>()[0]) {
+    auto &current_scope = scope.NewScope();
+    step_scopes->push_back(&current_scope);
+    executor.Run(*program, &current_scope, block->ID(),
+                 false /*create_local_scope*/);
+  }
+}
+
+```
+
+
+
+---
+### An important application of while_op: Dynamic RNN
+
+---
+
+### What is `dynamicRNN`?
+
+1. The user defines the computation of a single time step; the framework takes the sequence input and repeatedly applies the user-defined single-step computation over it
+1. Learnable parameters are shared across time steps
+1. `dynamicRNN` is implemented with `while_op`
+1. If a `memory` is defined inside the `dynamicRNN`, it forms a recurrent neural network; otherwise it simply applies the predefined single-step computation over the input sequence
+
+
+
+---
+
+#### The `dynamic RNN` user interface
+
+- The key elements of a `dynamicRNN` (a sketch of the Python interface follows this list):
+  1. **step input**: the input of each time step of the `dynamicRNN`
+  1. **step function**: the user-defined single-step computation
+  1. **memory**: used to form the recurrent connection
+  1. **external/static memory**: an external input that every step of the computation can read in full
+
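+A minimal sketch (assuming the `fluid.layers.DynamicRNN` API of this era; the sizes and names are illustrative):
+
+```python
+import paddle.fluid as fluid
+
+sentence = fluid.layers.data(name='word', shape=[1], dtype='int64', lod_level=1)
+emb = fluid.layers.embedding(input=sentence, size=[65536, 32])
+
+drnn = fluid.layers.DynamicRNN()
+with drnn.block():
+    word = drnn.step_input(emb)                 # step input
+    prev = drnn.memory(shape=[200])             # memory, zero-initialized
+    hidden = fluid.layers.fc(input=[word, prev], size=200, act='relu')
+    drnn.update_memory(prev, hidden)            # form the recurrent connection
+    drnn.output(hidden)                         # step output
+
+last = fluid.layers.sequence_last_step(drnn())
+```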
+
+
+---
+
+#### Memory in dynamicRNN
+
+The behavior of `memory` in a `dynamicRNN` is very similar to that of a reference variable in C++:
+  - `memory` "points to" the output variable of some operator; call it A
+  - `memory` can be initialized from a LoDTensor (non-sequence when the LoD information is empty, sequence otherwise); by default, a `memory` is zero-initialized
+  - After operator A's forward computation, `memory`'s forward pass "points to" A's output LoDTensor
+  - `memory`'s output can be the input of another operator, which closes the "recurrent" connection
+
+
+
+---
+
+### DynamicRNN implementation details
+
+- `while_op` cannot form a dynamicRNN on its own; it must cooperate with a set of related operators and data structures
+  - dependent operators (only the most important are listed here):
+    - the `lod_rank_table` operator
+    - the `lod_tensor_to_array` operator
+    - the `array_to_lod_tensor` operator
+    - the `shrink_memory` operator
+  - dependent data structures
+    - `TensorArray`
+    - `LoDRankTable`
+
+- In Fluid, RNNs take variable-length sequence input without padding; the data structures and operators above work together to batch the computation over variable-length input
+
+
+
+---
+
+### How does `dynamicRNN` batch its computation?
+
+- The problem:
+  - An RNN can be viewed as an unrolled feed-forward network whose depth is the length of the longest sequence
+  - If the variable-length sequences are not padded to equal length, the inputs of a mini-batch have different lengths and each sample unrolls to a different depth, which makes the forward and backward computation hard to implement
+
+
+
+----
+##### Example: an RNN encoder-decoder with attention
+
+- Take the machine-translation RNN encoder-decoder model as the example (it exercises every design element of `dynamicRNN`); the figure below shows the raw batch input of the RNN encoder-decoder:
+
+ Figure. The raw batch input data of the RNN encoder-decoder
+
+- the source word sequences are the input of the encoder RNN, one LoDTensor
+- the target word sequences are the input of the lookup-table operator, one LoDTensor
+- each rectangle in the figure is a contiguous piece of CPU/GPU memory representing one dense vector
+
+
+
+---
+
+### How does `dynamicRNN` batch its computation?
+
+1. Sort the unequal-length samples of a mini-batch; the longest sample becomes the first in the batch and the shortest the last
+   - `LoDTensor` -> `LoDRankTable` :heavy_plus_sign: the `lod_rank_table` operator
+     - `LoDRankTable` can be understood as sorting the sequences in a LoDTensor by length; it stores the sorted indices
+
+2. Build the batch input of each time step; as the time step grows, the batch input of a step may shrink
+   - `TensorArray` :heavy_plus_sign: `lod_tensor_to_array` -> `LoDTensor` (without LoD)
+3. Each time step writes its output into an output `LoDTensorArray`
+4. When the `dynamicRNN` loop finishes, the output `LoDTensorArray` is reordered according to the `LoDRankTable`, restoring the original input order (the Python sketch after this list shows the corresponding layer calls)
+   - `TensorArray` :heavy_plus_sign: `array_to_lod_tensor` -> `LoDTensor`
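+
+A minimal sketch of the rank-table round trip (assuming the `paddle.fluid` layers of this era; `x` is illustrative):
+
+```python
+import paddle.fluid as fluid
+
+x = fluid.layers.data(name='x', shape=[8], dtype='float32', lod_level=1)
+
+table = fluid.layers.lod_rank_table(x, level=0)     # sort the sequences by length
+arr = fluid.layers.lod_tensor_to_array(x, table)    # split into per-time-step batches
+# ... run the per-step computation over `arr` inside a While loop ...
+out = fluid.layers.array_to_lod_tensor(arr, table)  # restore the original order
+```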
+
+
+
+---
+
+### Running example
+
+
+
+
+
+---
+### Running example
+
+
+
+
+
+
+
+- When execution reaches the 5th~7th batches, the batch size shrinks
+
+
+
+---
+### Running example
+
+
+
+
+
+
+
+- What happens to the RNN's `memory` at batches 5~7?
+  - `memory` points to some operator's output Tensor and "fetches back" the result after that operator's forward computation
+  - At batches 5~7 some sequences end, and ==the next time step no longer needs to unroll over the finished sequences==
+  - In `dynamicRNN`, the `shrink_memory` operator shrinks the `memory`'s batch input accordingly
+
+
+
+---
+### Running example: batches 1~2
+
+
+
+ Figure. The dynamicRNN batch input for batches 1 and 2
+
+
+---
+### Running example: batches 3~4
+
+
+
+ Figure. The dynamicRNN batch input for batches 3 and 4
+
+
+---
+
+### Running example: batches 5~7
+
+
+
+ Figure. The dynamicRNN batch input for batches 5, 6, and 7
+
+
+---
+### ==7.== The Fluid code structure
+
+---
+### The Fluid code structure
+
+|Code structure |Module structure |
+|---|---|
+| (figure) | (figure) |
+
+
+
+
+
+---
+
+### ==8.== Documentation summary
+
+---
+
+
+- Design overview
+  - Refactoring overview [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/refactorization.md)
+  - fluid [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/fluid.md)
+  - fluid_compiler [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/motivation/fluid_compiler.md)
+- Core concepts
+  - variable description [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/var_desc.md)
+  - Tensor [->](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/tensor.md)
+  - LoDTensor [->](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor.md)
+  - TensorArray [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/tensor_array.md)
+  - Program [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/program.md)
+  - Block [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/block.md)
+  - Scope [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/scope.md)
+
+---
+
+- Important functional modules
+  - backward [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/backward.md)
+  - memory optimization [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/memory_optimization.md)
+  - evaluator [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/executor.md)
+  - python API [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md)
+  - regularization [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/regularization.md)
+
+- Development guides
+  - supporting a new hardware device/library [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/support_new_device.md)
+  - adding a new Operator [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/new_op_cn.md)
+  - adding a new Kernel [->](
+https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/new_op_kernel_en.md)
+
+
+
+---
+
+### ==9.== Development guide
+
+---
+
+#### Recommended development environment: build and test with Docker
+
+Building the PaddlePaddle source with Docker: [->](http://www.paddlepaddle.org/docs/develop/documentation/fluid/zh/build_and_install/docker_install_cn.html)
+
+PaddlePaddle on Dockerhub: [->](
+  https://hub.docker.com/r/paddlepaddle/paddle/tags/)
+
+1. Pull the PaddlePaddle Docker image
+  ```bash
+  docker pull paddlepaddle/paddle:latest-dev
+  ```
+
+1. Start a docker container
+
+  ```bash
+  docker run -it -v $PWD/Paddle:/paddle paddlepaddle/paddle:latest-dev /bin/bash
+  ```
+
+1. Inside the container, build from source; see the documentation [->]( http://www.paddlepaddle.org/docs/develop/documentation/fluid/zh/build_and_install/build_from_source_cn.html)
+
+
+
+---
+
+### A few notes
+
+1. To keep it small, the PaddlePaddle Docker image ships without vim; run `apt-get install -y vim` inside the container to install it.
+1. For development, use the image tagged `latest-dev`, which bundles all build dependencies. `latest` and `latest-gpu` are production images, mainly for running PaddlePaddle programs.
+2. To run GPU programs inside Docker, use nvidia-docker; [otherwise you must mount the CUDA libraries and devices into the container yourself](http://www.paddlepaddle.org/docs/develop/documentation/fluid/zh/build_and_install/docker_install_cn.html).
+
+
+ ```bash
+ nvidia-docker run -it -v $PWD/Paddle:/paddle paddlepaddle/paddle:latest-dev /bin/bash
+ ```
+
+
+
+
+
+---
+
+### [How to contribute](http://www.paddlepaddle.org/docs/develop/documentation/fluid/zh/dev/contribute_to_paddle_cn.html)
+
+- ==Before opening a pull request, be sure to read==: [->](http://www.paddlepaddle.org/docs/develop/documentation/fluid/zh/dev/contribute_to_paddle_cn.html)
+- Code requirements
+  1. Code comments follow the Doxygen style
+  1. Make sure the WITH_STYLE_CHECK compiler option is on and the build passes the code-style check
+  1. All code must have unit tests, and all unit tests must pass
+- Use the `pre-commit` hook when submitting pull requests
+  1. It helps format the source code (C++, Python)
+  1. It automatically checks some basics before each commit: a single EOL per file, no large files added to Git, etc.
+  1. Install pre-commit and run it in the PaddlePaddle root directory:
+  ```bash
+  ➜  pip install pre-commit
+  ➜  pre-commit install
+  ```
+
+
+---
+
+### How to contribute
+
+1. Create an issue before starting development.
+   - It lets others know the work is in progress and avoids duplicated effort on the same feature.
+1. Every PR must be linked to a related issue; see [->](https://help.github.com/articles/closing-issues-using-keywords/) for how.
+   - Purpose: keep a record in the merged history of what feature the PR develops or what problem it solves.
+   - When the PR is merged, the linked issue is closed automatically.
+1. During PR review, every reviewer comment must be answered.
+   - If the requested change has been made, replying "Done" is enough.
+   - Purpose: review comments may be (1) questions, (2) issues to be fixed in a follow-up PR, or (3) suggestions you consider unreasonable. Explicit replies leave a trail for the reviewer and everyone else, making it clear whether something was fixed, deferred to the next PR, or reasonably declined.
+
+
+
+---
+
+### ==10.== Adding a new Operator
+
+---
+
+### Concepts
+
+Adding a new operator involves deriving from the following C++ classes:
+
+1. `framework::OperatorBase`: the base class of Operators (Ops).
+1. `framework::OpKernel`: the base class of an Op's computation function, called a Kernel.
+1. `framework::OperatorWithKernel`: derived from OperatorBase; an Op with a computation function is said to have a Kernel.
+1. `class OpProtoAndCheckerMaker`: describes the Op's inputs, outputs, attributes, and comments; mainly used for generating the Python API.
+
+By whether they contain a kernel, Ops fall into two kinds:
+1. Ops with a Kernel, derived from OperatorWithKernel; ==the vast majority of operators are of this kind==
+1. Ops without a kernel, derived from OperatorBase; only a few Ops, such as while_op and ifelse_op
+
+This section focuses on how to write an Op with a Kernel.
+
+
+
+---
+
+#### Which files must be added or modified for a new Operator?
+
+|Content |Where it is defined |
+|---|---|
+|The OpProtoMaker definition |the `.cc` file; a backward Op needs no OpProtoMaker |
+|The Op definition |the `.cc` file |
+|The Kernel implementation |a Kernel shared by CPU and CUDA lives in the `.h` file; otherwise the CPU Kernel goes in the `.cc` file and the CUDA Kernel in the `.cu` file |
+|Op registration |Op registration goes in the `.cc` file; CPU Kernel registration in the `.cc` file, CUDA Kernel registration in the `.cu` file |
+
+- Before adding an Operator, please read the [Operator naming convention](https://github.com/PaddlePaddle/Paddle/blob/63cca04cfd488a4dab6d6273fd04a8017ef45932/doc/fluid/dev/name_convention.md) and the [Operator Markdown comment convention](https://github.com/PaddlePaddle/Paddle/blob/63cca04cfd488a4dab6d6273fd04a8017ef45932/doc/fluid/dev/op_markdown_format.md).
+- New ops are all added under [paddle/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/operators); file names end with `*_op.h` (if any), `*_op.cc`, and `*_op.cu` (if any).
+- The op and its Python binding are built automatically from the file names; follow the naming rules above, or you will also have to modify the PyBind-related files and CMakeLists.txt.
+
+
+---
+
+###### Implementing an Op with a Kernel, step 1: define the ProtoMaker class
+
+The sections below all use [clip_op](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/clip_op.h) as the running example
+
+- clip_op computes: $Out = \min(\max(X, min), max)$
+- First, define a `ProtoMaker` that describes the Op's inputs and outputs and adds the comments (*the comments in the snippet below are simplified; a real implementation must follow the comment convention*):
+
+  ```cpp
+  template <typename AttrType>
+  class ClipOpMaker : public framework::OpProtoAndCheckerMaker {
+   public:
+    ClipOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+        : OpProtoAndCheckerMaker(proto, op_checker) {
+      AddInput("X","(Tensor)The input of clip op.");
+      AddOutput("Out", "(Tensor),The output of clip op.");
+      AddAttr<AttrType>(
+          "min", "(float),Minimum value.");
+      AddAttr<AttrType>(
+          "max", "(float),Maximum value.");
+      AddComment(R"DOC(
+……
+)DOC");
+    }
+  };
+  ```
+
+
+
+---
+
+###### Implementing an Op with a Kernel, step 2: define the Operator class
+
+The snippet below defines `clip_op`:
+
+```cpp
+class ClipOp : public framework::OperatorWithKernel {
+ public:
+ using framework::OperatorWithKernel::OperatorWithKernel;
+
+ void InferShape(framework::InferShapeContext* ctx) const override {
+ PADDLE_ENFORCE(ctx->HasInput("X"),
+ "Input(X) of ClipOp should not be null.");
+ PADDLE_ENFORCE(ctx->HasOutput("Out"),
+ "Output(Out) of ClipOp should not be null.");
+ auto x_dims = ctx->GetInputDim("X");
+    auto max = ctx->Attrs().Get<float>("max");
+    auto min = ctx->Attrs().Get<float>("min");
+ PADDLE_ENFORCE_LT(min, max, "max should be greater than min.");
+ ctx->SetOutputDim("Out", x_dims);
+ ctx->ShareLoD("X", /*->*/ "Out");
+ }
+};
+```
+
+
+---
+
+### What the Operator class must do
+
+1. clip_op derives from `OperatorWithKernel`, and
+
+   ```cpp
+   using framework::OperatorWithKernel::OperatorWithKernel;
+   ```
+   means it reuses the constructor of the base class `OperatorWithKernel`.
+
+1. It overrides the `InferShape` interface.
+   - `InferShape` is a const function and must not modify the Op's member variables
+   - `InferShape`'s parameter is `const framework::InferShapeContext &ctx`, from which the inputs, outputs, and attributes can be obtained
+   - `InferShape` is called twice: at compile time (when the op is created) and at runtime (when the op's `Run` method is called). It must:
+     1. check early and fail fast: validate that the input dimensions, types, etc. are legal
+     2. set the shapes of the output Tensors
+
+The `OpProtoMaker` and `Op` class definitions are usually written in the `.cc` file.
+
+
+
+---
+
+### Additional notes
+
+1. `InferShape` currently supports two implementation styles; both end up registering a functor into the OpInfo struct.
+   1. derive from framework::InferShapeBase and implement it as a functor (see [mul_op](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/mul_op.cc#L22))
+   2. override the InferShape function (see [clip_op](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/clip_op.cc#L24))
+
+1. What is a `functor`?
+
+   - A class or struct that only overloads `operator()`; usually a piece of computation that several kernels can reuse.
+
+   ```cpp
+   template <typename T>
+   class CrossEntropyFunctor<platform::CPUDeviceContext, T> {
+    public:
+     void operator()(const platform::CPUDeviceContext& ctx,
+                     framework::Tensor* out,
+                     const framework::Tensor* prob,
+                     const framework::Tensor* labels, const bool softLabel) {
+       ……
+     }
+   };
+   ```
+
+   - clip_op also shows a piece of computation abstracted into a functor: [->](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/clip_op.h#L27).
+
+
+
+---
+
+###### Implementing an Op with a Kernel, step 3: define the OpKernel class
+
+- `ClipKernel` derives from `framework::OpKernel<T>` and carries two template parameters:
+  1. `typename DeviceContext`: the device type; add this parameter when different devices share one Kernel, and provide per-device specializations when they do not.
+  1. `typename T`: the supported data types, such as `float` or `double`
+
+- Override the `Compute` method in the `ClipKernel` class
+  1. `Compute` takes a single input parameter: `const framework::ExecutionContext& context`
+     - `ExecutionContext` gathers the Op's runtime input and output `Variable`s from the `Scope`, so that `Compute` can fetch the inputs and outputs it needs simply by name
+     - Compared with `InferShapeContext`, `ExecutionContext` adds the device type
+  1. Implement the Kernel's actual computation logic inside `Compute`
+
+
+
+---
+#### An overview of ClipKernel
+
+```cpp
+template <typename DeviceContext, typename T>
+class ClipKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto max = context.Attr<T>("max");
+    auto min = context.Attr<T>("min");
+    auto* x = context.Input<Tensor>("X");
+    auto* out = context.Output<Tensor>("Out");
+    T* out_data = out->mutable_data<T>(context.GetPlace());
+    const T* x_data = x->data<T>();
+    int64_t numel = x->numel();
+    Transform<DeviceContext> trans;
+    trans(context.template device_context<DeviceContext>(), x_data,
+          x_data + numel, out_data, ClipFunctor<T>(min, max));
+  }
+};
+```
+
+- To make writing `OpKernel` computations simpler, and to let CPU and CUDA share code, Fluid uses Eigen as its basic matrix library
+- Fluid provides some basic wrappers over Eigen's unsupported Tensor module, which can be called directly inside `Compute`
+  - For how to use the Eigen library within PaddlePaddle, see the [usage document](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/use_eigen_cn.md).
+
+
+
+---
+###### Implementing an Op with a Kernel, step 4: implement the backward Op
+
+- ==**A backward Op has no `ProtoMaker`**==; apart from that it is defined and implemented exactly like a forward Op, so we do not repeat the details
+- Only the backward Op's inputs and outputs deserve comment:
+  1. the backward Op's inputs
+     - the outputs of the forward Op
+     - the gradients passed to the current Op during back-propagation
+     - Note that Fluid does not distinguish cost Ops from intermediate-layer Ops: every Op must handle the gradients it receives correctly
+  2. the backward Op's outputs
+     - the derivatives with respect to the learnable parameters
+     - the derivatives with respect to all inputs
+
+
+
+---
+
+###### Implementing an Op with a Kernel, step 5: register the Op and its Kernels
+
+With the Op and its kernels implemented, the next step is to register them in the `.cc` and `.cu` files
+
+1. In the `.cc` file, register the forward and backward Op classes and the CPU Kernels.
+
+   ```cpp
+   namespace ops = paddle::operators;
+   REGISTER_OP(clip, ops::ClipOp, ops::ClipOpMaker<float>, clip_grad,
+               ops::ClipOpGrad);
+   REGISTER_OP_CPU_KERNEL(
+       clip, ops::ClipKernel<paddle::platform::CPUDeviceContext, float>);
+   REGISTER_OP_CPU_KERNEL(
+       clip_grad, ops::ClipGradKernel<paddle::platform::CPUDeviceContext, float>);
+   ```
+
+   - In the snippet above:
+
+     1. `REGISTER_OP`: registers the `ops::ClipOp` class under the type name `clip` with `ops::ClipOpMaker` as its `ProtoMaker`, and registers `ops::ClipOpGrad` under the type name `clip_grad`
+     1. `REGISTER_OP_WITHOUT_GRADIENT`: registers an Op with no backward pass, e.g. optimizer-related Ops
+     1. `REGISTER_OP_CPU_KERNEL`: registers the `ops::ClipKernel` class specialized for `paddle::platform::CPUPlace` and the `float` type; `ops::ClipGradKernel` is registered the same way
+
+1. Register the GPU Kernels the same way in the `.cu` file
+   - If the CUDA Kernel is implemented with Eigen, the macro `#define EIGEN_USE_GPU` must be added at the top of the `.cu` file
+
+
+
+---
+
+##### Building and the Python binding
+
+- The following command builds only the newly added Op:
+
+  ```
+  make mul_op
+  ```
+  - Note that running the unit tests requires building the whole project
+
+- If the file-naming rules above are followed, the build automatically generates the Python binding for the new op and links it into the produced library
+
+
+
+---
+
+###### Implementing an Op with a Kernel, step 6: add forward unit tests and gradient checks
+
+- Unit tests for new Ops all go in the [python/paddle/v2/fluid/tests/unittests](https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/fluid/tests/unittests) directory
+- Forward-Operator unit tests
+
+  1. Op unit tests derive from `OpTest`; the concrete tests live in `TestClipOp`, and every test case is named `TestXX`
+  1. To unit-test an Operator, you need to:
+     1. define the inputs, outputs, and relevant attribute parameters in `setUp`
+     1. generate random input data
+     1. reimplement the forward operator's computation in the Python script to obtain the reference output, and compare it with the operator's forward output
+  1. The backward gradient-check machinery is already implemented by the test framework; just call the `check_grad` interface
+
+- For the full `clip_op` unit test, see [->](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/unittests/test_clip_op.py); a condensed sketch follows
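+
+A condensed sketch of such a test (assuming the `OpTest` base class from the unittests directory; the shapes and attribute values are illustrative):
+
+```python
+import numpy as np
+from op_test import OpTest
+
+class TestClipOp(OpTest):
+    def setUp(self):
+        self.op_type = "clip"
+        x = np.random.random((4, 5)).astype("float32")
+        self.inputs = {'X': x}
+        self.attrs = {'min': 0.3, 'max': 0.7}
+        # reference output computed in Python
+        self.outputs = {'Out': np.clip(x, 0.3, 0.7)}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+```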
+
+
+
+---
+#### Building and running the unit tests
+
+- `test_*.py` unit tests newly added under the `python/paddle/v2/framework/tests` directory are automatically added to the build
+
+  - Running the unit tests requires building the whole project, with `WITH_TESTING` turned on, i.e. `cmake paddle_dir -DWITH_TESTING=ON`
+- After a successful build, run the unit tests with:
+
+ ```bash
+ make test ARGS="-R test_mul_op -V"
+ ```
+
+  or:
+
+ ```
+ ctest -R test_mul_op
+ ```
+
+
+---
+
+### Caveats when adding an Op
+
+- Create a separate `*_op.h` (if any), `*_op.cc`, and `*_op.cu` (if any) for every Op. Putting multiple Ops in one file causes build errors.
+- The type name used when registering an Op must be identical to the Op's name. Registering `REGISTER_OP(B, ...)` inside `A_op.cc` is not allowed; it breaks the unit tests.
+- If an Op has no CUDA Kernel, do not create an empty `*_op.cu`; that breaks the unit tests.
+- If several Ops depend on some shared helper functions, put those in a file that does not match the `*_op.*` pattern, e.g. `gather.h`.
+
+
+
+---
+
+### ==10.== Usage notes and FAQ
+
+---
+
+### Defining the forward computation
+
+- When the Python side executes:
+  ```python
+  import paddle.v2.fluid as fluid
+  ```
+  [`framework.py`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/framework.py#L1040) defines two global `Program`s:
+  ```python
+  # program is a global instance.
+  _main_program_ = Program()
+  _startup_program_ = Program()
+  ```
+
+- Defining the forward pass is the process of continually adding Ops and Variables to the `main_program`
+- To execute a new `main_program`, call:
+ ```python
+ def switch_main_program(program):
+ """
+ Switch the main program to a new program.
+      This function returns the previous main program.
+ """
+ ……
+ ```
+
+
+---
+
+### Customizing parameter initialization
+
+- Call the `fluid.ParamAttr(……)` interface to customize how parameters are initialized
+
+ ```python
+ w_param_attrs = ParamAttr(name=None,
+ initializer=UniformInitializer(low=-1.0, high=1.0, seed=0),
+ learning_rate=1.0,
+ regularizer=L1Decay(1.0),
+ trainable=True,
+ clip=GradientClipByValue(-1.0, 1.0),
+ )
+ y_predict = fluid.layers.fc(input=x, size=1, param_attr=w_param_attrs)
+ ```
+
+- A related question: how do I create a `Variable`?
+ ```python
+ cur_program = Program()
+ cur_block = cur_program.current_block()
+ new_var = cur_block.create_var(name="X", shape=[-1, 16, 16], dtype="float32")
+ ```
+
+
+
+---
+
+### Adding backward Ops
+
+- Call `fluid.backward.append_backward(X)` (where `X` is a Variable) to append backward Ops to a forward `ProgramDesc`
+
+ ```python
+ data = fluid.layers.data(name="data", shape=(2,3,4))
+ out = fluid.layers.fc(input=data,size=128,act=None)
+ loss = fluid.layers.reduce_sum(out)
+ fluid.backward.append_backward(loss=loss)
+ ```
+
+- Add the optimization-related Ops
+ ```python
+ sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
+ sgd_optimizer.minimize(loss)
+ ```
+
+- You can call `print(fluid.default_main_program())` at any time to print the current `main_program`
+
+- Once the whole `Program` has been built, call the following interface to run memory optimization:
+  ```python
+  fluid.memory_optimize(fluid.default_main_program())
+  ```
+  - _Note: memory optimization is still under active development and may not be entirely stable._
+
+
+
+---
+
+### Summary: the compile-time execution flow
+
+- The user defines the forward computation
+- Backward Ops are added to the `default_main_program`
+- Gradient-clipping Ops are added to the `default_main_program`
+- Regularization Ops are added to the `default_main_program`
+- For the chosen optimization algorithm, the optimizer's state variables are added to the `default_startup_program`
+  - state variables are things such as the learning rate and the first/second-order momentum history
+- Variable-initialization Ops are added to the `default_startup_program`
+- An Op that sets the gradient received by the network's last op is added to the `default_main_program`
+- Memory-optimization planning is performed
+
+
+
+---
+
+### Feeding data (1): via a feed dict
+
+- When calling the executor's run method, pass a feed dict; the feed op then places the given data into the `x` and `y` Variables
+  ```python
+  y_data = np.random.randint(0, 8, [1]).astype("int32")
+  y_tensor = core.Tensor()
+  y_tensor.set(y_data, place)
+
+  x_data = np.random.uniform(0.1, 1, [11, 8]).astype("float32")
+  x_tensor = core.Tensor()
+  x_tensor.set(x_data, place)
+  ……
+  cost = exe.run(
+      fluid.default_main_program(),
+      feed={'x': x_tensor,
+            'y': y_tensor},
+      fetch_list=[avg_cost])
+  ```
+
+- This approach is fairly low-level and is mostly used in unit tests
+
+
+
+---
+
+### Feeding data (2): via the DataFeeder interface
+
+- Write a data_reader function; a data_reader is a Python generator creator
+
+  ```python
+  def demo_reader():
+      def random_generator():
+          yield np.random.uniform(0.1, 1, [4]), np.random.randint(0, 1, [1])
+      return random_generator
+  ```
+- Use the DataFeeder interface in the training loop (`x`, `y`, and `avg_cost` come from the forward definition)
+  ```python
+  train_reader = paddle.batch(
+      paddle.reader.shuffle(demo_reader(), buf_size=500), batch_size=4)
+  feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
+  for data in train_reader():
+      cost = exe.run(
+          fluid.default_main_program(),
+          feed=feeder.feed(data),
+          fetch_list=[avg_cost])
+  ```
+
+
+
+---
+
+### Frequently asked questions
+
+- How do I use an evaluator? [->](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_label_semantic_roles.py#L168)
+
+  ```python
+  accuracy = fluid.evaluator.Accuracy(input=predict, label=label)
+  for pass_id in range(PASS_NUM):
+      accuracy.reset()
+      for data in train_reader():
+          loss, acc = exe.run(fluid.default_main_program(),
+                              feed=feeder.feed(data),
+                              fetch_list=[avg_cost] + accuracy.metrics)
+          pass_acc = accuracy.eval(exe)
+          # acc: the accuracy of the current batch
+          # pass_acc: the accumulated accuracy over the batches seen so far
+      pass_total_acc = accuracy.eval(exe)  # the accuracy over the whole pass
+  ```
+
+- How do I run tests during training? [->](https://github.com/dzhwinter/benchmark/blob/master/fluid/vgg16.py#L144)
+- How do I save a trained model? [->](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_recognize_digits.py#L143)
+- How do I load a trained model for inference? [->](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_recognize_digits.py#L154)
+- How do I define several Programs in one training job and run them alternately? [->](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/demo/fc_gan.py)
+- How do I profile? Fluid has a built-in profiler you can call directly; see the example [->](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/unittests/test_profiler.py)
+
+
+
+
+---
diff --git a/doc/fluid/getstarted/index_cn.rst b/doc/fluid/getstarted/index_cn.rst
index 75af7354be93a6eeabfa9ccf86903505402a7ca6..3daea71d0933a2774227ff2b5e744392ca6b1765 100644
--- a/doc/fluid/getstarted/index_cn.rst
+++ b/doc/fluid/getstarted/index_cn.rst
@@ -17,3 +17,4 @@
:maxdepth: 1
concepts/use_concepts_cn.rst
+ developer's_guide_to_paddle_fluid.md
diff --git a/doc/fluid/getstarted/index_en.rst b/doc/fluid/getstarted/index_en.rst
index 75a43f4af87c34830ec940068196e6ca72640501..fb20bb4f245281c3acf67c417979dc63c144fef3 100644
--- a/doc/fluid/getstarted/index_en.rst
+++ b/doc/fluid/getstarted/index_en.rst
@@ -16,3 +16,4 @@ Here is an example of linear regression. It introduces workflow of PaddlePaddle,
:maxdepth: 1
concepts/index_en.rst
+ developer's_guide_to_paddle_fluid.md
diff --git a/doc/fluid/getstarted/quickstart_cn.rst b/doc/fluid/getstarted/quickstart_cn.rst
index 135beb75d0330f39d062753aa2aa83a077f36bb1..6a964d4f8561f30aa10936d2399698c51583442c 100644
--- a/doc/fluid/getstarted/quickstart_cn.rst
+++ b/doc/fluid/getstarted/quickstart_cn.rst
@@ -11,7 +11,7 @@ PaddlePaddle支持使用pip快速安装,目前支持CentOS 6以上, Ubuntu 14.
pip install paddlepaddle
-If you need to install the GPU version (cuda7.5_cudnn5_avx_openblas), run:
+If you need to install the GPU version (cuda8.0_cudnn5_avx_openblas), run:
.. code-block:: bash
@@ -28,18 +28,18 @@ PaddlePaddle支持使用pip快速安装,目前支持CentOS 6以上, Ubuntu 14.
import paddle.dataset.uci_housing as uci_housing
import paddle.fluid as fluid
-
+
with fluid.scope_guard(fluid.core.Scope()):
# initialize executor with cpu
exe = fluid.Executor(place=fluid.CPUPlace())
- # load inference model
+ # load inference model
[inference_program, feed_target_names,fetch_targets] = \
fluid.io.load_inference_model(uci_housing.fluid_model(), exe)
# run inference
- result = exe.run(inference_program,
- feed={feed_target_names[0]: uci_housing.predict_reader()},
+ result = exe.run(inference_program,
+ feed={feed_target_names[0]: uci_housing.predict_reader()},
fetch_list=fetch_targets)
- # print predicted price is $12,273.97
+ # print predicted price is $12,273.97
print 'Predicted price: ${:,.2f}'.format(result[0][0][0] * 1000)
Run :code:`python housing.py` and voila! It should print out a list of predictions for the housing data.
diff --git a/doc/fluid/getstarted/quickstart_en.rst b/doc/fluid/getstarted/quickstart_en.rst
index df6619cfd039fc1fdca8cde57db9cc6aebf8f029..680122f25893a5a48fac103266bda4788f891f6d 100644
--- a/doc/fluid/getstarted/quickstart_en.rst
+++ b/doc/fluid/getstarted/quickstart_en.rst
@@ -12,7 +12,7 @@ Simply run the following command to install, the version is cpu_avx_openblas:
pip install paddlepaddle
-If you need to install GPU version (cuda7.5_cudnn5_avx_openblas), run:
+If you need to install GPU version (cuda8.0_cudnn5_avx_openblas), run:
.. code-block:: bash
@@ -31,18 +31,18 @@ code:
import paddle.dataset.uci_housing as uci_housing
import paddle.fluid as fluid
-
+
with fluid.scope_guard(fluid.core.Scope()):
# initialize executor with cpu
exe = fluid.Executor(place=fluid.CPUPlace())
- # load inference model
+ # load inference model
[inference_program, feed_target_names,fetch_targets] = \
fluid.io.load_inference_model(uci_housing.fluid_model(), exe)
# run inference
- result = exe.run(inference_program,
- feed={feed_target_names[0]: uci_housing.predict_reader()},
+ result = exe.run(inference_program,
+ feed={feed_target_names[0]: uci_housing.predict_reader()},
fetch_list=fetch_targets)
- # print predicted price is $12,273.97
+ # print predicted price is $12,273.97
print 'Predicted price: ${:,.2f}'.format(result[0][0][0] * 1000)
Run :code:`python housing.py` and voila! It should print out a list of predictions
diff --git a/doc/fluid/howto/index_cn.rst b/doc/fluid/howto/index_cn.rst
index b7c620179724ebe97a0a47b75a57b376b21ccf90..b57af64f44da82926c4862578f3072960ca5aa92 100644
--- a/doc/fluid/howto/index_cn.rst
+++ b/doc/fluid/howto/index_cn.rst
@@ -4,5 +4,5 @@
.. toctree::
:maxdepth: 1
+ inference/index_cn.rst
optimization/index_cn.rst
- inference/inference_support_in_fluid.md
diff --git a/doc/fluid/howto/index_en.rst b/doc/fluid/howto/index_en.rst
index f3ca41cdbf1d40ec8afaf045233a38755d8a777a..fd21e167ce3a46da167db1e9d7013804f730e047 100644
--- a/doc/fluid/howto/index_en.rst
+++ b/doc/fluid/howto/index_en.rst
@@ -5,4 +5,3 @@ HOW TO
:maxdepth: 1
optimization/index_en.rst
- inference/inference_support_in_fluid.md
diff --git a/doc/fluid/howto/inference/build_and_install_lib_cn.rst b/doc/fluid/howto/inference/build_and_install_lib_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..c8d9992fcc92c25f8c14f71c79bde9f79fd92b1f
--- /dev/null
+++ b/doc/fluid/howto/inference/build_and_install_lib_cn.rst
@@ -0,0 +1,96 @@
+Install and Build the C++ Inference Library
+===========================================
+
+Direct download and install
+---------------------------
+
+====================== ========================================
+Version                C++ inference library
+====================== ========================================
+cpu_avx_mkl `fluid.tgz `_
+cpu_avx_openblas `fluid.tgz `_
+cpu_noavx_openblas `fluid.tgz `_
+cuda7.5_cudnn5_avx_mkl `fluid.tgz `_
+cuda8.0_cudnn5_avx_mkl `fluid.tgz `_
+cuda8.0_cudnn7_avx_mkl `fluid.tgz `_
+====================== ========================================
+
+Build from source
+-----------------
+You can also build the C++ inference library from the PaddlePaddle core code; just configure the following build options at compile time:
+
+================= ===================
+Option            Value
+================= ===================
+CMAKE_BUILD_TYPE  Release
+FLUID_INSTALL_DIR installation path
+WITH_FLUID_ONLY   ON (recommended)
+WITH_SWIG_PY      OFF (recommended)
+WITH_PYTHON       OFF (recommended)
+WITH_GPU          ON/OFF
+WITH_MKL          ON/OFF
+================= ===================
+
+We recommend the values above to avoid linking unnecessary libraries. Set the other optional build options as needed.
+
+The snippet below pulls the latest code from github and configures the build (replace PADDLE_ROOT with the desired installation path of the PaddlePaddle inference library):
+
+ .. code-block:: bash
+
+ pip install paddlepaddle-gpu
+ PADDLE_ROOT=/path/of/capi
+ git clone https://github.com/PaddlePaddle/Paddle.git
+ cd Paddle
+ mkdir build
+ cd build
+ cmake -DFLUID_INSTALL_DIR=$PADDLE_ROOT \
+ -DCMAKE_BUILD_TYPE=Release \
+ -DWITH_FLUID_ONLY=ON \
+ -DWITH_SWIG_PY=OFF \
+ -DWITH_PYTHON=OFF \
+ -DWITH_MKL=OFF \
+ -DWITH_GPU=OFF \
+ ..
+ make
+ make inference_lib_dist
+
+After a successful build, everything the C++ inference library depends on, including (1) the built PaddlePaddle inference library and headers, (2) the third-party libraries and headers, and (3) the version and build-option information,
+is placed in the PADDLE_ROOT directory. The directory tree looks like this:
+
+ .. code-block:: text
+
+ PaddleRoot/
+ ├── CMakeCache.txt
+ ├── paddle
+ │ └── fluid
+ │ ├── framework
+ │ ├── inference
+ │ ├── memory
+ │ ├── platform
+ │ ├── pybind
+ │ └── string
+ ├── third_party
+ │ ├── boost
+ │ │ └── boost
+ │ ├── eigen3
+ │ │ ├── Eigen
+ │ │ └── unsupported
+ │ └── install
+ │ ├── gflags
+ │ ├── glog
+ │ ├── mklml
+ │ ├── protobuf
+ │ ├── snappy
+ │ ├── snappystream
+ │ └── zlib
+ └── version.txt
+
+version.txt records the version information of the inference library, including the Git commit ID, whether the OpenBlas or MKL math library was used, and the CUDA/CUDNN version numbers, e.g.:
+
+ .. code-block:: text
+
+ GIT COMMIT ID: c95cd4742f02bb009e651a00b07b21c979637dc8
+ WITH_MKL: ON
+ WITH_GPU: ON
+ CUDA version: 8.0
+ CUDNN version: v5
diff --git a/doc/fluid/howto/inference/index_cn.rst b/doc/fluid/howto/inference/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..a903423548decd0992bf19772fb2cb143f6a12b5
--- /dev/null
+++ b/doc/fluid/howto/inference/index_cn.rst
@@ -0,0 +1,8 @@
+Inference library
+------------
+
+.. toctree::
+ :maxdepth: 1
+
+ build_and_install_lib_cn.rst
+ inference_support_in_fluid_cn.md
diff --git a/doc/fluid/howto/inference/inference_support_in_fluid.md b/doc/fluid/howto/inference/inference_support_in_fluid_cn.md
similarity index 90%
rename from doc/fluid/howto/inference/inference_support_in_fluid.md
rename to doc/fluid/howto/inference/inference_support_in_fluid_cn.md
index d272cd3e3bdac49b9ed1a21531de1b0be03d881e..309b17fccd5c461c9c22beb64eb4c6792b7e4a7a 100644
--- a/doc/fluid/howto/inference/inference_support_in_fluid.md
+++ b/doc/fluid/howto/inference/inference_support_in_fluid_cn.md
@@ -1,9 +1,8 @@
-# Fluid Inference User Guide
+# User Guide
## Contents:
- Python Inference API
-- Building the Fluid Inference library
- Inference C++ API
- Inference Examples
- Inference Computation Optimization
@@ -55,62 +54,6 @@
return [program, feed_target_names, fetch_targets]
```
-
-## Building the Fluid Inference library
-
- - **No additional CMake options are required**
- - 1. Configure the CMake command; for more options, see [Build PaddlePaddle from Source](http://www.paddlepaddle.org/docs/develop/documentation/zh/build_and_install/build_from_source_cn.html)
- ```bash
- $ git clone https://github.com/PaddlePaddle/Paddle.git
- $ cd Paddle
- $ mkdir build
- $ cd build
- $ cmake -DCMAKE_INSTALL_PREFIX=your/path/to/paddle_inference_lib \
- -DCMAKE_BUILD_TYPE=Release \
- -DWITH_PYTHON=ON \
- -DWITH_MKL=OFF \
- -DWITH_GPU=OFF \
- ..
- ```
-
- - 2. Build PaddlePaddle
- ```bash
- $ make
- ```
-
- - 3. Deploy. Run the following command to deploy the PaddlePaddle Fluid Inference library to the `your/path/to/paddle_inference_lib` directory.
- ```bash
- $ make inference_lib_dist
- ```
-
-- Directory structure
-
- ```bash
- $ cd your/path/to/paddle_inference_lib
- $ tree
- .
- |-- paddle
- | `-- fluid
- | |-- framework
- | |-- inference
- | | |-- io.h
- | | `-- libpaddle_fluid.so
- | |-- memory
- | |-- platform
- | `-- string
- |-- third_party
- | |-- eigen3
- | `-- install
- | |-- gflags
- | |-- glog
- | `-- protobuf
- `-- ...
- ```
-
- Assume `PADDLE_ROOT=your/path/to/paddle_inference_lib`.
-
-
-
## Linking the Fluid Inference Library
- Example project ([link](https://github.com/luotao1/fluid_inference_example.git))
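+
+- For multi-threaded serving, a hedged sketch using the `PaddlePredictor` API added under `paddle/contrib/inference` in this change (error handling omitted): each worker thread clones the shared predictor, since a cloned predictor shares the model weights and is intended to be thread-safe.
+
+ ```c++
+ #include <vector>
+ #include "paddle/contrib/inference/paddle_inference_api.h"
+
+ void Serve(paddle::PaddlePredictor* shared,
+            const std::vector<paddle::PaddleTensor>& inputs) {
+   // Each thread runs on its own clone; the clones share weights.
+   auto local = shared->Clone();
+   std::vector<paddle::PaddleTensor> outputs;
+   local->Run(inputs, &outputs);
+ }
+ ```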
diff --git a/doc/fluid/images/1.png b/doc/fluid/images/1.png
new file mode 100644
index 0000000000000000000000000000000000000000..67daf566f91aab570e60971c4ea8e2be876e214d
Binary files /dev/null and b/doc/fluid/images/1.png differ
diff --git a/doc/fluid/images/2.png b/doc/fluid/images/2.png
new file mode 100644
index 0000000000000000000000000000000000000000..43367777f41449a666e7a3b571f09ac5d5dfb1ae
Binary files /dev/null and b/doc/fluid/images/2.png differ
diff --git a/doc/fluid/images/3.png b/doc/fluid/images/3.png
new file mode 100644
index 0000000000000000000000000000000000000000..481021ef306e2596818aab7fe17a570754f63635
Binary files /dev/null and b/doc/fluid/images/3.png differ
diff --git a/doc/fluid/images/4.png b/doc/fluid/images/4.png
new file mode 100644
index 0000000000000000000000000000000000000000..4279f41e06de459f18b9a622539511d555e9a0af
Binary files /dev/null and b/doc/fluid/images/4.png differ
diff --git a/doc/fluid/images/LoDTensor.png b/doc/fluid/images/LoDTensor.png
new file mode 100644
index 0000000000000000000000000000000000000000..75369f5378309e0f304b83f6bb69bdb195eac079
Binary files /dev/null and b/doc/fluid/images/LoDTensor.png differ
diff --git a/doc/fluid/images/compile_run_time.png b/doc/fluid/images/compile_run_time.png
new file mode 100644
index 0000000000000000000000000000000000000000..0bc9b2fd0e81b4851e6d96171ccb9a05d0f42a48
Binary files /dev/null and b/doc/fluid/images/compile_run_time.png differ
diff --git a/doc/fluid/images/executor.png b/doc/fluid/images/executor.png
new file mode 100644
index 0000000000000000000000000000000000000000..b29c0d779e3d46b779b5baeabe3176adaeb00a6d
Binary files /dev/null and b/doc/fluid/images/executor.png differ
diff --git a/doc/fluid/images/fluid_examples.png b/doc/fluid/images/fluid_examples.png
new file mode 100644
index 0000000000000000000000000000000000000000..aa99472c0f914cde128fd7b3bd8dc29ac24f94b6
Binary files /dev/null and b/doc/fluid/images/fluid_examples.png differ
diff --git a/doc/fluid/images/fluid_module_1.png b/doc/fluid/images/fluid_module_1.png
new file mode 100644
index 0000000000000000000000000000000000000000..554782ba54e43efc3d6babbb94e3cac3530ac649
Binary files /dev/null and b/doc/fluid/images/fluid_module_1.png differ
diff --git a/doc/fluid/images/fluid_module_2.png b/doc/fluid/images/fluid_module_2.png
new file mode 100644
index 0000000000000000000000000000000000000000..4219efccbb1e87839adf6b5720fe46808b7d2fcf
Binary files /dev/null and b/doc/fluid/images/fluid_module_2.png differ
diff --git a/doc/fluid/images/layer.png b/doc/fluid/images/layer.png
new file mode 100644
index 0000000000000000000000000000000000000000..e46db4c9c6f5b65ff274b498b716b11de343a8b0
Binary files /dev/null and b/doc/fluid/images/layer.png differ
diff --git a/doc/fluid/images/operator1.png b/doc/fluid/images/operator1.png
new file mode 100644
index 0000000000000000000000000000000000000000..3975b06f615b7a88dfc11e71b6451fdf4ce42d60
Binary files /dev/null and b/doc/fluid/images/operator1.png differ
diff --git a/doc/fluid/images/operator2.png b/doc/fluid/images/operator2.png
new file mode 100644
index 0000000000000000000000000000000000000000..b7bb1fae2050d3a70797517bc20dbbdef3dfcb7c
Binary files /dev/null and b/doc/fluid/images/operator2.png differ
diff --git a/doc/fluid/images/place.png b/doc/fluid/images/place.png
new file mode 100644
index 0000000000000000000000000000000000000000..14e77511d639af155e5a3725cde05323e0cc94f2
Binary files /dev/null and b/doc/fluid/images/place.png differ
diff --git a/doc/fluid/images/print_fluid_program.png b/doc/fluid/images/print_fluid_program.png
new file mode 100644
index 0000000000000000000000000000000000000000..e8e459e1b3d5c8706b3caa05dc371db8d46df4a5
Binary files /dev/null and b/doc/fluid/images/print_fluid_program.png differ
diff --git a/doc/fluid/images/program_desc1.png b/doc/fluid/images/program_desc1.png
new file mode 100644
index 0000000000000000000000000000000000000000..0656336914ece957f2e5bb4d70ad337a63e31d88
Binary files /dev/null and b/doc/fluid/images/program_desc1.png differ
diff --git a/doc/fluid/images/program_desc2.png b/doc/fluid/images/program_desc2.png
new file mode 100644
index 0000000000000000000000000000000000000000..db5bfa1231345add8661b4f8ef0fc9d861f40d24
Binary files /dev/null and b/doc/fluid/images/program_desc2.png differ
diff --git a/doc/fluid/images/raw_input.png b/doc/fluid/images/raw_input.png
new file mode 100644
index 0000000000000000000000000000000000000000..0725f92d2b169c2b59ec7c68b402859c2a2dd1d8
Binary files /dev/null and b/doc/fluid/images/raw_input.png differ
diff --git a/doc/fluid/images/scope_variable_tensor.png b/doc/fluid/images/scope_variable_tensor.png
new file mode 100644
index 0000000000000000000000000000000000000000..59b0de6fb36f9f6b469227c05760a7612bb30b4d
Binary files /dev/null and b/doc/fluid/images/scope_variable_tensor.png differ
diff --git a/doc/fluid/images/sorted_input.png b/doc/fluid/images/sorted_input.png
new file mode 100644
index 0000000000000000000000000000000000000000..ff601128368ee179e3fd33e5e295a9ddd3dcbaeb
Binary files /dev/null and b/doc/fluid/images/sorted_input.png differ
diff --git a/doc/fluid/images/transpiler.png b/doc/fluid/images/transpiler.png
new file mode 100644
index 0000000000000000000000000000000000000000..422973c0dc7aa2b544d2fc86a97ace706388cb9e
Binary files /dev/null and b/doc/fluid/images/transpiler.png differ
diff --git a/doc/fluid/images/user_interface.png b/doc/fluid/images/user_interface.png
new file mode 100644
index 0000000000000000000000000000000000000000..ffc94e3d8945ec6291460afd90e8fcc600828390
Binary files /dev/null and b/doc/fluid/images/user_interface.png differ
diff --git a/doc/v2/build_and_install/build_from_source_cn.rst b/doc/v2/build_and_install/build_from_source_cn.rst
index 077f5e9b189269f9f6c9cf68310e2bfd43d8cb67..741c01ce5428c0046daa5a784da70d4bb492438c 100644
--- a/doc/v2/build_and_install/build_from_source_cn.rst
+++ b/doc/v2/build_and_install/build_from_source_cn.rst
@@ -35,13 +35,11 @@ PaddlePaddle需要使用Docker环境完成编译,这样可以免去单独安
# 2. Optional: build the Docker image for compiling PaddlePaddle from source
docker build -t paddle:dev .
# 3. Run the following command to build CPU-only binaries
- docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/paddle_build.sh build
+ docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh build
# 4. Or use the image built in the optional step above (step 2 must be run first)
- docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddle:dev
+ docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddle:dev ./paddle/scripts/paddle_build.sh build
-Note: the commands above map the current directory (the root of the source tree) to the :code:`/paddle` directory in the container. If you use
-the image you built yourself (step 4 above), it runs the default entry point :code:`build.sh` described in the :code:`Dockerfile`, so the
-trailing script command in step 3 can be omitted.
+Note: the commands above map the current directory (the root of the source tree) to the :code:`/paddle` directory in the container.
After the build finishes, the output whl package is generated under build/python/dist; you can install it on the current machine or copy it to the target machine:
@@ -72,15 +70,15 @@ PaddlePaddle需要使用Docker环境完成编译,这样可以免去单独安
.. code-block:: bash
- docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/docker/build.sh
+ docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh test
To run a single unit test (for example :code:`test_sum_op`):
.. code-block:: bash
- docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 /bin/bash
- bash /paddle/paddle/scripts/docker/build.sh
- cd /paddle/build
+ docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 /bin/bash
+ ./paddle/scripts/paddle_build.sh build
+ cd build
ctest -R test_sum_op -V
.. _faq_docker:
diff --git a/doc/v2/build_and_install/build_from_source_en.rst b/doc/v2/build_and_install/build_from_source_en.rst
index 545e61ce9602240807d515e9eae971dfca9ddd7f..b06c43e19dcfc52ad0f074a85517a16744895a3a 100644
--- a/doc/v2/build_and_install/build_from_source_en.rst
+++ b/doc/v2/build_and_install/build_from_source_en.rst
@@ -34,14 +34,12 @@ Or you can build your own image from source as the optional step below:
# 2. Optional: build development docker image from source
docker build -t paddle:dev .
# 3. Run the following command to build CPU-only binaries
- docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/paddle_build.sh build
+ docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh build
# 4. Or, use your built Docker image to build PaddlePaddle (must run step 2)
- docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddle:dev
+ docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddle:dev ./paddle/scripts/paddle_build.sh build
NOTE: The above command tries to mount the current working directory (root directory of source code)
-into :code:`/paddle` directory inside docker container. If you are using your own image
-(Step 4) it will run default entry-point :code:`build.sh` , so you could omit the last
-command in step 3.
+into the :code:`/paddle` directory inside the docker container.
When the compile finishes, you can get the output whl package under
build/python/dist, then you can choose to install the whl on local
@@ -74,15 +72,15 @@ Set :code:`WITH_GPU=ON` Can also run tests on GPU.
.. code-block:: bash
- docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x paddle/paddle/scripts/docker/build.sh
+ docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh test
If you wish to run only one unit test, like :code:`test_sum_op`:
.. code-block:: bash
- docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 /bin/bash
- bash /paddle/paddle/scripts/docker/build.sh
- cd /paddle/build
+ docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 /bin/bash
+ ./paddle/scripts/paddle_build.sh build
+ cd build
ctest -R test_sum_op -V
.. _faq_docker:
diff --git a/doc/v2/build_and_install/docker_install_cn.rst b/doc/v2/build_and_install/docker_install_cn.rst
index da876b03e384a8175b27f78756af648c80fc6784..106c86bace075764c84bc2a7f7cb09d466fa8794 100644
--- a/doc/v2/build_and_install/docker_install_cn.rst
+++ b/doc/v2/build_and_install/docker_install_cn.rst
@@ -98,7 +98,7 @@ PaddlePaddle Book是为用户和开发者制作的一个交互式的Jupyter Note
Users in mainland China can use the following mirror to speed up access:
- .. code-block: bash
+ .. code-block:: bash
docker run -p 8888:8888 docker.paddlepaddlehub.com/book
diff --git a/doc/v2/build_and_install/docker_install_en.rst b/doc/v2/build_and_install/docker_install_en.rst
index 5dbdedc4cb064ef415e8d19f00727a16d1c175c6..25aecb8d0da9feb00006da6259b529b7011d91cb 100644
--- a/doc/v2/build_and_install/docker_install_en.rst
+++ b/doc/v2/build_and_install/docker_install_en.rst
@@ -105,7 +105,7 @@ We provide a packaged book image, simply issue the command:
For users in China, we provide a faster mirror:
- .. code-block: bash
+ .. code-block:: bash
docker run -p 8888:8888 docker.paddlepaddlehub.com/book
diff --git a/doc/v2/dev/contribute_to_paddle_cn.md b/doc/v2/dev/contribute_to_paddle_cn.md
index d8bf093e09b53b302225739fa67146adc7976e4b..add06e42f1bbd221b48eb83e4e84d4a7c89e7483 100644
--- a/doc/v2/dev/contribute_to_paddle_cn.md
+++ b/doc/v2/dev/contribute_to_paddle_cn.md
@@ -51,6 +51,8 @@ Paddle 开发人员使用 [pre-commit](http://pre-commit.com/) 工具来管理 G
Paddle uses `clang-format` to format C/C++ source code; please make sure your `clang-format` version is 3.8 or higher.
+Note: the `yapf` installed via `pip install pre-commit` differs slightly from the one installed via `conda install -c conda-forge pre-commit`; Paddle developers use `pip install pre-commit`.
+
## Start Developing
In this example, I delete a line from README.md and create a new file.
diff --git a/paddle/.gitignore b/paddle/.gitignore
index 1c1c0c2c829f088d7e3f52ca007fcb8f33a16a36..01904aa6ef2057afee95ddd6e30cde064b06c52e 100644
--- a/paddle/.gitignore
+++ b/paddle/.gitignore
@@ -11,7 +11,6 @@ GTAGS
*.pb.cc
*.pb.h
*_pb2.py
-paddle_*
output/
google/
Makefile
diff --git a/paddle/api/GradientMachine.cpp b/paddle/api/GradientMachine.cpp
index a3d6f0f080abcf1f45d9bc5fbdb39bb6b6ca1553..0d9ad30de9c1f3f8f58c856a748abdc050ff8740 100644
--- a/paddle/api/GradientMachine.cpp
+++ b/paddle/api/GradientMachine.cpp
@@ -94,7 +94,7 @@ void UpdateCallback::apply(Parameter* p) {
}
class UpdateCallbackWrapper {
-public:
+ public:
explicit UpdateCallbackWrapper(const UpdateCallback& callback)
: callback(const_cast<UpdateCallback&>(callback)) {}
@@ -105,7 +105,7 @@ public:
delete p;
}
-private:
+ private:
UpdateCallback& callback;
};
diff --git a/paddle/api/PaddleAPI.h b/paddle/api/PaddleAPI.h
index 67368d1a99d980b248789d24a2ea4f466255687a..7866122006a996cbe5201c661cab9c81aa82a219 100644
--- a/paddle/api/PaddleAPI.h
+++ b/paddle/api/PaddleAPI.h
@@ -59,9 +59,10 @@ class RangeError {};
/// Not support Error, such as access GPU memory directly, etc.
class UnsupportError : public std::runtime_error {
-public:
- UnsupportError() : std::runtime_error(" "){};
- UnsupportError(const std::string& message) : std::runtime_error(message){};
+ public:
+ UnsupportError() : std::runtime_error(" ") {}
+ explicit UnsupportError(const std::string& message)
+ : std::runtime_error(message) {}
};
/// This type will map to python's list of float.
@@ -105,7 +106,7 @@ class Matrix {
DISABLE_COPY(Matrix);
static Matrix* createByPaddleMatrixPtr(void* sharedPtr);
-public:
+ public:
virtual ~Matrix();
/**
@@ -231,7 +232,7 @@ public:
bool isGpu() const;
-private:
+ private:
void* getSharedPtr() const;
MatrixPrivate* m;
@@ -248,7 +249,7 @@ class Vector {
void* getSharedPtr();
-public:
+ public:
~Vector();
/// Create Vector filled with zero.
@@ -310,10 +311,10 @@ public:
/// __len__ in python
size_t getSize() const;
-private:
+ private:
VectorPrivate* m;
-private:
+ private:
friend class Parameter;
friend class ParameterOptimizer;
friend struct ParameterTraverseCallbackPrivate;
@@ -325,7 +326,7 @@ class IVector {
DISABLE_COPY(IVector);
static IVector* createByPaddleVectorPtr(void* ptr);
-public:
+ public:
/// Create IVector filled with zero
static IVector* createZero(size_t sz, bool useGpu = isUsingGpu());
@@ -389,7 +390,7 @@ public:
/// This method will map to python __len__();
size_t getSize() const;
-private:
+ private:
void* getSharedPtr() const;
friend class Arguments;
@@ -400,11 +401,11 @@ struct ArgumentsPrivate;
/// The Arguments is actual a std::vector in paddle.
class Arguments {
-private:
+ private:
Arguments(); // Internal Create.
DISABLE_COPY(Arguments);
-public:
+ public:
/**
* Create a arguments with size.
* Note that it can be zero.
@@ -475,12 +476,12 @@ public:
float sum() const;
-private:
+ private:
static Arguments* createByPaddleArgumentVector(void* ptr);
static Arguments* createByPaddleArgument(const void* ptr);
void* getInternalArgumentsPtr() const;
-private:
+ private:
ArgumentsPrivate* m;
friend class Trainer;
friend class GradientMachine;
@@ -507,7 +508,7 @@ class ParameterConfig {
static ParameterConfig* createParameterConfigFromParameterPtr(void* ptr);
void* getRawPtr();
-public:
+ public:
~ParameterConfig();
/**
@@ -515,10 +516,10 @@ public:
*/
std::string toProtoString() const;
-private:
+ private:
ParameterConfigPrivate* m;
-private:
+ private:
friend class Parameter;
friend class ParameterOptimizer;
friend struct ParameterTraverseCallbackPrivate;
@@ -529,7 +530,7 @@ class OptimizationConfig {
DISABLE_COPY(OptimizationConfig);
OptimizationConfig();
-public:
+ public:
static OptimizationConfig* createFromProtoString(const std::string& str);
~OptimizationConfig();
@@ -538,7 +539,7 @@ public:
*/
std::string toProtoString();
-private:
+ private:
OptimizationConfigPrivate* m;
friend class TrainerConfig;
@@ -549,11 +550,11 @@ private:
struct ParameterPrivate;
class Parameter {
-private:
+ private:
Parameter();
DISABLE_COPY(Parameter);
-public:
+ public:
virtual ~Parameter();
/**
@@ -580,11 +581,11 @@ public:
size_t getSize() const;
-private:
+ private:
static Parameter* createFromRawPtr(void* ptr);
static Parameter* createFromSharedPtr(void* ptr);
-private:
+ private:
ParameterPrivate* m;
friend class UpdateCallbackWrapper;
friend class GradientMachine;
@@ -598,14 +599,14 @@ struct ModelConfigPrivate;
* It is used by GradientMachine.
*/
class ModelConfig {
-private:
+ private:
ModelConfig();
DISABLE_COPY(ModelConfig);
-public:
+ public:
virtual ~ModelConfig();
-private:
+ private:
ModelConfigPrivate* m;
friend class TrainerConfig;
friend struct TrainerConfigPrivate;
@@ -619,11 +620,11 @@ struct TrainerConfigPrivate;
* It is used by GradientMachine.
*/
class TrainerConfig {
-private:
+ private:
TrainerConfig();
DISABLE_COPY(TrainerConfig);
-public:
+ public:
virtual ~TrainerConfig();
static TrainerConfig* createFromTrainerConfigFile(
@@ -634,7 +635,7 @@ public:
OptimizationConfig* getOptimizationConfig() const;
-private:
+ private:
TrainerConfigPrivate* m;
friend class Trainer;
};
@@ -654,7 +655,7 @@ private:
* @endcode
*/
class UpdateCallback {
-public:
+ public:
virtual ~UpdateCallback();
virtual void apply(Parameter* p);
};
@@ -664,14 +665,14 @@ class ParameterTraverseCallback {
DISABLE_COPY(ParameterTraverseCallback);
ParameterTraverseCallback();
-public:
+ public:
~ParameterTraverseCallback();
void apply(const std::vector<Vector*>& vecs,
const ParameterConfig& config,
size_t sparseId);
-private:
+ private:
ParameterTraverseCallbackPrivate* m;
friend class ParameterOptimizer;
};
@@ -686,7 +687,7 @@ class ParameterOptimizer {
DISABLE_COPY(ParameterOptimizer);
ParameterOptimizer();
-public:
+ public:
static ParameterOptimizer* create(OptimizationConfig* config);
~ParameterOptimizer();
@@ -710,7 +711,7 @@ public:
ParameterTraverseCallback* needSpecialTraversal(
const ParameterConfig& config) const;
-private:
+ private:
ParameterOptimizerPrivate* m;
};
@@ -718,11 +719,11 @@ class SequenceGenerator;
class Evaluator;
struct GradientMachinePrivate;
class GradientMachine {
-private:
+ private:
GradientMachine();
DISABLE_COPY(GradientMachine);
-public:
+ public:
virtual ~GradientMachine();
/**
@@ -817,7 +818,7 @@ public:
void eval(Evaluator* evaluator);
-private:
+ private:
GradientMachinePrivate* m;
static GradientMachine* createFromPaddleModelPtr(
@@ -833,10 +834,10 @@ private:
struct ParameterUpdaterPrivate;
class ParameterUpdater {
-private:
+ private:
ParameterUpdater();
-public:
+ public:
static ParameterUpdater* createLocalUpdater(OptimizationConfig* config);
static ParameterUpdater* createRemoteUpdater(OptimizationConfig* config,
int passCount,
@@ -911,17 +912,17 @@ public:
*/
void catchUpWith();
-private:
+ private:
ParameterUpdaterPrivate* m;
};
struct EvaluatorPrivate;
class Evaluator {
-private:
+ private:
Evaluator();
DISABLE_COPY(Evaluator);
-public:
+ public:
~Evaluator();
/**
@@ -945,7 +946,7 @@ public:
double getValue(const std::string name) const;
-private:
+ private:
EvaluatorPrivate* m;
friend class GradientMachine;
@@ -953,13 +954,13 @@ private:
struct TrainerPrivate;
class Trainer {
-private:
+ private:
TrainerPrivate* m;
Trainer();
Trainer(TrainerConfig* optConfig, GradientMachine* gm);
DISABLE_COPY(Trainer);
-public:
+ public:
virtual ~Trainer();
/// Create A Trainer By TrainerConfig. using paddle command line.
@@ -1002,7 +1003,7 @@ public:
/// the N-Best results generated from one input sequence.
class ISequenceResults {
-public:
+ public:
virtual ~ISequenceResults();
/// Number of result.
@@ -1026,7 +1027,7 @@ class SequenceGenerator {
DISABLE_COPY(SequenceGenerator);
SequenceGenerator();
-public:
+ public:
virtual ~SequenceGenerator();
/**
@@ -1044,10 +1045,10 @@ public:
void setMaxLength(size_t maxlength);
void setBeamSize(size_t beamSize);
-private:
+ private:
static SequenceGenerator* createByGradientMachineSharedPtr(void* ptr);
friend class GradientMachine;
-private:
+ private:
SequenceGeneratorPrivate* m;
};
diff --git a/paddle/api/SequenceGenerator.cpp b/paddle/api/SequenceGenerator.cpp
index 1b30aec8f6b6b73764886a7c7274be67851e4815..1446c3084238859a759669f3a32c7efde67dcc2b 100644
--- a/paddle/api/SequenceGenerator.cpp
+++ b/paddle/api/SequenceGenerator.cpp
@@ -138,7 +138,7 @@ struct SequenceGeneratorPrivate {
maxLength(0UL),
feedback(__create_feedback__()) {}
-private:
+ private:
static paddle::Argument __create_feedback__() {
paddle::Argument feedback;
feedback.ids = paddle::IVector::create(/* size= */ 1, FLAGS_use_gpu);
@@ -157,7 +157,7 @@ SequenceGenerator::~SequenceGenerator() { delete m; }
class PathSequenceResults : public ISequenceResults {
// ISequenceResults interface
-public:
+ public:
PathSequenceResults(const std::shared_ptr<std::vector<paddle::Path>>& path,
const std::shared_ptr<std::vector<std::string>>& dict)
: path_(path), dict_(dict) {}
@@ -196,7 +196,7 @@ public:
}
}
-private:
+ private:
std::shared_ptr<std::vector<paddle::Path>> path_;
std::shared_ptr<std::vector<std::string>> dict_;
};
diff --git a/paddle/capi/gradient_machine.cpp b/paddle/capi/gradient_machine.cpp
index ea9aab00e3d05f1e2ef0c91eab93b67e0a3d5f37..8c3f504e5a2d807c0cc664af486ebab4a82ddec3 100644
--- a/paddle/capi/gradient_machine.cpp
+++ b/paddle/capi/gradient_machine.cpp
@@ -26,7 +26,7 @@ enum GradientMatchineCreateMode {
namespace paddle {
class MyNeuralNetwork : public NeuralNetwork {
-public:
+ public:
MyNeuralNetwork(const std::string& name, NeuralNetwork* network)
: NeuralNetwork(name, network) {}
};
diff --git a/paddle/contrib/CMakeLists.txt b/paddle/contrib/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..4b19256ef4533a09162edf907f6cd51146517e46
--- /dev/null
+++ b/paddle/contrib/CMakeLists.txt
@@ -0,0 +1,16 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+add_subdirectory(inference)
diff --git a/paddle/contrib/float16/README.md b/paddle/contrib/float16/README.md
index ded959c47cb81b9384abbb9815773e25969344ec..58b4a50666bfb622af8acbce29355f2a4a870a82 100644
--- a/paddle/contrib/float16/README.md
+++ b/paddle/contrib/float16/README.md
@@ -89,7 +89,7 @@ cd Paddle
# to `FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04` and similarly for other configurations
nvidia-docker build -t paddle:float16 .
# After running this, different results will be written to different log files in Paddle/contrib/float16/
-nvidia-docker run -it -v $PWD:/paddle paddle:float16 /paddle/contrib/float16/run_float16_demo.sh
+nvidia-docker run -it -v $PWD:/paddle paddle:float16 /paddle/paddle/contrib/float16/run_float16_demo.sh
```
#### Accuracy
diff --git a/paddle/contrib/float16/run_float16_demo.sh b/paddle/contrib/float16/run_float16_demo.sh
index d8a34ee67b8fab214fa6e96104304689211f84da..031225a85dabb26e5d9ea06f58909c049e7f0c08 100755
--- a/paddle/contrib/float16/run_float16_demo.sh
+++ b/paddle/contrib/float16/run_float16_demo.sh
@@ -3,7 +3,7 @@
BUILD_PATH=/paddle/fp16_build
WHEEL_PATH=$BUILD_PATH/python/dist
INFER_PATH=$BUILD_PATH/paddle/fluid/inference/tests/book
-DEMO_PATH=/paddle/contrib/float16
+DEMO_PATH=/paddle/paddle/contrib/float16
# Use the single most powerful CUDA GPU on your machine
export CUDA_VISIBLE_DEVICES=0
@@ -50,7 +50,6 @@ do
--repeat=1 \
$INFER_PATH/test_inference_image_classification_vgg \
- --data_set=imagenet \
--dirname=$DEMO_PATH/image_classification_imagenet_vgg.inference.model \
--fp16_dirname=$DEMO_PATH/float16_image_classification_imagenet_vgg.inference.model \
--repeat=$REPEAT \
@@ -68,7 +67,6 @@ do
--repeat=1 \
$INFER_PATH/test_inference_image_classification_resnet \
- --data_set=imagenet \
--dirname=$DEMO_PATH/image_classification_imagenet_resnet.inference.model \
--fp16_dirname=$DEMO_PATH/float16_image_classification_imagenet_resnet.inference.model \
--repeat=$REPEAT \
@@ -86,7 +84,6 @@ do
--repeat=1 \
$INFER_PATH/test_inference_image_classification_vgg \
- --data_set=cifar10 \
--dirname=$DEMO_PATH/image_classification_cifar10_vgg.inference.model \
--fp16_dirname=$DEMO_PATH/float16_image_classification_cifar10_vgg.inference.model \
--repeat=$REPEAT \
@@ -104,7 +101,6 @@ do
--repeat=1 \
$INFER_PATH/test_inference_image_classification_vgg \
- --data_set=cifar10 \
--dirname=$DEMO_PATH/image_classification_cifar10_resnet.inference.model \
--fp16_dirname=$DEMO_PATH/float16_image_classification_cifar10_resnet.inference.model \
--repeat=$REPEAT \
diff --git a/paddle/contrib/inference/CMakeLists.txt b/paddle/contrib/inference/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..6847f7db7fc0f6b41ced1260d409ca6eba9b53eb
--- /dev/null
+++ b/paddle/contrib/inference/CMakeLists.txt
@@ -0,0 +1,49 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+if(APPLE)
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=pessimizing-move")
+endif(APPLE)
+
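+# inference_api_test(<name> [ARGS <test>...]) registers a cc_test named
+# test_paddle_inference_<name>, built from test_paddle_inference_<name>.cc and
+# pointed at the Python book tests directory via --dirname so it can load the
+# models those tests save; names passed via ARGS become test-level DEPENDS.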
+function(inference_api_test TARGET_NAME)
+ set(options "")
+ set(oneValueArgs "")
+ set(multiValueArgs ARGS)
+ cmake_parse_arguments(inference_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+
+ set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests)
+ cc_test(test_paddle_inference_${TARGET_NAME}
+ SRCS test_paddle_inference_${TARGET_NAME}.cc
+ DEPS paddle_fluid_api paddle_inference_api
+ ARGS --dirname=${PYTHON_TESTS_DIR}/book/)
+ if(inference_test_ARGS)
+ set_tests_properties(test_paddle_inference_${TARGET_NAME}
+ PROPERTIES DEPENDS "${inference_test_ARGS}")
+ endif()
+endfunction(inference_api_test)
+
+
+cc_library(paddle_inference_api
+ SRCS paddle_inference_api.cc paddle_inference_api_impl.cc
+ DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
+
+if(WITH_TESTING)
+ cc_test(test_paddle_inference_api
+ SRCS test_paddle_inference_api.cc
+ DEPS paddle_inference_api)
+
+ inference_api_test(api_impl
+ ARGS test_word2vec test_image_classification)
+endif()
diff --git a/paddle/contrib/inference/paddle_inference_api.cc b/paddle/contrib/inference/paddle_inference_api.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d67e1e7667800d6dd00cb8915b0d6dc7c664970b
--- /dev/null
+++ b/paddle/contrib/inference/paddle_inference_api.cc
@@ -0,0 +1,15 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/contrib/inference/paddle_inference_api.h"
diff --git a/paddle/contrib/inference/paddle_inference_api.h b/paddle/contrib/inference/paddle_inference_api.h
index dbaa7c95b97e954537707566e5b7458e6afd14c8..5fe8399762bba69bc99ed9ae694db32f532ed953 100644
--- a/paddle/contrib/inference/paddle_inference_api.h
+++ b/paddle/contrib/inference/paddle_inference_api.h
@@ -12,58 +12,98 @@
See the License for the specific language governing permissions and
limitations under the License. */
+/*
+ * This file contains the definition of a simple Inference API for Paddle.
+ *
+ * ATTENTION: It requires some C++ features, for lower version C++ or C, we
+ * might release another API.
+ */
+
#pragma once
+#include <memory>
#include <string>
#include <vector>
namespace paddle {
-class Predictor {
-public:
- struct Attr;
- Predictor() = default;
+enum PaddleDType {
+ FLOAT32,
+ INT64,
+};
+
+struct PaddleBuf {
+ void* data; // pointer to the data memory.
+ size_t length; // number of memory bytes.
+};
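+// Note: PaddleBuf does not own `data`; per the Run() contract below, the
+// caller allocates this memory and keeps it alive while it is in use.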
+
+struct PaddleTensor {
+ std::string name; // variable name.
+ std::vector<int> shape;
+ // TODO(Superjomn) for LoD support, add a vector<vector<size_t>> field if needed.
+ PaddleBuf data; // blob of data.
+ PaddleDType dtype;
+};
+
+enum class PaddleEngineKind {
+ kNative = 0, // Use the native Fluid facility.
+ // TODO(Superjomn) support the following engines later.
+ // kAnakin, // Use Anakin for inference.
+ // kTensorRT, // Use TensorRT for inference.
+ // kAutoMixedAnakin, // Automatically mix Fluid with Anakin.
+ // kAutoMixedTensorRT, // Automatically mix Fluid with TensorRT.
+};
- // Build the network before inference.
- bool Init(const Attr& attr);
+/*
+ * A simple Inference API for Paddle. Currently this API can be used in
+ * non-sequence scenarios.
+ */
+class PaddlePredictor {
+ public:
+ struct Config;
+ PaddlePredictor() = default;
+ PaddlePredictor(const PaddlePredictor&) = delete;
// Predict a record.
- // Arguments:
- // inputs: the name of the input variables.
- // outputs: the name of the output varaibles.
- // input_shapes: the shape of the input variables.
- // output_shapes: the shape of the output variables.
- // input_data: the data of the input variables.
- // output_data: the data of the output variables.
- bool Run(const std::vector<std::string>& inputs,
- const std::vector<std::string>& outputs,
- const std::vector<std::vector<int>>& input_shapes,
- const std::vector<std::vector<int>>& output_shapes,
- const std::vector<std::vector<float>>& input_data,
- std::vector<std::vector<float>>* output_data);
-
- // Clone a predictor that share the model weights.
- Predictor* Clone();
+ // The caller is responsible for allocating and releasing the memory of
+ // `inputs`, and `inputs` must stay alive until Run returns. The caller is
+ // also responsible for releasing the memory of `output_data`.
+ virtual bool Run(const std::vector<PaddleTensor>& inputs,
+                  std::vector<PaddleTensor>* output_data) = 0;
- // Destroy the Predictor.
- ~Predictor();
+ // Clone a predictor that shares the model weights; the cloned predictor
+ // should be thread-safe.
+ virtual std::unique_ptr<PaddlePredictor> Clone() = 0;
- struct Attr {
- enum class EngineKind;
+ // Destroy the Predictor.
+ virtual ~PaddlePredictor() {}
+ // The common configs for all the predictors.
+ struct Config {
std::string model_dir; // path to the model directory.
bool enable_engine{false}; // Enable to execute (part of) the model on
- // third-party engines.
- EngineKind engine_kind{Attr::EngineKind::kNone};
-
- enum class EngineKind {
- kNone = -1, // Use the native Fluid facility.
- kAnakin, // Use Anakin for inference.
- kTensorRT, // Use TensorRT for inference.
- kAutoMixedAnakin, // Automatically mix Fluid with Anakin.
- kAutoMixedTensorRT, // Automatically mix Fluid with TensorRT.
- };
};
};
+struct NativeConfig : public PaddlePredictor::Config {
+ // GPU related fields.
+ bool use_gpu{false};
+ int device{0};
+ float fraction_of_gpu_memory{-1.f}; // Negative to notify initialization.
+
+ std::string prog_file;
+ std::string param_file;
+};
+
+// A factory to help create different predictors.
+//
+// FOR EXTENSION DEVELOPER:
+// Different predictors are designated by config type and engine kind. Similar
+// configs can be merged, but there shouldn't be a huge config containing
+// different fields for more than one kind of predictor.
+//
+// Similarly, each engine kind should map to a unique predictor implementation.
+template <typename ConfigT>
+std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT& config);
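+//
+// A typical call sequence, as a sketch only (the model path is illustrative):
+//
+//   NativeConfig config;
+//   config.model_dir = "/path/to/model";
+//   auto predictor = CreatePaddlePredictor<NativeConfig>(config);
+//   std::vector<PaddleTensor> outputs;
+//   predictor->Run({input_tensor}, &outputs);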
+
} // namespace paddle
diff --git a/paddle/contrib/inference/paddle_inference_api_impl.cc b/paddle/contrib/inference/paddle_inference_api_impl.cc
new file mode 100644
index 0000000000000000000000000000000000000000..99a64662d4d04e3cf9dfdafe5b5ab9e5dac0af8a
--- /dev/null
+++ b/paddle/contrib/inference/paddle_inference_api_impl.cc
@@ -0,0 +1,273 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+
+#include
+#include
+#include